All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup
@ 2017-05-02  7:14 Jianbo Liu
  2017-05-02  7:14 ` [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h Jianbo Liu
                   ` (7 more replies)
  0 siblings, 8 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-02  7:14 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob; +Cc: Jianbo Liu

Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c         |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h     | 302 ++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +----------------------------------
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 0000000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv4_5tuple_host key[8];
+
+	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+				&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv6_5tuple_host key[8];
+
+	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+			&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) uint16_t
+em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	uint8_t next_hop;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	uint32_t tcp_or_udp;
+	uint32_t l3_ptypes;
+
+	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
+
+	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
+
+		/* Handle IPv4 headers.*/
+		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
+				qconf->ipv4_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
+
+		/* Handle IPv6 headers.*/
+		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
+				qconf->ipv6_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	}
+
+	return portid;
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+		uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+
+	/*
+	 * Send nb_rx - nb_rx%8 packets
+	 * in groups of 8.
+	 */
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+
+	for (j = 0; j < n; j += 8) {
+
+		uint32_t pkt_type =
+			pkts_burst[j]->packet_type &
+			pkts_burst[j+1]->packet_type &
+			pkts_burst[j+2]->packet_type &
+			pkts_burst[j+3]->packet_type &
+			pkts_burst[j+4]->packet_type &
+			pkts_burst[j+5]->packet_type &
+			pkts_burst[j+6]->packet_type &
+			pkts_burst[j+7]->packet_type;
+
+		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		uint32_t tcp_or_udp = pkt_type &
+			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+
+		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
+
+			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
+
+			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else {
+			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
+							portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
+							portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
+							portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
+							portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
+							portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
+							portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
+							portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
+							portid);
+		}
+	}
+
+	for (; j < nb_rx; j++)
+		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+
+}
+#endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 7714a20..cb1304f 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,104 +34,16 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-#include "l3fwd_sse.h"
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, __m128i mask0,
+		union ipv4_5tuple_host *key)
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-	__m128i data[8];
-
-	data[0] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[0], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[1] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[1], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[2] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[2], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[3] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[3], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[4] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[4], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[5] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[5], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[6] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[6], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[7] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[7], __m128i *,
+	 __m128i tmpdata0 = _mm_loadu_si128(
+			rte_pktmbuf_mtod_offset(m0, __m128i *,
 				sizeof(struct ether_hdr) +
 				offsetof(struct ipv4_hdr, time_to_live)));
 
-	key[0].xmm = _mm_and_si128(data[0], mask0.x);
-	key[1].xmm = _mm_and_si128(data[1], mask0.x);
-	key[2].xmm = _mm_and_si128(data[2], mask0.x);
-	key[3].xmm = _mm_and_si128(data[3], mask0.x);
-	key[4].xmm = _mm_and_si128(data[4], mask0.x);
-	key[5].xmm = _mm_and_si128(data[5], mask0.x);
-	key[6].xmm = _mm_and_si128(data[6], mask0.x);
-	key[7].xmm = _mm_and_si128(data[7], mask0.x);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
+	key->xmm = _mm_and_si128(tmpdata0, mask0);
 }
 
 static inline void
@@ -159,184 +71,4 @@ static inline __attribute__((always_inline)) void
 	key->xmm[1] = tmpdata1;
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
-{
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
-}
-
-static inline __attribute__((always_inline)) uint16_t
-em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint8_t next_hop;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ipv6_hdr *ipv6_hdr;
-	uint32_t tcp_or_udp;
-	uint32_t l3_ptypes;
-
-	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
-
-	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
-
-		/* Handle IPv4 headers.*/
-		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
-				qconf->ipv4_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
-
-		/* Handle IPv6 headers.*/
-		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
-				qconf->ipv6_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	}
-
-	return portid;
-}
-
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
-static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint8_t portid, struct lcore_conf *qconf)
-{
-	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
-
-	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
-	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
-
-	for (j = 0; j < n; j += 8) {
-
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
-
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
-		}
-	}
-
-	for (; j < nb_rx; j++)
-		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
-
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
-
-}
 #endif /* __L3FWD_EM_SSE_HLM_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
@ 2017-05-02  7:14 ` Jianbo Liu
  2017-05-02  9:40   ` Sekhar, Ashwin
  2017-05-02  7:14 ` [PATCH 3/5] examples/l3fwd: extract common code from multi packet send Jianbo Liu
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-02  7:14 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob; +Cc: Jianbo Liu

The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Renaming it because it's only for single hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c                            | 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_single.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_single.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..cccf797 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_single.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_single.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_single.h
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 3/5] examples/l3fwd: extract common code from multi packet send
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
  2017-05-02  7:14 ` [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h Jianbo Liu
@ 2017-05-02  7:14 ` Jianbo Liu
  2017-05-02  7:14 ` [PATCH 4/5] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-02  7:14 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob; +Cc: Jianbo Liu

Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_common.h | 293 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h    | 255 +-----------------------------------
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 0000000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+	uint8_t ihl;
+
+	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+			dp[0] = BAD_PORT;
+
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {             \
+		(lp)[0]++;                                   \
+	} else {                                         \
+		(dlp) = (dcp)[idx];                          \
+		(lp) = (pn) + (idx);                         \
+		(lp)[0] = 1;                                 \
+	}                                                \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated element. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+static inline __attribute__((always_inline)) void
+send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
+		uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straight away.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 3:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 2:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 1:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	qconf->tx_mbufs[port].len = len;
+}
+
+#endif /* _L3FWD_COMMON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 1afa1f0..d99842b 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -32,53 +32,11 @@
  */
 
 
-#ifndef _L3FWD_COMMON_H_
-#define _L3FWD_COMMON_H_
+#ifndef _L3FWD_SSE_H_
+#define _L3FWD_SSE_H_
 
 #include "l3fwd.h"
-
-#ifdef DO_RFC_1812_CHECKS
-
-#define	IPV4_MIN_VER_IHL	0x45
-#define	IPV4_MAX_VER_IHL	0x4f
-#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
-
-/* Minimum value of IPV4 total length (20B) in network byte order. */
-#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
-
-/*
- * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
- * - The IP version number must be 4.
- * - The IP header length field must be large enough to hold the
- *    minimum length legal IP datagram (20 bytes = 5 words).
- * - The IP total length field must be large enough to hold the IP
- *   datagram header, whose length is specified in the IP header length
- *   field.
- * If we encounter invalid IPV4 packet, then set destination port for it
- * to BAD_PORT value.
- */
-static inline __attribute__((always_inline)) void
-rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
-{
-	uint8_t ihl;
-
-	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
-		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
-
-		ipv4_hdr->time_to_live--;
-		ipv4_hdr->hdr_checksum++;
-
-		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
-				((uint8_t)ipv4_hdr->total_length == 0 &&
-				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
-			dp[0] = BAD_PORT;
-
-	}
-}
-
-#else
-#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
-#endif /* DO_RFC_1812_CHECKS */
+#include "l3fwd_common.h"
 
 /*
  * Update source and destination MAC addresses in the ethernet header.
@@ -130,30 +88,6 @@ static inline __attribute__((always_inline)) void
 }
 
 /*
- * We group consecutive packets with the same destionation port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-/*
  * Group consecutive packets with the same destination port in bursts of 4.
  * Suppose we have array of destionation ports:
  * dst_port[] = {a, b, c, d,, e, ... }
@@ -164,109 +98,6 @@ static inline __attribute__((always_inline)) void
 static inline uint16_t *
 port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
 {
-	static const struct {
-		uint64_t pnum; /* prebuild 4 values for pnum[]. */
-		int32_t  idx;  /* index for new last updated elemnet. */
-		uint16_t lpv;  /* add value to the last updated element. */
-	} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-	};
-
 	union {
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
@@ -314,84 +145,6 @@ static inline __attribute__((always_inline)) void
 	_mm_storeu_si128((__m128i *)eth_hdr, te);
 }
 
-static inline __attribute__((always_inline)) void
-send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
-		uint32_t num)
-{
-	uint32_t len, j, n;
-
-	len = qconf->tx_mbufs[port].len;
-
-	/*
-	 * If TX buffer for that queue is empty, and we have enough packets,
-	 * then send them straightway.
-	 */
-	if (num >= MAX_TX_BURST && len == 0) {
-		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
-		if (unlikely(n < num)) {
-			do {
-				rte_pktmbuf_free(m[n]);
-			} while (++n < num);
-		}
-		return;
-	}
-
-	/*
-	 * Put packets into TX buffer for that queue.
-	 */
-
-	n = len + num;
-	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
-
-	j = 0;
-	switch (n % FWDSTEP) {
-	while (j < n) {
-	case 0:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 3:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 2:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 1:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	}
-	}
-
-	len += n;
-
-	/* enough pkts to be sent */
-	if (unlikely(len == MAX_PKT_BURST)) {
-
-		send_burst(qconf, MAX_PKT_BURST, port);
-
-		/* copy rest of the packets into the TX buffer. */
-		len = num - n;
-		j = 0;
-		switch (len % FWDSTEP) {
-		while (j < len) {
-		case 0:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 3:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 2:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 1:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		}
-		}
-	}
-
-	qconf->tx_mbufs[port].len = len;
-}
-
 /**
  * Send packets burst from pkts_burst to the ports in dst_port array
  */
@@ -498,4 +251,4 @@ static inline __attribute__((always_inline)) void
 	}
 }
 
-#endif /* _L3FWD_COMMON_H_ */
+#endif /* _L3FWD_SSE_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 4/5] examples/l3fwd: rearrange the code for lpm_l3fwd
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
  2017-05-02  7:14 ` [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h Jianbo Liu
  2017-05-02  7:14 ` [PATCH 3/5] examples/l3fwd: extract common code from multi packet send Jianbo Liu
@ 2017-05-02  7:14 ` Jianbo Liu
  2017-05-02  7:14 ` [PATCH 5/5] examples/l3fwd: add neon support for l3fwd Jianbo Liu
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-02  7:14 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

Move some common code, which can be shared with other architectures, to l3fwd_lpm.c.
---
 examples/l3fwd/l3fwd_lpm.c     | 83 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.h     | 26 +------------
 examples/l3fwd/l3fwd_lpm_sse.h | 66 ---------------------------------
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm *ipv4_l3fwd_lookup_struct =
+		(struct rte_lpm *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+		&next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+		(struct rte_lpm6 *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+			&next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	struct ipv6_hdr *ipv6_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+					     qconf->ipv4_lookup_struct);
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					     qconf->ipv6_lookup_struct);
+	}
+
+	return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint8_t portid)
+{
+	uint32_t next_hop;
+	struct ipv6_hdr *ipv6_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+						   dst_ipv4, &next_hop) == 0)
+				   ? next_hop : portid);
+
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+				ipv6_hdr->dst_addr, &next_hop) == 0)
+				? next_hop : portid);
+
+	}
+
+	return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm *ipv4_l3fwd_lookup_struct =
-		(struct rte_lpm *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-		&next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-		(struct rte_lpm6 *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-			&next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((always_inline)) void
 l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint8_t portid,
 		struct lcore_conf *qconf)
 {
 	struct ether_hdr *eth_hdr;
 	struct ipv4_hdr *ipv4_hdr;
-	uint8_t dst_port;
+	uint16_t dst_port;
 
 	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index aa06b6d..4a9b7ed 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -36,72 +36,6 @@
 
 #include "l3fwd_sse.h"
 
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) (
-			(rte_lpm_lookup(qconf->ipv4_lookup_struct,
-					rte_be_to_cpu_32(ipv4_hdr->dst_addr),
-					&next_hop) == 0) ?
-						next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-}
-
-/*
- * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
- * precalculated. If packet is ipv6 dst_addr is taken directly from packet
- * header and dst_ipv4 value is not used.
- */
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-	uint32_t dst_ipv4, uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
-			&next_hop) == 0) ? next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-
-}
-
 /*
  * Read packet_type and destination IPV4 addresses from 4 mbufs.
  */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                   ` (2 preceding siblings ...)
  2017-05-02  7:14 ` [PATCH 4/5] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
@ 2017-05-02  7:14 ` Jianbo Liu
  2017-05-02 11:20   ` Sekhar, Ashwin
  2017-05-02 11:47   ` Sekhar, Ashwin
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                   ` (3 subsequent siblings)
  7 siblings, 2 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-02  7:14 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob; +Cc: Jianbo Liu

Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd.h             |   4 -
 examples/l3fwd/l3fwd_em.c          |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h      |   5 +
 examples/l3fwd/l3fwd_em_hlm_neon.h |  74 +++++++++++
 examples/l3fwd/l3fwd_em_single.h   |   4 +
 examples/l3fwd/l3fwd_lpm.c         |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h    | 157 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h        | 259 +++++++++++++++++++++++++++++++++++++
 8 files changed, 504 insertions(+), 7 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 011ba14..c45589a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,10 +40,6 @@
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
-#if !defined(NO_HASH_MULTI_LOOKUP) && defined(RTE_MACHINE_CPUFLAG_NEON)
-#define NO_HASH_MULTI_LOOKUP 1
-#endif
-
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index cccf797..ac1e2e0 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_single.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..3329c1a 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv4_hdr, time_to_live)));
+
+	key->xmm = vandq_s32(tmpdata0, mask0);
+}
+
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len)));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 8));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 16));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_single.h b/examples/l3fwd/l3fwd_em_single.h
index c0a9725..8604571 100644
--- a/examples/l3fwd/l3fwd_em_single.h
+++ b/examples/l3fwd/l3fwd_em_single.h
@@ -43,7 +43,11 @@
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index fc554fc..ddef250 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@ static inline __attribute__((always_inline)) uint16_t
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@ static inline __attribute__((always_inline)) uint16_t
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..772e54b
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,157 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+	uint8x16_t bswap_mask = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8,
+				 15, 14, 13, 12};
+
+	/* Byte swap 4 IPV4 addresses. */
+	dip = vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(dip),
+					      bswap_mask));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
+	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	for (j = 0; j != k; j += FWDSTEP)
+		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
+				&ipv4_flag[j / FWDSTEP]);
+
+	for (j = 0; j != k; j += FWDSTEP)
+		processx4_step2(qconf, dip[j / FWDSTEP],
+				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j],
+				&dst_port[j]);
+
+	/* Classify last up to 3 packets one by one */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fallthrough */
+	case 2:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fallthrough */
+	case 1:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..75c8976
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We are doing 4 comparisons at once and the result is a 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static inline __attribute__((always_inline)) void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h
  2017-05-02  7:14 ` [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h Jianbo Liu
@ 2017-05-02  9:40   ` Sekhar, Ashwin
  0 siblings, 0 replies; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-02  9:40 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

On Tue, 2017-05-02 at 15:14 +0800, Jianbo Liu wrote:
> The l3fwd_em_sse.h is enabled by NO_HASH_LOOKUP_MULTI.
> Renaming it because it's only for single hash lookup,
> and doesn't include any x86 SSE instructions.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  examples/l3fwd/l3fwd_em.c                            | 2 +-
>  examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_single.h} | 0
>  2 files changed, 1 insertion(+), 1 deletion(-)
>  rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_single.h} (100%)
> 
> diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
> index 939a16d..cccf797 100644
> --- a/examples/l3fwd/l3fwd_em.c
> +++ b/examples/l3fwd/l3fwd_em.c
> @@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
>  
>  #if defined(__SSE4_1__)
>  #if defined(NO_HASH_MULTI_LOOKUP)
> -#include "l3fwd_em_sse.h"
> +#include "l3fwd_em_single.h"
>  #else
>  #include "l3fwd_em_hlm.h"
>  #endif
> diff --git a/examples/l3fwd/l3fwd_em_sse.h
> b/examples/l3fwd/l3fwd_em_single.h
> similarity index 100%
> rename from examples/l3fwd/l3fwd_em_sse.h
> rename to examples/l3fwd/l3fwd_em_single.h

Shouldn't the guard __L3FWD_EM_SSE_H__ be updated
to __L3FWD_EM_SINGLE_H__ to maintain consistency?

Thanks and Regards,
Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-02  7:14 ` [PATCH 5/5] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-02 11:20   ` Sekhar, Ashwin
  2017-05-02 11:47   ` Sekhar, Ashwin
  1 sibling, 0 replies; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-02 11:20 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

Hi,

Please find comments inline.

On Tue, 2017-05-02 at 15:14 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 fowarding.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  examples/l3fwd/l3fwd.h             |   4 -
>  examples/l3fwd/l3fwd_em.c          |   4 +-
>  examples/l3fwd/l3fwd_em_hlm.h      |   5 +
>  examples/l3fwd/l3fwd_em_hlm_neon.h |  74 +++++++++++
>  examples/l3fwd/l3fwd_em_single.h   |   4 +
>  examples/l3fwd/l3fwd_lpm.c         |   4 +-
>  examples/l3fwd/l3fwd_lpm_neon.h    | 157 ++++++++++++++++++++++
>  examples/l3fwd/l3fwd_neon.h        | 259
> +++++++++++++++++++++++++++++++++++++
>  8 files changed, 504 insertions(+), 7 deletions(-)
>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_neon.h
> 
> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
> index 011ba14..c45589a 100644
> --- a/examples/l3fwd/l3fwd.h
> +++ b/examples/l3fwd/l3fwd.h
> @@ -40,10 +40,6 @@
>  
>  #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
>  
> -#if !defined(NO_HASH_MULTI_LOOKUP) &&
> defined(RTE_MACHINE_CPUFLAG_NEON)
> -#define NO_HASH_MULTI_LOOKUP 1
> -#endif
> -
>  #define MAX_PKT_BURST     32
>  #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
>  
> diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
> index cccf797..ac1e2e0 100644
> --- a/examples/l3fwd/l3fwd_em.c
> +++ b/examples/l3fwd/l3fwd_em.c
> @@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
>  	return (uint8_t)((ret < 0) ? portid :
> ipv6_l3fwd_out_if[ret]);
>  }
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  #if defined(NO_HASH_MULTI_LOOKUP)
>  #include "l3fwd_em_single.h"
>  #else
> @@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
>  			if (nb_rx == 0)
>  				continue;
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  			l3fwd_em_send_packets(nb_rx, pkts_burst,
>  							portid,
> qconf);
>  #else
> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
> b/examples/l3fwd/l3fwd_em_hlm.h
> index 636dea4..3329c1a 100644
> --- a/examples/l3fwd/l3fwd_em_hlm.h
> +++ b/examples/l3fwd/l3fwd_em_hlm.h
> @@ -35,8 +35,13 @@
>  #ifndef __L3FWD_EM_HLM_H__
>  #define __L3FWD_EM_HLM_H__
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
>  #include "l3fwd_em_hlm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#include "l3fwd_em_hlm_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) void
>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
> *m[8],
> diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h
> b/examples/l3fwd/l3fwd_em_hlm_neon.h
> new file mode 100644
> index 0000000..dae1acf
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
> @@ -0,0 +1,74 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef __L3FWD_EM_HLM_NEON_H__
> +#define __L3FWD_EM_HLM_NEON_H__
> +
> +#include <arm_neon.h>
> +
> +static inline void
> +get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
> +		union ipv4_5tuple_host *key)
> +{
> +	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0,
> int32_t *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv4_hdr,
> time_to_live)));
> +
> +	key->xmm = vandq_s32(tmpdata0, mask0);
> +}
> +
> +static inline void
> +get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
> +		int32x4_t mask1, union ipv6_5tuple_host *key)
> +{
> +	int32x4_t tmpdata0 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len)));
> +
> +	int32x4_t tmpdata1 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len) + 8));
> +
> +	int32x4_t tmpdata2 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len) + 16));
> +
> +	key->xmm[0] = vandq_s32(tmpdata0, mask0);
> +	key->xmm[1] = tmpdata1;
> +	key->xmm[2] = vandq_s32(tmpdata2, mask1);
> +}
> +#endif /* __L3FWD_EM_HLM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_em_single.h
> b/examples/l3fwd/l3fwd_em_single.h
> index c0a9725..8604571 100644
> --- a/examples/l3fwd/l3fwd_em_single.h
> +++ b/examples/l3fwd/l3fwd_em_single.h
> @@ -43,7 +43,11 @@
>   * compilation time.
>   */
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) uint16_t
>  em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf
> *pkt,
> diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
> index fc554fc..ddef250 100644
> --- a/examples/l3fwd/l3fwd_lpm.c
> +++ b/examples/l3fwd/l3fwd_lpm.c
> @@ -189,6 +189,8 @@ static inline __attribute__((always_inline))
> uint16_t
>  
>  #if defined(__SSE4_1__)
>  #include "l3fwd_lpm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_lpm_neon.h"
>  #else
>  #include "l3fwd_lpm.h"
>  #endif
> @@ -261,7 +263,7 @@ static inline __attribute__((always_inline))
> uint16_t
>  			if (nb_rx == 0)
>  				continue;
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
>  						portid, qconf);
>  #else
> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
> b/examples/l3fwd/l3fwd_lpm_neon.h
> new file mode 100644
> index 0000000..772e54b
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_lpm_neon.h
> @@ -0,0 +1,157 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef __L3FWD_LPM_NEON_H__
> +#define __L3FWD_LPM_NEON_H__
> +
> +#include <arm_neon.h>
> +
> +#include "l3fwd_neon.h"
> +
> +/*
> + * Read packet_type and destination IPV4 addresses from 4 mbufs.
> + */
> +static inline void
> +processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
> +		int32x4_t *dip,
> +		uint32_t *ipv4_flag)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ether_hdr *eth_hdr;
> +	int32_t dst[FWDSTEP];
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[0] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[1] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[1]->packet_type;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[2] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[2]->packet_type;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[3] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[3]->packet_type;
> +
> +	dip[0] = vld1q_s32(dst);
> +}
> +
> +/*
> + * Lookup into LPM for destination port.
> + * If lookup fails, use incoming port (portid) as destination port.
> + */
> +static inline void
> +processx4_step2(const struct lcore_conf *qconf,
> +		int32x4_t dip,
> +		uint32_t ipv4_flag,
> +		uint8_t portid,
> +		struct rte_mbuf *pkt[FWDSTEP],
> +		uint16_t dprt[FWDSTEP])
> +{
> +	rte_xmm_t dst;
> +	uint8x16_t bswap_mask = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
> 8,
> +				 15, 14, 13, 12};
> +
> +	/* Byte swap 4 IPV4 addresses. */
> +	dip =
> vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(dip),
> +					      bswap_mask));
> +
This can be done easily with vrev32q_u8. With this we can avoid the need
for bswap_mask. Also, the TBL instruction has a higher latency than the
rev32 instruction on thunderx, thunderx2t99 and cortexa57.

> +	/* if all 4 packets are IPV4. */
> +	if (likely(ipv4_flag)) {
> +		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip,
> dst.u32,
> +			portid);
> +		/* get rid of unused upper 16 bit for each dport. */
> +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> +	} else {
> +		dst.x = dip;
> +		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
> +						     dst.u32[0],
> portid);
> +		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
> +						     dst.u32[1],
> portid);
> +		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
> +						     dst.u32[2],
> portid);
> +		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
> +						     dst.u32[3],
> portid);
> +	}
> +}
> +
> +/*
> + * Buffer optimized handling of packets, invoked
> + * from main_loop.
> + */
> +static inline void
> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> +			uint8_t portid, struct lcore_conf *qconf)
> +{
> +	int32_t j;
> +	uint16_t dst_port[MAX_PKT_BURST];
> +	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
> +	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +
> +	for (j = 0; j != k; j += FWDSTEP)
> +		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
> +				&ipv4_flag[j / FWDSTEP]);
> +
> +	for (j = 0; j != k; j += FWDSTEP)
> +		processx4_step2(qconf, dip[j / FWDSTEP],
> +				ipv4_flag[j / FWDSTEP], portid,
> &pkts_burst[j],
> +				&dst_port[j]);
> +
> +	/* Classify last up to 3 packets one by one */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +		/* fallthrough */
> +	case 2:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +		/* fallthrough */
> +	case 1:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +	}
> +
> +	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> +}
> +
> +#endif /* __L3FWD_LPM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_neon.h
> b/examples/l3fwd/l3fwd_neon.h
> new file mode 100644
> index 0000000..75c8976
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_neon.h
> @@ -0,0 +1,259 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +
> +#ifndef _L3FWD_NEON_H_
> +#define _L3FWD_NEON_H_
> +
> +#include "l3fwd.h"
> +#include "l3fwd_common.h"
> +
> +/*
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t
> dst_port[FWDSTEP])
> +{
> +	uint32x4_t te[FWDSTEP];
> +	uint32x4_t ve[FWDSTEP];
> +	uint32_t *p[FWDSTEP];
> +
> +	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
> +	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
> +	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
> +	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
> +
> +	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +	te[0] = vld1q_u32(p[0]);
> +
> +	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
> +	te[1] = vld1q_u32(p[1]);
> +
> +	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
> +	te[2] = vld1q_u32(p[2]);
> +
> +	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
> +	te[3] = vld1q_u32(p[3]);
> +
> +	/* Update last 4 bytes */
> +	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
> +	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
> +	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
> +	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
> +
> +	vst1q_u32(p[0], ve[0]);
> +	vst1q_u32(p[1], ve[1]);
> +	vst1q_u32(p[2], ve[2]);
> +	vst1q_u32(p[3], ve[3]);
> +
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0]
> + 1),
> +		&dst_port[0], pkt[0]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1]
> + 1),
> +		&dst_port[1], pkt[1]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2]
> + 1),
> +		&dst_port[2], pkt[2]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3]
> + 1),
> +		&dst_port[3], pkt[3]->packet_type);
> +}
> +
> +/*
> + * Group consecutive packets with the same destination port in
> bursts of 4.
> + * Suppose we have array of destionation ports:
> + * dst_port[] = {a, b, c, d,, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We doing 4 comparisions at once and the result is 4 bit mask.
> + * This mask is used as an index into prebuild array of pnum values.
> + */
> +static inline uint16_t *
> +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> +	     uint16x8_t dp2)
> +{
> +	union {
> +		uint16_t u16[FWDSTEP + 1];
> +		uint64_t u64;
> +	} *pnum = (void *)pn;
> +
> +	int32_t v;
> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> +
> +	dp1 = vceqq_u16(dp1, dp2);
> +	dp1 = vandq_u16(dp1, mask);
> +	v = vaddvq_u16(dp1);
> +
> +	/* update last port counter. */
> +	lp[0] += gptbl[v].lpv;
> +
> +	/* if dest port value has changed. */
> +	if (v != GRPMSK) {
> +		pnum->u64 = gptbl[v].pnum;
> +		pnum->u16[FWDSTEP] = 1;
> +		lp = pnum->u16 + gptbl[v].idx;
> +	}
> +
> +	return lp;
> +}
> +
> +/**
> + * Process one packet:
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> +{
> +	struct ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +
> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> +			pkt->packet_type);
> +
> +	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +}
> +
> +/**
> + * Send packets burst from pkts_burst to the ports in dst_port array
> + */
> +static inline __attribute__((always_inline)) void
> +send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf
> **pkts_burst,
> +		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
> +{
> +	int32_t k;
> +	int j = 0;
> +	uint16_t dlp;
> +	uint16_t *lp;
> +	uint16_t pnum[MAX_PKT_BURST + 1];
> +
> +	/*
> +	 * Finish packet processing and group consecutive
> +	 * packets with the same destination port.
> +	 */
> +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +	if (k != 0) {
> +		uint16x8_t dp1, dp2;
> +
> +		lp = pnum;
> +		lp[0] = 1;
> +
> +		processx4_step3(pkts_burst, dst_port);
> +
> +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> +		dp1 = vld1q_u16(dst_port);
> +
> +		for (j = FWDSTEP; j != k; j += FWDSTEP) {
> +			processx4_step3(&pkts_burst[j],
> &dst_port[j]);
> +
> +			/*
> +			 * dp2:
> +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> +			 */
> +			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> +			lp  = port_groupx4(&pnum[j - FWDSTEP], lp,
> dp1, dp2);
> +
> +			/*
> +			 * dp1:
> +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> +			 */
> +			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
> +		}
> +
> +		/*
> +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> +		 */
> +		dp2 = vextq_u16(dp1, dp1, 1);
> +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2,
> 3);
> +		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> dp2);
> +
> +		/*
> +		 * remove values added by the last repeated
> +		 * dst port.
> +		 */
> +		lp[0]--;
> +		dlp = dst_port[j - 1];
> +	} else {
> +		/* set dlp and lp to the never used values. */
> +		dlp = BAD_PORT - 1;
> +		lp = pnum + MAX_PKT_BURST;
> +	}
> +
> +	/* Process up to last 3 packets one by one. */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +		/* fallthrough */
> +	case 2:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +		/* fallthrough */
> +	case 1:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +	}
> +
> +	/*
> +	 * Send packets out, through destination port.
> +	 * Consecutive packets with the same destination port
> +	 * are already grouped together.
> +	 * If destination port for the packet equals BAD_PORT,
> +	 * then free the packet without sending it out.
> +	 */
> +	for (j = 0; j < nb_rx; j += k) {
> +
> +		int32_t m;
> +		uint16_t pn;
> +
> +		pn = dst_port[j];
> +		k = pnum[j];
> +
> +		if (likely(pn != BAD_PORT))
> +			send_packetsx4(qconf, pn, pkts_burst + j,
> k);
> +		else
> +			for (m = j; m != j + k; m++)
> +				rte_pktmbuf_free(pkts_burst[m]);
> +
> +	}
> +}
> +
> +#endif /* _L3FWD_NEON_H_ */

Thanks and Regards
Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-02  7:14 ` [PATCH 5/5] examples/l3fwd: add neon support for l3fwd Jianbo Liu
  2017-05-02 11:20   ` Sekhar, Ashwin
@ 2017-05-02 11:47   ` Sekhar, Ashwin
  2017-05-03  5:24     ` Jianbo Liu
  1 sibling, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-02 11:47 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

Hi Jianbo,

I tested your neon changes on thunderx. I am seeing a performance
regression of ~10% for LPM case and ~20% for EM case with your changes.
Did you see improvement on any arm64 platform with these changes. If
yes, how much was the improvement?

FYI, I had also tried vectorizing the l3fwd app with neon. Few of the
optimizations that I can suggest that helped in my case.

* Packet data prefetch is missing in the x86 sse version compared to
the scalar version (l3fwd_lpm_send_packets vs
l3fwd_lpm_no_opt_send_packets) . I couldn't understand why this was not
done in x86. But adding the prefetch was improving performance for
thunderx.

* Offsets to some packet elements like eth_hdr, ip header, packet type
etc. are recalculated in different functions. Calculating them once,
caching them and passing them directly to different functions was
improving performance.

* There are 3 different loops in l3fwd_lpm_send_packets where we
iterate over the packets. One each for processx4_step1 and
processx4_step2 and one in send_packets_multi. Unifying these loops
were also helping.

Thanks and Regards
Ashwin


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-02 11:47   ` Sekhar, Ashwin
@ 2017-05-03  5:24     ` Jianbo Liu
  2017-05-04  8:42       ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-03  5:24 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: tomasz.kantecki, Jacob, Jerin, dev

Hi Ashwin,

On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> Hi Jianbo,
>
> I tested your neon changes on thunderx. I am seeing a performance
> regression of ~10% for LPM case and ~20% for EM case with your changes.
> Did you see improvement on any arm64 platform with these changes. If
> yes, how much was the improvement?

Thanks for your reviewing and testing.
For some reason, I have not done much with the performance testing.
I'll send a new version later after tuning the performance.

Thanks!
Jianbo

>
> FYI, I had also tried vectorizing the l3fwd app with neon. Few of the
> optimizations that I can suggest that helped in my case.
>
> * Packet data prefetch is missing in the x86 sse version compared to
> the scalar version (l3fwd_lpm_send_packets vs
> l3fwd_lpm_no_opt_send_packets) . I couldn't understand why this was not
> done in x86. But adding the prefetch was improving performance for
> thunderx.
>
> * Offsets to some packet elements like eth_hdr, ip header, packet type
> etc. are recalculated in different functions. Calculating them once,
> caching them and passing them directly to different functions was
> improving performance.
>
> * There are 3 different loops in l3fwd_lpm_send_packets where we
> iterate over the packets. One each for processx4_step1 and
> processx4_step2 and one in send_packets_multi. Unifying these loops
> were also helping.
>
> Thanks and Regards
> Ashwin
>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-03  5:24     ` Jianbo Liu
@ 2017-05-04  8:42       ` Jianbo Liu
  2017-05-05  4:24         ` Sekhar, Ashwin
  0 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-04  8:42 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: tomasz.kantecki, Jacob, Jerin, dev

Hi Ashwin,

On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:
> Hi Ashwin,
>
> On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
>> Hi Jianbo,
>>
>> I tested your neon changes on thunderx. I am seeing a performance
>> regression of ~10% for LPM case and ~20% for EM case with your changes.
>> Did you see improvement on any arm64 platform with these changes. If
>> yes, how much was the improvement?
>
> Thanks for your reviewing and testing.
> For some reason, I have not done much with the performance testing.
> I'll send a new version later after tuning the performance.
>

Can you tell me how did you test?
My testing shows that EM case is much better, while LPM is almost the
same as before.

Thanks!
Jianbo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-04  8:42       ` Jianbo Liu
@ 2017-05-05  4:24         ` Sekhar, Ashwin
  2017-05-05  5:43           ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-05  4:24 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu; +Cc: Jacob,  Jerin, tomasz.kantecki, dev

On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
> Hi Ashwin,
> 
> On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:
> > 
> > Hi Ashwin,
> > 
> > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
> > wrote:
> > > 
> > > Hi Jianbo,
> > > 
> > > I tested your neon changes on thunderx. I am seeing a performance
> > > regression of ~10% for LPM case and ~20% for EM case with your
> > > changes.
> > > Did you see improvement on any arm64 platform with these changes.
> > > If
> > > yes, how much was the improvement?
> > Thanks for your reviewing and testing.
> > For some reason, I have not done much with the performance testing.
> > I'll send a new version later after tuning the performance.
> > 
> Can you tell me how did you test?
Built with following commands.
make config T=arm64-thunderx-linuxapp-gcc
make -j32

Tested LPM with
sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)"

Tested EM with
sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)" -E

> My testing shows that EM case is much better, while LPM is almost the
> same as before.
Could you please tell on which arm64 processor/platform you tested.
Also how much was the percentage increase in performance for EM ?

> Thanks!
> Jianbo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-05  4:24         ` Sekhar, Ashwin
@ 2017-05-05  5:43           ` Jianbo Liu
  2017-05-09  8:10             ` Sekhar, Ashwin
  0 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-05  5:43 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: Jacob, Jerin, tomasz.kantecki, dev

On 5 May 2017 at 12:24, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
>> Hi Ashwin,
>>
>> On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:
>> >
>> > Hi Ashwin,
>> >
>> > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
>> > wrote:
>> > >
>> > > Hi Jianbo,
>> > >
>> > > I tested your neon changes on thunderx. I am seeing a performance
>> > > regression of ~10% for LPM case and ~20% for EM case with your
>> > > changes.
>> > > Did you see improvement on any arm64 platform with these changes.
>> > > If
>> > > yes, how much was the improvement?
>> > Thanks for your reviewing and testing.
>> > For some reason, I have not done much with the performance testing.
>> > I'll send a new version later after tuning the performance.
>> >
>> Can you tell me how did you test?
> Built with following commands.
> make config T=arm64-thunderx-linuxapp-gcc
> make -j32
>
> Tested LPM with
> sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)"
>
> Tested EM with
> sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)" -E
>

Only one port? What's the network topology, and lpm/em rules? How did
you stress traffic...?

>> My testing shows that EM case is much better, while LPM is almost the
>> same as before.
> Could you please tell on which arm64 processor/platform you tested.
> Also how much was the percentage increase in performance for EM ?
>

I'm sorry I can't tell you what's arm64 platform I tested on. But I
can get a ThunderX, and replicate your testing environment if you can
tell me more...

Thanks!
Jianbo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-05  5:43           ` Jianbo Liu
@ 2017-05-09  8:10             ` Sekhar, Ashwin
  2017-05-10  2:39               ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-09  8:10 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu; +Cc: Jacob,  Jerin, tomasz.kantecki, dev

On Fri, 2017-05-05 at 13:43 +0800, Jianbo Liu wrote:
> On 5 May 2017 at 12:24, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
> wrote:
> > 
> > On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
> > > 
> > > Hi Ashwin,
> > > 
> > > On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:
> > > > 
> > > > 
> > > > Hi Ashwin,
> > > > 
> > > > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.co
> > > > m>
> > > > wrote:
> > > > > 
> > > > > 
> > > > > Hi Jianbo,
> > > > > 
> > > > > I tested your neon changes on thunderx. I am seeing a
> > > > > performance
> > > > > regression of ~10% for LPM case and ~20% for EM case with
> > > > > your
> > > > > changes.
> > > > > Did you see improvement on any arm64 platform with these
> > > > > changes.
> > > > > If
> > > > > yes, how much was the improvement?
> > > > Thanks for your reviewing and testing.
> > > > For some reason, I have not done much with the performance
> > > > testing.
> > > > I'll send a new version later after tuning the performance.
> > > > 
> > > Can you tell me how did you test?
> > Built with following commands.
> > make config T=arm64-thunderx-linuxapp-gcc
> > make -j32
> > 
> > Tested LPM with
> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p
> > 0x1 --config="(0,0,10)"
> > 
> > Tested EM with
> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p
> > 0x1 --config="(0,0,10)" -E
> > 
> Only one port? What's the network topology, and lpm/em rules? How did
> you stress traffic...?
Single-port topology: DUT connected back-to-back to the traffic generator.

We are using the default rules in the C code. The flow generation settings are:
src.ip.min 192.168.18.1
src.ip.max 192.168.18.90
src.ip.inc 1

Also, Please let us know the topology that you are using.
> 
> > 
> > > 
> > > My testing shows that EM case is much better, while LPM is almost
> > > the
> > > same as before.
> > Could you please tell on which arm64 processor/platform you tested.
> > Also how much was the percentage increase in performance for EM ?
> > 
> I'm sorry I can't tell you what's arm64 platform I tested on. But I
> can get a ThunderX, and replicate your testing environment if you can
> tell me more...
Thanks.
> 
> Thanks!
> Jianbo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                   ` (3 preceding siblings ...)
  2017-05-02  7:14 ` [PATCH 5/5] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-10  2:30 ` Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                     ` (6 more replies)
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                   ` (2 subsequent siblings)
  7 siblings, 7 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_LOOKUP_MULTI ...

Jianbo Liu (7):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h                      | 293 +++++++++++++++++++++
 examples/l3fwd/l3fwd_em.c                          |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h                      | 220 ++++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_neon.h                 |  74 ++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h                  | 280 +-------------------
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}      |  26 +-
 examples/l3fwd/l3fwd_lpm.c                         |  87 +++++-
 examples/l3fwd/l3fwd_lpm.h                         |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h                    | 165 ++++++++++++
 examples/l3fwd/l3fwd_lpm_sse.h                     |  66 -----
 examples/l3fwd/l3fwd_neon.h                        | 259 ++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h                         | 255 +-----------------
 12 files changed, 1133 insertions(+), 626 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (86%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c         |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h     | 302 ++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +----------------------------------
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 0000000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv4_5tuple_host key[8];
+
+	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+				&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv6_5tuple_host key[8];
+
+	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+			&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) uint16_t
+em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	uint8_t next_hop;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	uint32_t tcp_or_udp;
+	uint32_t l3_ptypes;
+
+	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
+
+	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
+
+		/* Handle IPv4 headers.*/
+		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
+				qconf->ipv4_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
+
+		/* Handle IPv6 headers.*/
+		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
+				qconf->ipv6_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	}
+
+	return portid;
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+		uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+
+	/*
+	 * Send nb_rx - nb_rx%8 packets
+	 * in groups of 8.
+	 */
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+
+	for (j = 0; j < n; j += 8) {
+
+		uint32_t pkt_type =
+			pkts_burst[j]->packet_type &
+			pkts_burst[j+1]->packet_type &
+			pkts_burst[j+2]->packet_type &
+			pkts_burst[j+3]->packet_type &
+			pkts_burst[j+4]->packet_type &
+			pkts_burst[j+5]->packet_type &
+			pkts_burst[j+6]->packet_type &
+			pkts_burst[j+7]->packet_type;
+
+		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		uint32_t tcp_or_udp = pkt_type &
+			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+
+		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
+
+			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
+
+			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else {
+			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
+							portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
+							portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
+							portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
+							portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
+							portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
+							portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
+							portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
+							portid);
+		}
+	}
+
+	for (; j < nb_rx; j++)
+		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+
+}
+#endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 7714a20..cb1304f 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,104 +34,16 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-#include "l3fwd_sse.h"
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, __m128i mask0,
+		union ipv4_5tuple_host *key)
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-	__m128i data[8];
-
-	data[0] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[0], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[1] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[1], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[2] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[2], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[3] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[3], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[4] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[4], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[5] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[5], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[6] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[6], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[7] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[7], __m128i *,
+	 __m128i tmpdata0 = _mm_loadu_si128(
+			rte_pktmbuf_mtod_offset(m0, __m128i *,
 				sizeof(struct ether_hdr) +
 				offsetof(struct ipv4_hdr, time_to_live)));
 
-	key[0].xmm = _mm_and_si128(data[0], mask0.x);
-	key[1].xmm = _mm_and_si128(data[1], mask0.x);
-	key[2].xmm = _mm_and_si128(data[2], mask0.x);
-	key[3].xmm = _mm_and_si128(data[3], mask0.x);
-	key[4].xmm = _mm_and_si128(data[4], mask0.x);
-	key[5].xmm = _mm_and_si128(data[5], mask0.x);
-	key[6].xmm = _mm_and_si128(data[6], mask0.x);
-	key[7].xmm = _mm_and_si128(data[7], mask0.x);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
+	key->xmm = _mm_and_si128(tmpdata0, mask0);
 }
 
 static inline void
@@ -159,184 +71,4 @@ static inline __attribute__((always_inline)) void
 	key->xmm[1] = tmpdata1;
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
-{
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
-}
-
-static inline __attribute__((always_inline)) uint16_t
-em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint8_t next_hop;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ipv6_hdr *ipv6_hdr;
-	uint32_t tcp_or_udp;
-	uint32_t l3_ptypes;
-
-	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
-
-	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
-
-		/* Handle IPv4 headers.*/
-		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
-				qconf->ipv4_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
-
-		/* Handle IPv6 headers.*/
-		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
-				qconf->ipv6_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	}
-
-	return portid;
-}
-
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
-static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint8_t portid, struct lcore_conf *qconf)
-{
-	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
-
-	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
-	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
-
-	for (j = 0; j < n; j += 8) {
-
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
-
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
-		}
-	}
-
-	for (; j < nb_rx; j++)
-		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
-
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
-
-}
 #endif /* __L3FWD_EM_SSE_HLM_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v2 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Renaming it because it's only for sequential hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c                                | 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v2 3/7] examples/l3fwd: extract common code from multi packet send
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_common.h | 293 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h    | 255 +-----------------------------------
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 0000000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+	uint8_t ihl;
+
+	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+			dp[0] = BAD_PORT;
+
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destionation port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {             \
+		(lp)[0]++;                                   \
+	} else {                                         \
+		(dlp) = (dcp)[idx];                          \
+		(lp) = (pn) + (idx);                         \
+		(lp)[0] = 1;                                 \
+	}                                                \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+static inline __attribute__((always_inline)) void
+send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
+		uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 3:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 2:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 1:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	qconf->tx_mbufs[port].len = len;
+}
+
+#endif /* _L3FWD_COMMON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 1afa1f0..d99842b 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -32,53 +32,11 @@
  */
 
 
-#ifndef _L3FWD_COMMON_H_
-#define _L3FWD_COMMON_H_
+#ifndef _L3FWD_SSE_H_
+#define _L3FWD_SSE_H_
 
 #include "l3fwd.h"
-
-#ifdef DO_RFC_1812_CHECKS
-
-#define	IPV4_MIN_VER_IHL	0x45
-#define	IPV4_MAX_VER_IHL	0x4f
-#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
-
-/* Minimum value of IPV4 total length (20B) in network byte order. */
-#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
-
-/*
- * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
- * - The IP version number must be 4.
- * - The IP header length field must be large enough to hold the
- *    minimum length legal IP datagram (20 bytes = 5 words).
- * - The IP total length field must be large enough to hold the IP
- *   datagram header, whose length is specified in the IP header length
- *   field.
- * If we encounter invalid IPV4 packet, then set destination port for it
- * to BAD_PORT value.
- */
-static inline __attribute__((always_inline)) void
-rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
-{
-	uint8_t ihl;
-
-	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
-		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
-
-		ipv4_hdr->time_to_live--;
-		ipv4_hdr->hdr_checksum++;
-
-		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
-				((uint8_t)ipv4_hdr->total_length == 0 &&
-				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
-			dp[0] = BAD_PORT;
-
-	}
-}
-
-#else
-#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
-#endif /* DO_RFC_1812_CHECKS */
+#include "l3fwd_common.h"
 
 /*
  * Update source and destination MAC addresses in the ethernet header.
@@ -130,30 +88,6 @@ static inline __attribute__((always_inline)) void
 }
 
 /*
- * We group consecutive packets with the same destionation port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-/*
  * Group consecutive packets with the same destination port in bursts of 4.
  * Suppose we have array of destionation ports:
  * dst_port[] = {a, b, c, d,, e, ... }
@@ -164,109 +98,6 @@ static inline __attribute__((always_inline)) void
 static inline uint16_t *
 port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
 {
-	static const struct {
-		uint64_t pnum; /* prebuild 4 values for pnum[]. */
-		int32_t  idx;  /* index for new last updated elemnet. */
-		uint16_t lpv;  /* add value to the last updated element. */
-	} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-	};
-
 	union {
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
@@ -314,84 +145,6 @@ static inline __attribute__((always_inline)) void
 	_mm_storeu_si128((__m128i *)eth_hdr, te);
 }
 
-static inline __attribute__((always_inline)) void
-send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
-		uint32_t num)
-{
-	uint32_t len, j, n;
-
-	len = qconf->tx_mbufs[port].len;
-
-	/*
-	 * If TX buffer for that queue is empty, and we have enough packets,
-	 * then send them straightway.
-	 */
-	if (num >= MAX_TX_BURST && len == 0) {
-		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
-		if (unlikely(n < num)) {
-			do {
-				rte_pktmbuf_free(m[n]);
-			} while (++n < num);
-		}
-		return;
-	}
-
-	/*
-	 * Put packets into TX buffer for that queue.
-	 */
-
-	n = len + num;
-	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
-
-	j = 0;
-	switch (n % FWDSTEP) {
-	while (j < n) {
-	case 0:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 3:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 2:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 1:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	}
-	}
-
-	len += n;
-
-	/* enough pkts to be sent */
-	if (unlikely(len == MAX_PKT_BURST)) {
-
-		send_burst(qconf, MAX_PKT_BURST, port);
-
-		/* copy rest of the packets into the TX buffer. */
-		len = num - n;
-		j = 0;
-		switch (len % FWDSTEP) {
-		while (j < len) {
-		case 0:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 3:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 2:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 1:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		}
-		}
-	}
-
-	qconf->tx_mbufs[port].len = len;
-}
-
 /**
  * Send packets burst from pkts_burst to the ports in dst_port array
  */
@@ -498,4 +251,4 @@ static inline __attribute__((always_inline)) void
 	}
 }
 
-#endif /* _L3FWD_COMMON_H_ */
+#endif /* _L3FWD_SSE_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v2 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (2 preceding siblings ...)
  2017-05-10  2:30   ` [PATCH v2 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

Some common code can be used by other archs, so move it to l3fwd_lpm.c.
---
 examples/l3fwd/l3fwd_lpm.c     | 83 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.h     | 26 +------------
 examples/l3fwd/l3fwd_lpm_sse.h | 66 ---------------------------------
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm *ipv4_l3fwd_lookup_struct =
+		(struct rte_lpm *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+		&next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+		(struct rte_lpm6 *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+			&next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	struct ipv6_hdr *ipv6_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+					     qconf->ipv4_lookup_struct);
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					     qconf->ipv6_lookup_struct);
+	}
+
+	return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint8_t portid)
+{
+	uint32_t next_hop;
+	struct ipv6_hdr *ipv6_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+						   dst_ipv4, &next_hop) == 0)
+				   ? next_hop : portid);
+
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+				ipv6_hdr->dst_addr, &next_hop) == 0)
+				? next_hop : portid);
+
+	}
+
+	return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm *ipv4_l3fwd_lookup_struct =
-		(struct rte_lpm *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-		&next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-		(struct rte_lpm6 *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-			&next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((always_inline)) void
 l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint8_t portid,
 		struct lcore_conf *qconf)
 {
 	struct ether_hdr *eth_hdr;
 	struct ipv4_hdr *ipv4_hdr;
-	uint8_t dst_port;
+	uint16_t dst_port;
 
 	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index aa06b6d..4a9b7ed 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -36,72 +36,6 @@
 
 #include "l3fwd_sse.h"
 
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) (
-			(rte_lpm_lookup(qconf->ipv4_lookup_struct,
-					rte_be_to_cpu_32(ipv4_hdr->dst_addr),
-					&next_hop) == 0) ?
-						next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-}
-
-/*
- * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
- * precalculated. If packet is ipv6 dst_addr is taken directly from packet
- * header and dst_ipv4 value is not used.
- */
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-	uint32_t dst_ipv4, uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
-			&next_hop) == 0) ? next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-
-}
-
 /*
  * Read packet_type and destination IPV4 addresses from 4 mbufs.
  */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (3 preceding siblings ...)
  2017-05-10  2:30   ` [PATCH v2 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10 15:00     ` Sekhar, Ashwin
  2017-05-10  2:30   ` [PATCH v2 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file Jianbo Liu
  6 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c            |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
 examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
 examples/l3fwd/l3fwd_lpm.c           |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h      | 165 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h          | 259 +++++++++++++++++++++++++++++++++++
 7 files changed, 539 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..4ec600a 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j, pos;
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
@@ -247,6 +252,12 @@ static inline __attribute__((always_inline)) uint16_t
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+	for (j = 0; j < 8 && j < nb_rx; j++) {
+		rte_prefetch0(pkts_burst[j]);
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
 	for (j = 0; j < n; j += 8) {
 
 		uint32_t pkt_type =
@@ -263,6 +274,12 @@ static inline __attribute__((always_inline)) uint16_t
 		uint32_t tcp_or_udp = pkt_type &
 			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+			rte_prefetch0(pkts_burst[pos]);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
 			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv4_hdr, time_to_live)));
+
+	key->xmm = vandq_s32(tmpdata0, mask0);
+}
+
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len)));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 8));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 16));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index c0a9725..c3df473 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -43,7 +43,11 @@
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
@@ -101,11 +105,23 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j;
 	uint16_t dst_port[MAX_PKT_BURST];
 
-	for (j = 0; j < nb_rx; j++)
+	if (nb_rx > 0) {
+		rte_prefetch0(pkts_burst[0]);
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (i = 1, j = 0; j < nb_rx; i++, j++) {
+		if (i < nb_rx) {
+			rte_prefetch0(pkts_burst[i]);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						       struct ether_hdr *) + 1);
+		}
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+	}
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index fc554fc..ddef250 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@ static inline __attribute__((always_inline)) uint16_t
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@ static inline __attribute__((always_inline)) uint16_t
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..2f047b3
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,165 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t i, j, pos;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
+	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	for (j = 0; j < FWDSTEP && j < nb_rx; j++) {
+		rte_prefetch0(pkts_burst[j]);
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (j = 0; j != k; j += FWDSTEP) {
+		for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos < nb_rx;
+		     i++, pos++) {
+			rte_prefetch0(pkts_burst[pos]);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
+				&ipv4_flag[j / FWDSTEP]);
+
+		processx4_step2(qconf, dip[j / FWDSTEP],
+				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j],
+				&dst_port[j]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fallthrough */
+	case 2:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fallthrough */
+	case 1:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..75c8976
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into a prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static inline __attribute__((always_inline)) void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v2 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (4 preceding siblings ...)
  2017-05-10  2:30   ` [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10  2:30   ` [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file Jianbo Liu
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

New macro to define how many hash lookups are done at one time, and this
makes the code more concise.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +++++++++++++-----------------------------
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 4ec600a..10a9c95 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-
-	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv4_l3fwd_out_if[ret[i]]);
 
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv6xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv6_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv6_5tuple(m[i], mask1.x, mask2.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
 
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv6_l3fwd_out_if[ret[i]]);
+
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static inline __attribute__((always_inline)) uint16_t
@@ -247,34 +164,31 @@ static inline __attribute__((always_inline)) uint16_t
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
+	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
+	 * in groups of EM_HASH_LOOKUP_COUNT.
 	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < 8 && j < nb_rx; j++) {
+	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
 		rte_prefetch0(pkts_burst[j]);
 		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
 					       struct ether_hdr *) + 1);
 	}
 
-	for (j = 0; j < n; j += 8) {
+	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
+
+		uint32_t pkt_type = RTE_PTYPE_L3_MASK |
+				    RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
+		uint32_t l3_type, tcp_or_udp;
 
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+			pkt_type &= pkts_burst[j + i]->packet_type;
 
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
-		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
+		     i < EM_HASH_LOOKUP_COUNT && pos < nb_rx; i++, pos++) {
 			rte_prefetch0(pkts_burst[pos]);
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
 						       struct ether_hdr *) + 1);
@@ -282,31 +196,18 @@ static inline __attribute__((always_inline)) uint16_t
 
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv4xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
 
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv6xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
-							portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
-							portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
-							portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
-							portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
-							portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
-							portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
-							portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
-							portid);
+			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+				dst_port[j + i] = em_get_dst_port(qconf,
+						pkts_burst[j + i], portid);
 		}
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (5 preceding siblings ...)
  2017-05-10  2:30   ` [PATCH v2 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
@ 2017-05-10  2:30   ` Jianbo Liu
  2017-05-10 11:57     ` Sekhar, Ashwin
  6 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:30 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index c3df473..63c5c12 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -125,4 +125,4 @@ static inline __attribute__((always_inline)) uint16_t
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
  2017-05-09  8:10             ` Sekhar, Ashwin
@ 2017-05-10  2:39               ` Jianbo Liu
  0 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-10  2:39 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: Jacob, Jerin, tomasz.kantecki, dev

Hi Ashwin,

On 9 May 2017 at 16:10, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Fri, 2017-05-05 at 13:43 +0800, Jianbo Liu wrote:
>> On 5 May 2017 at 12:24, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
>> wrote:
>> >
>> > On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
>> > >
>> > > Hi Ashwin,
>> > >
>> > > On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:
>> > > >
>> > > >
>> > > > Hi Ashwin,
>> > > >
>> > > > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.co
>> > > > m>
>> > > > wrote:
>> > > > >
>> > > > >
>> > > > > Hi Jianbo,
>> > > > >
>> > > > > I tested your neon changes on thunderx. I am seeing a
>> > > > > performance
>> > > > > regression of ~10% for LPM case and ~20% for EM case with
>> > > > > your
>> > > > > changes.
>> > > > > Did you see improvement on any arm64 platform with these
>> > > > > changes.
>> > > > > If
>> > > > > yes, how much was the improvement?
>> > > > Thanks for your reviewing and testing.
>> > > > For some reason, I have not done much with the performance
>> > > > testing.
>> > > > I'll send a new version later after tuning the performance.
>> > > >
>> > > Can you tell me how did you test?
>> > Built with following commands.
>> > make config T=arm64-thunderx-linuxapp-gcc
>> > make -j32
>> >
>> > Tested LPM with
>> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p
>> > 0x1 --config="(0,0,10)"
>> >
>> > Tested EM with
>> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p
>> > 0x1 --config="(0,0,10)" -E
>> >
>> Only one port? What's the network topology, and lpm/em rules? How did
>> you stress traffic...?
> port - 1 topology: DUT connected back to back to traffic generator.
>
> We are using the default rules in the C code. flow generation is:
> src.ip.min 192.168.18.1
> src.ip.max 192.168.18.90
> src.ip.inc 1
>
> Also, Please let us know the topology that you are using.

I used two ports with one rule to forward packets from one to the other.
Sent v2, please try this new version.

Thanks!
Jianbo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file
  2017-05-10  2:30   ` [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file Jianbo Liu
@ 2017-05-10 11:57     ` Sekhar, Ashwin
  0 siblings, 0 replies; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-10 11:57 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

In commit message:
s/micro/macro/

On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
> As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the
> macro
> to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/examples/l3fwd/l3fwd_em_sequential.h
> b/examples/l3fwd/l3fwd_em_sequential.h
> index c3df473..63c5c12 100644
> --- a/examples/l3fwd/l3fwd_em_sequential.h
> +++ b/examples/l3fwd/l3fwd_em_sequential.h
> @@ -31,8 +31,8 @@
>   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
>   */
>  
> -#ifndef __L3FWD_EM_SSE_H__
> -#define __L3FWD_EM_SSE_H__
> +#ifndef __L3FWD_EM_SEQUENTIAL_H__
> +#define __L3FWD_EM_SEQUENTIAL_H__
>  
>  /**
>   * @file
> @@ -125,4 +125,4 @@ static inline __attribute__((always_inline))
> uint16_t
>  
>  	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
>  }
> -#endif /* __L3FWD_EM_SSE_H__ */
> +#endif /* __L3FWD_EM_SEQUENTIAL_H__ */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-10  2:30   ` [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-10 15:00     ` Sekhar, Ashwin
  2017-05-11  3:16       ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-10 15:00 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

Hi Jianbo,

Thanks for version v2. Addition of the prefetch instructions is
definitely helping performance on ThunderX. But still performance is
slightly less than that of scalar.

I tried few small tweaks which helped improve performance on my
Thunderx setup. For details see comments inline.


On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 forwarding.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  examples/l3fwd/l3fwd_em.c            |   4 +-
>  examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-
>  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
>  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
>  examples/l3fwd/l3fwd_lpm.c           |   4 +-
>  examples/l3fwd/l3fwd_lpm_neon.h      | 165 ++++++++++++++++++++++
>  examples/l3fwd/l3fwd_neon.h          | 259
> +++++++++++++++++++++++++++++++++++
>  7 files changed, 539 insertions(+), 6 deletions(-)
>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_neon.h
> 
> [...]
> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
> b/examples/l3fwd/l3fwd_em_hlm.h
> index 636dea4..4ec600a 100644
> --- a/examples/l3fwd/l3fwd_em_hlm.h
> +++ b/examples/l3fwd/l3fwd_em_hlm.h
> @@ -35,8 +35,13 @@
>  #ifndef __L3FWD_EM_HLM_H__
>  #define __L3FWD_EM_HLM_H__
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
>  #include "l3fwd_em_hlm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#include "l3fwd_em_hlm_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) void
>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
> *m[8],
> @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))
> uint16_t
>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>  		uint8_t portid, struct lcore_conf *qconf)
>  {
> -	int32_t j;
> +	int32_t i, j, pos;
>  	uint16_t dst_port[MAX_PKT_BURST];
>  
>  	/*
> @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))
> uint16_t
>  	 */
>  	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
>  
> +	for (j = 0; j < 8 && j < nb_rx; j++) {
> +		rte_prefetch0(pkts_burst[j]);
The above prefetch of rte_mbuf struct is unnecessary. With this we won't
see any performance improvement as the contents of rte_mbuf (buf_addr
and data_off) are used in the very next instruction. Removing the above
prefetch and similar prefetches at multiple places was improving
performance on my ThunderX setup.

> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
> +					       struct ether_hdr *) + 
> 1);
Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In
process_packet in l3fwd_neon.h, eth_header is accessed.

> +	}
> +
>  	for (j = 0; j < n; j += 8) {
>  
>  		uint32_t pkt_type =
> @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))
> uint16_t
>  		uint32_t tcp_or_udp = pkt_type &
>  			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
>  
> +		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, 
> pos++) {
> +			rte_prefetch0(pkts_burst[pos]);
The above prefetch of rte_mbuf struct is unnecessary.

> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
> s],
> +						       struct
> ether_hdr *) + 1);
Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +		}
> +
>  		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
>  
>  			em_get_dst_port_ipv4x8(qconf,
> &pkts_burst[j], portid,
> 
> [...]

> diff --git a/examples/l3fwd/l3fwd_em_sequential.h
> b/examples/l3fwd/l3fwd_em_sequential.h
> index c0a9725..c3df473 100644
> --- a/examples/l3fwd/l3fwd_em_sequential.h
> +++ b/examples/l3fwd/l3fwd_em_sequential.h
> @@ -43,7 +43,11 @@
>   * compilation time.
>   */
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) uint16_t
>  em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf
> *pkt,
> @@ -101,11 +105,23 @@ static inline __attribute__((always_inline))
> uint16_t
>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>  			uint8_t portid, struct lcore_conf *qconf)
>  {
> -	int32_t j;
> +	int32_t i, j;
>  	uint16_t dst_port[MAX_PKT_BURST];
>  
> -	for (j = 0; j < nb_rx; j++)
> +	if (nb_rx > 0) {
> +		rte_prefetch0(pkts_burst[0]);
The above prefetch of rte_mbuf struct is unnecessary.

> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
> +					       struct ether_hdr *) +
> 1);
Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +	}
> +
> +	for (i = 1, j = 0; j < nb_rx; i++, j++) {
> +		if (i < nb_rx) {
> +			rte_prefetch0(pkts_burst[i]);
The above prefetch of rte_mbuf struct is unnecessary.

> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i]
> ,
> +						       struct
> ether_hdr *) + 1);
Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +		}
>  		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j],
> portid);
> +	}
>  
>  	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
>  }
> [...]

> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
> b/examples/l3fwd/l3fwd_lpm_neon.h
> new file mode 100644
> index 0000000..2f047b3
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_lpm_neon.h
> 
> [...]

> +/*
> + * Buffer optimized handling of packets, invoked
> + * from main_loop.
> + */
> +static inline void
> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> +			uint8_t portid, struct lcore_conf *qconf)
> +{
> +	int32_t i, j, pos;
> +	uint16_t dst_port[MAX_PKT_BURST];
> +	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
If you see carefully, we dont need an array of dip. We just need a
single element. dip value is calculated in processx4_step1 and consumed
in processx4_step2, and thats it. No need to save it in an array.

> +	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
Same as dip. We dont need an array of ipv4_flag.

> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +
> +	for (j = 0; j < FWDSTEP && j < nb_rx; j++) {
> +		rte_prefetch0(pkts_burst[j]);
The above prefetch of rte_mbuf struct is unnecessary.

> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
> +					       struct ether_hdr *) +
> 1);
Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +	}
> +
> +	for (j = 0; j != k; j += FWDSTEP) {
> +		for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos <
> nb_rx;
> +		     i++, pos++) {
> +			rte_prefetch0(pkts_burst[pos]);
The above prefetch of rte_mbuf struct is unnecessary.

> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
> s],
> +						       struct
> ether_hdr *) + 1);
Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +		}
> +		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
> +				&ipv4_flag[j / FWDSTEP]);
> +
> +		processx4_step2(qconf, dip[j / FWDSTEP],
> +				ipv4_flag[j / FWDSTEP], portid,
> &pkts_burst[j],
> +				&dst_port[j]);
> +	}
> +
> +	/* Classify last up to 3 packets one by one */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +		/* fallthrough */
> +	case 2:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +		/* fallthrough */
> +	case 1:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +	}
> +
> +	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> +}
> +
> +#endif /* __L3FWD_LPM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_neon.h
> b/examples/l3fwd/l3fwd_neon.h
> new file mode 100644
> index 0000000..75c8976
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_neon.h
> [...]

> +
> +/**
> + * Process one packet:
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> +{
> +	struct ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
eth_hdr accessed here. Hence the earlier comments about prefetching at
eth header.

> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +
> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> +			pkt->packet_type);
> +
> +	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
Use vcopyq_laneq_u32 for easily doing the above.

> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +}
> +
> [...]
> +#endif /* _L3FWD_NEON_H_ */

Combining all the above comments, I made some changes on top of your
patch. These changes are giving 3-4% improvement over your version.

You may find the changes at
https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc

Please check it out and let me know your comments.

Thanks
Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-10 15:00     ` Sekhar, Ashwin
@ 2017-05-11  3:16       ` Jianbo Liu
  2017-05-11  4:14         ` Sekhar, Ashwin
  0 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  3:16 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: tomasz.kantecki, Jacob, Jerin, dev

Hi Ashwin,

On 10 May 2017 at 23:00, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> Hi Jianbo,
>
> Thanks for version v2. Addition of the prefetch instructions is
> definitely helping performance on ThunderX. But still performance is
> slightly less than that of scalar.
>
> I tried few small tweaks which helped improve performance on my
> Thunderx setup. For details see comments inline.
>
>
> On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
>> Use ARM NEON intrinsics to accelerate l3 forwarding.
>>
>> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> ---
>>  examples/l3fwd/l3fwd_em.c            |   4 +-
>>  examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-
>>  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
>>  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
>>  examples/l3fwd/l3fwd_lpm.c           |   4 +-
>>  examples/l3fwd/l3fwd_lpm_neon.h      | 165 ++++++++++++++++++++++
>>  examples/l3fwd/l3fwd_neon.h          | 259
>> +++++++++++++++++++++++++++++++++++
>>  7 files changed, 539 insertions(+), 6 deletions(-)
>>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>>  create mode 100644 examples/l3fwd/l3fwd_neon.h
>>
>> [...]
>> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
>> b/examples/l3fwd/l3fwd_em_hlm.h
>> index 636dea4..4ec600a 100644
>> --- a/examples/l3fwd/l3fwd_em_hlm.h
>> +++ b/examples/l3fwd/l3fwd_em_hlm.h
>> @@ -35,8 +35,13 @@
>>  #ifndef __L3FWD_EM_HLM_H__
>>  #define __L3FWD_EM_HLM_H__
>>
>> +#if defined(__SSE4_1__)
>>  #include "l3fwd_sse.h"
>>  #include "l3fwd_em_hlm_sse.h"
>> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
>> +#include "l3fwd_neon.h"
>> +#include "l3fwd_em_hlm_neon.h"
>> +#endif
>>
>>  static inline __attribute__((always_inline)) void
>>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
>> *m[8],
>> @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))
>> uint16_t
>>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>>               uint8_t portid, struct lcore_conf *qconf)
>>  {
>> -     int32_t j;
>> +     int32_t i, j, pos;
>>       uint16_t dst_port[MAX_PKT_BURST];
>>
>>       /*
>> @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))
>> uint16_t
>>        */
>>       int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
>>
>> +     for (j = 0; j < 8 && j < nb_rx; j++) {
>> +             rte_prefetch0(pkts_burst[j]);
> The above prefetch of rte_mbuf struct is unnecessary. With this we wont
> see any performance improvement as the contents of rte_mbuf (buf_addr
> and data_off) is used in right next instruction. Removing the above
> prefetch and similar prefetches at multiple places was improving
> performance on my ThunderX setup.

Yes, will remove them.

>
>> +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
>> +                                            struct ether_hdr *) +
>> 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In
> process_packet in l3fwd_neon.h, eth_header is accessed in
>

But ip headers are used right in each 8/FWDSTEP loop.
Since ip headers are accessed first, we should prefetch eth_hdr + 1 first.
After all nb_rx packets are handled in above small loop, their
eth_header are then accessed in processx4_step3 over again.
I'm not sure prefetching eth_hdr still works if we prefetch eth_hdr
in the first step, as the cache may already be filled with new data at
that time.

>> +     }
>> +
>>       for (j = 0; j < n; j += 8) {
>>
>>               uint32_t pkt_type =
>> @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))
>> uint16_t
>>               uint32_t tcp_or_udp = pkt_type &
>>                       (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
>>
>> +             for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++,
>> pos++) {
>> +                     rte_prefetch0(pkts_burst[pos]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
>> s],
>> +                                                    struct
>> ether_hdr *) + 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> +             }
>> +
>>               if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
>>
>>                       em_get_dst_port_ipv4x8(qconf,
>> &pkts_burst[j], portid,
>>
>> [...]
>

....

>> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
>> b/examples/l3fwd/l3fwd_lpm_neon.h
>> new file mode 100644
>> index 0000000..2f047b3
>> --- /dev/null
>> +++ b/examples/l3fwd/l3fwd_lpm_neon.h
>>
>> [...]
>
>> +/*
>> + * Buffer optimized handling of packets, invoked
>> + * from main_loop.
>> + */
>> +static inline void
>> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>> +                     uint8_t portid, struct lcore_conf *qconf)
>> +{
>> +     int32_t i, j, pos;
>> +     uint16_t dst_port[MAX_PKT_BURST];
>> +     int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
> If you see carefully, we dont need an array of dip. We just need a
> single element. dip value is calculated in processx4_step1 and consumed
> in processx4_step2, and thats it. No need to save it in an array.
>

Will change, thanks!

>> +     uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> Same as dip. We dont need an array of ipv4_flag.
>
>> +     const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
>> +
>> +     for (j = 0; j < FWDSTEP && j < nb_rx; j++) {
>> +             rte_prefetch0(pkts_burst[j]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
>> +                                            struct ether_hdr *) +
>> 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> +     }
>> +
>> +     for (j = 0; j != k; j += FWDSTEP) {
>> +             for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos <
>> nb_rx;
>> +                  i++, pos++) {
>> +                     rte_prefetch0(pkts_burst[pos]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
>> s],
>> +                                                    struct
>> ether_hdr *) + 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> +             }
>> +             processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
>> +                             &ipv4_flag[j / FWDSTEP]);
>> +
>> +             processx4_step2(qconf, dip[j / FWDSTEP],
>> +                             ipv4_flag[j / FWDSTEP], portid,
>> &pkts_burst[j],
>> +                             &dst_port[j]);
>> +     }
>> +
>> +     /* Classify last up to 3 packets one by one */
>> +     switch (nb_rx % FWDSTEP) {
>> +     case 3:
>> +             dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
>> portid);
>> +             j++;
>> +             /* fallthrough */
>> +     case 2:
>> +             dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
>> portid);
>> +             j++;
>> +             /* fallthrough */
>> +     case 1:
>> +             dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
>> portid);
>> +             j++;
>> +     }
>> +
>> +     send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
>> +}
>> +
>> +#endif /* __L3FWD_LPM_NEON_H__ */
>> diff --git a/examples/l3fwd/l3fwd_neon.h
>> b/examples/l3fwd/l3fwd_neon.h
>> new file mode 100644
>> index 0000000..75c8976
>> --- /dev/null
>> +++ b/examples/l3fwd/l3fwd_neon.h
>> [...]
>
>> +
>> +/**
>> + * Process one packet:
>> + * Update source and destination MAC addresses in the ethernet
>> header.
>> + * Perform RFC1812 checks and updates for IPV4 packets.
>> + */
>> +static inline void
>> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
>> +{
>> +     struct ether_hdr *eth_hdr;
>> +     uint32x4_t te, ve;
>> +
>> +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> eth_hdr accessed here. Hence the earlier comments about prefetching at
> eth header.
>

process_packet is called only for the last 1-3 packets, most are
handled in processx4_step3.
As these 2 functions access packets from the first one once again, the
prefetch may not work.
Please see my explanation in the above...

>> +
>> +     te = vld1q_u32((uint32_t *)eth_hdr);
>> +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
>> +
>> +
>> +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
>> +                     pkt->packet_type);
>> +
>> +     ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
> Use vcopyq_laneq_u32 for easily doing the above.
>

Will change. Thanks!

>> +     vst1q_u32((uint32_t *)eth_hdr, ve);
>> +}
>> +
>> [...]
>> +#endif /* _L3FWD_NEON_H_ */
>
> Combining all the above comments, I made some changes on top of your
> patch. These changes are giving 3-4% improvement over your version.
>
> You may find the changes at
> https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc
>

Is it correct that, in Line 103/104, you only process one packet of the
last FWDSTEP packets?
Actually, I don't like your change in l3fwd_lpm_send_packets, making
the simple logic complicated. And I don't think it can help to improve
performance. :-)

> Please check it out and let me know your comments.
>
> Thanks
> Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11  3:16       ` Jianbo Liu
@ 2017-05-11  4:14         ` Sekhar, Ashwin
  2017-05-11  4:27           ` Sekhar, Ashwin
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-11  4:14 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu; +Cc: Jacob,  Jerin, tomasz.kantecki, dev

On Thu, 2017-05-11 at 11:16 +0800, Jianbo Liu wrote:
> Hi Ashwin,
> 
> On 10 May 2017 at 23:00, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
> wrote:
> > 
> > Hi Jianbo,
> > 
> > Thanks for version v2. Addition of the prefetch instructions is
> > definitely helping performance on ThunderX. But still performance
> > is
> > slightly less than that of scalar.
> > 
> > I tried few small tweaks which helped improve performance on my
> > Thunderx setup. For details see comments inline.
> > 
> > 
> > On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
> > > 
> > > Use ARM NEON intrinsics to accelerate l3 fowarding.
> > > 
> > > Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> > > ---
> > >  examples/l3fwd/l3fwd_em.c            |   4 +-
> > >  examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-
> > >  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
> > >  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
> > >  examples/l3fwd/l3fwd_lpm.c           |   4 +-
> > >  examples/l3fwd/l3fwd_lpm_neon.h      | 165
> > > ++++++++++++++++++++++
> > >  examples/l3fwd/l3fwd_neon.h          | 259
> > > +++++++++++++++++++++++++++++++++++
> > >  7 files changed, 539 insertions(+), 6 deletions(-)
> > >  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
> > >  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
> > >  create mode 100644 examples/l3fwd/l3fwd_neon.h
> > > 
> > > [...]
> > > diff --git a/examples/l3fwd/l3fwd_em_hlm.h
> > > b/examples/l3fwd/l3fwd_em_hlm.h
> > > index 636dea4..4ec600a 100644
> > > --- a/examples/l3fwd/l3fwd_em_hlm.h
> > > +++ b/examples/l3fwd/l3fwd_em_hlm.h
> > > @@ -35,8 +35,13 @@
> > >  #ifndef __L3FWD_EM_HLM_H__
> > >  #define __L3FWD_EM_HLM_H__
> > > 
> > > +#if defined(__SSE4_1__)
> > >  #include "l3fwd_sse.h"
> > >  #include "l3fwd_em_hlm_sse.h"
> > > +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> > > +#include "l3fwd_neon.h"
> > > +#include "l3fwd_em_hlm_neon.h"
> > > +#endif
> > > 
> > >  static inline __attribute__((always_inline)) void
> > >  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
> > > *m[8],
> > > @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))
> > > uint16_t
> > >  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> > >               uint8_t portid, struct lcore_conf *qconf)
> > >  {
> > > -     int32_t j;
> > > +     int32_t i, j, pos;
> > >       uint16_t dst_port[MAX_PKT_BURST];
> > > 
> > >       /*
> > > @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))
> > > uint16_t
> > >        */
> > >       int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
> > > 
> > > +     for (j = 0; j < 8 && j < nb_rx; j++) {
> > > +             rte_prefetch0(pkts_burst[j]);
> > The above prefetch of rte_mbuf struct is unnecessary. With this we
> > wont
> > see any performance improvement as the contents of rte_mbuf
> > (buf_addr
> > and data_off) is used in right next instruction. Removing the above
> > prefetch and similar prefetches at multiple places was improving
> > performance on my ThunderX setup.
> Yes, will remove them.
> 
> > 
> > 
> > > 
> > > +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
> > > +                                            struct ether_hdr *)
> > > +
> > > 1);
> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In
> > process_packet in l3fwd_neon.h, eth_header is accessed in
> > 
> But ip headers are used right in each 8/FWDSTEP loop.
> Since ip headers are accessed first, we should prefetch eth_hdr + 1
> first.
> After all nb_rx packets are handled in above small loop, their
> eth_header are then accessed in processx4_step3 over again.
> I'm not sure prefretching eth_hdr still works if we prefetch eth_hdr
> in first step,  as cache may be already filled with new data at that
> time.
> 
Okay. 
Also, I guess if the ethernet header and ip header falls in the same
cache line (which I think would be the case mostly as I hope the packet
data will be cache aligned), it doesn't make much of a  difference
whether you prefetch at ethernet header address or ip header address.
> > 
> > > 
> > > +     }
> > > +
> > >       for (j = 0; j < n; j += 8) {
> > > 
> > >               uint32_t pkt_type =
> > > @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))
> > > uint16_t
> > >               uint32_t tcp_or_udp = pkt_type &
> > >                       (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
> > > 
> > > +             for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++,
> > > pos++) {
> > > +                     rte_prefetch0(pkts_burst[pos]);
> > The above prefetch of rte_mbuf struct is unnecessary.
> > 
> > > 
> > > +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[p
> > > o
> > > s],
> > > +                                                    struct
> > > ether_hdr *) + 1);
> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1
> > 
> > > 
> > > +             }
> > > +
> > >               if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
> > > 
> > >                       em_get_dst_port_ipv4x8(qconf,
> > > &pkts_burst[j], portid,
> > > 
> > > [...]
> ....
> 
> > 
> > > 
> > > diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
> > > b/examples/l3fwd/l3fwd_lpm_neon.h
> > > new file mode 100644
> > > index 0000000..2f047b3
> > > --- /dev/null
> > > +++ b/examples/l3fwd/l3fwd_lpm_neon.h
> > > 
> > > [...]
> > > 
> > > +/*
> > > + * Buffer optimized handling of packets, invoked
> > > + * from main_loop.
> > > + */
> > > +static inline void
> > > +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> > > +                     uint8_t portid, struct lcore_conf *qconf)
> > > +{
> > > +     int32_t i, j, pos;
> > > +     uint16_t dst_port[MAX_PKT_BURST];
> > > +     int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
> > If you see carefully, we dont need an array of dip. We just need a
> > single element. dip value is calculated in processx4_step1 and
> > consumed
> > in processx4_step2, and thats it. No need to save it in an array.
> > 
> Will change, thanks!
> 
> > 
> > > 
> > > +     uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> > Same as dip. We dont need an array of ipv4_flag.
> > 
> > > 
> > > +     const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> > > +
> > > +     for (j = 0; j < FWDSTEP && j < nb_rx; j++) {
> > > +             rte_prefetch0(pkts_burst[j]);
> > The above prefetch of rte_mbuf struct is unnecessary.
> > 
> > > 
> > > +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
> > > +                                            struct ether_hdr *)
> > > +
> > > 1);
> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1
> > 
> > > 
> > > +     }
> > > +
> > > +     for (j = 0; j != k; j += FWDSTEP) {
> > > +             for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos <
> > > nb_rx;
> > > +                  i++, pos++) {
> > > +                     rte_prefetch0(pkts_burst[pos]);
> > The above prefetch of rte_mbuf struct is unnecessary.
> > 
> > > 
> > > +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[p
> > > o
> > > s],
> > > +                                                    struct
> > > ether_hdr *) + 1);
> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1
> > 
> > > 
> > > +             }
> > > +             processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
> > > +                             &ipv4_flag[j / FWDSTEP]);
> > > +
> > > +             processx4_step2(qconf, dip[j / FWDSTEP],
> > > +                             ipv4_flag[j / FWDSTEP], portid,
> > > &pkts_burst[j],
> > > +                             &dst_port[j]);
> > > +     }
> > > +
> > > +     /* Classify last up to 3 packets one by one */
> > > +     switch (nb_rx % FWDSTEP) {
> > > +     case 3:
> > > +             dst_port[j] = lpm_get_dst_port(qconf,
> > > pkts_burst[j],
> > > portid);
> > > +             j++;
> > > +             /* fallthrough */
> > > +     case 2:
> > > +             dst_port[j] = lpm_get_dst_port(qconf,
> > > pkts_burst[j],
> > > portid);
> > > +             j++;
> > > +             /* fallthrough */
> > > +     case 1:
> > > +             dst_port[j] = lpm_get_dst_port(qconf,
> > > pkts_burst[j],
> > > portid);
> > > +             j++;
> > > +     }
> > > +
> > > +     send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> > > +}
> > > +
> > > +#endif /* __L3FWD_LPM_NEON_H__ */
> > > diff --git a/examples/l3fwd/l3fwd_neon.h
> > > b/examples/l3fwd/l3fwd_neon.h
> > > new file mode 100644
> > > index 0000000..75c8976
> > > --- /dev/null
> > > +++ b/examples/l3fwd/l3fwd_neon.h
> > > [...]
> > > 
> > > +
> > > +/**
> > > + * Process one packet:
> > > + * Update source and destination MAC addresses in the ethernet
> > > header.
> > > + * Perform RFC1812 checks and updates for IPV4 packets.
> > > + */
> > > +static inline void
> > > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> > > +{
> > > +     struct ether_hdr *eth_hdr;
> > > +     uint32x4_t te, ve;
> > > +
> > > +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> > eth_hdr accessed here. Hence the earlier comments about prefetching
> > at
> > eth header.
> > 
> process_packet is called only for the last 1-3 packets, most are
> handled in processx4_step3.
> As these 2 functions access packets from the first one once again,
> the
> prefetch may not work.
> Please see my explanation in the above...
> 
Okay.
> > 
> > > 
> > > +
> > > +     te = vld1q_u32((uint32_t *)eth_hdr);
> > > +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> > > +
> > > +
> > > +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> > > +                     pkt->packet_type);
> > > +
> > > +     ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
> > Use vcopyq_laneq_u32 for easily doing the above.
> > 
> Will change. Thanks!
> 
> > 
> > > 
> > > +     vst1q_u32((uint32_t *)eth_hdr, ve);
> > > +}
> > > +
> > > [...]
> > > +#endif /* _L3FWD_NEON_H_ */
> > Combining all the above comments, I made some changes on top of
> > your
> > patch. These changes are giving 3-4% improvement over your version.
> > 
> > You may find the changes at
> > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc
> > 
> Is it correct that in Line 103/104, you only process one packet in the
> last FWDSTEP packets?
Its doing processx4_* there. So its processing 4 packets.

> Actually, I don't like your change in l3fwd_lpm_send_packets, making
> the simple logic complicated. And I don't think it can help to
> improve
> performance. :-)
Its not making it complicated. The number of lines of code may be
higher by may be 10 lines, but the conditions of the loops are
simplified which reduces the number of branch instructions and helps
the processor to go through them faster.

If possible, please try it out on your machine.
> 
> > 
> > Please check it out and let me know your comments.
> > 
> > Thanks
> > Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11  4:14         ` Sekhar, Ashwin
@ 2017-05-11  4:27           ` Sekhar, Ashwin
  2017-05-11  6:11             ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-11  4:27 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu; +Cc: Jacob,  Jerin, tomasz.kantecki, dev


On Thu, 2017-05-11 at 04:14 +0000, Sekhar, Ashwin wrote:
...
> > > Combining all the above comments, I made some changes on top of
> > > your
> > > patch. These changes are giving 3-4% improvement over your
> > > version.
> > > 
> > > You may find the changes at
> > > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587faf
> > > c
> > > 
> > Is the correct in Line 103/104, you only process one packets in the
> > last FWDSTEP packets?
> Its doing processx4_* there. So its processing 4 packets.
> 
> > 
> > Actually, I don't like your change in l3fwd_lpm_send_packets,
> > making
> > the simple logic complicated. And I don't think it can help to
> > improve
> > performance. :-)
> Its not making it complicated. The number of lines of code may be
> higher by may be 10 lines, but the conditions of the loops are
> simplified which reduces the number of branch instructions and helps
> the processor to go through them faster.
> 
> If possible, please try it out on your machine.

Missed out one point.
Since 2 loops are of the form "for (i = 0; i < FWDSTEP; i++)" i.e. looping for
a constant number of iterations, the compiler will easily unroll them.

Thanks
Ashwin
> > 
> > 
> > > 
> > > 
> > > Please check it out and let me know your comments.
> > > 
> > > Thanks
> > > Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11  4:27           ` Sekhar, Ashwin
@ 2017-05-11  6:11             ` Jianbo Liu
  0 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  6:11 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: Jacob, Jerin, tomasz.kantecki, dev

On 11 May 2017 at 12:27, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
>
> On Thu, 2017-05-11 at 04:14 +0000, Sekhar, Ashwin wrote:
> ...
>> > > Combining all the above comments, I made some changes on top of
>> > > your
>> > > patch. These changes are giving 3-4% improvement over your
>> > > version.
>> > >
>> > > You may find the changes at
>> > > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587faf
>> > > c
>> > >
>> > Is the correct in Line 103/104, you only process one packets in the
>> > last FWDSTEP packets?
>> Its doing processx4_* there. So its processing 4 packets.
>>
>> >
>> > Actually, I don't like your change in l3fwd_lpm_send_packets,
>> > making
>> > the simple logic complicated. And I don't think it can help to
>> > improve
>> > performance. :-)
>> Its not making it complicated. The number of lines of code may be
>> higher by may be 10 lines, but the conditions of the loops are
>> simplified which reduces the number of branch instructions and helps
>> the processor to go through them faster.

I suspect there is not much improvement we can get.

>>
>> If possible, please try it out on your machine.

OK, I'll test. If no performance regression, I'll adopt your suggestion in v3.

>
> Missed out one point.
> Since 2 loops are form "for (i = 0; i < FWDSTEP; i++)" i.e. looping for
> constant number of iterations, compiler will easily unroll them.
>
> Thanks
> Ashwin
>> >
>> >
>> > >
>> > >
>> > > Please check it out and let me know your comments.
>> > >
>> > > Thanks
>> > > Ashwin

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                   ` (4 preceding siblings ...)
  2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
@ 2017-05-11  9:25 ` Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                     ` (6 more replies)
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
  7 siblings, 7 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

v3:
  - remove unnecessary prefetch for rte_mbuf
  - fix typo in git log
  - Ashwin's suggestions for performance on ThunderX

v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_MULTI_LOOKUP ...

Jianbo Liu (7):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h                      | 293 +++++++++++++++++++++
 examples/l3fwd/l3fwd_em.c                          |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h                      | 218 +++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_neon.h                 |  74 ++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h                  | 280 +-------------------
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}      |  24 +-
 examples/l3fwd/l3fwd_lpm.c                         |  87 +++++-
 examples/l3fwd/l3fwd_lpm.h                         |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h                    | 193 ++++++++++++++
 examples/l3fwd/l3fwd_lpm_sse.h                     |  66 -----
 examples/l3fwd/l3fwd_neon.h                        | 259 ++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h                         | 255 +-----------------
 12 files changed, 1157 insertions(+), 626 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (88%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c         |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h     | 302 ++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +----------------------------------
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 0000000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv4_5tuple_host key[8];
+
+	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+				&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv6_5tuple_host key[8];
+
+	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+			&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) uint16_t
+em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	uint8_t next_hop;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	uint32_t tcp_or_udp;
+	uint32_t l3_ptypes;
+
+	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
+
+	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
+
+		/* Handle IPv4 headers.*/
+		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
+				qconf->ipv4_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
+
+		/* Handle IPv6 headers.*/
+		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
+				qconf->ipv6_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	}
+
+	return portid;
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+		uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+
+	/*
+	 * Send nb_rx - nb_rx%8 packets
+	 * in groups of 8.
+	 */
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+
+	for (j = 0; j < n; j += 8) {
+
+		uint32_t pkt_type =
+			pkts_burst[j]->packet_type &
+			pkts_burst[j+1]->packet_type &
+			pkts_burst[j+2]->packet_type &
+			pkts_burst[j+3]->packet_type &
+			pkts_burst[j+4]->packet_type &
+			pkts_burst[j+5]->packet_type &
+			pkts_burst[j+6]->packet_type &
+			pkts_burst[j+7]->packet_type;
+
+		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		uint32_t tcp_or_udp = pkt_type &
+			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+
+		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
+
+			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
+
+			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else {
+			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
+							portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
+							portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
+							portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
+							portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
+							portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
+							portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
+							portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
+							portid);
+		}
+	}
+
+	for (; j < nb_rx; j++)
+		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+
+}
+#endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 7714a20..cb1304f 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,104 +34,16 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-#include "l3fwd_sse.h"
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, __m128i mask0,
+		union ipv4_5tuple_host *key)
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-	__m128i data[8];
-
-	data[0] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[0], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[1] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[1], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[2] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[2], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[3] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[3], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[4] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[4], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[5] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[5], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[6] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[6], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[7] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[7], __m128i *,
+	 __m128i tmpdata0 = _mm_loadu_si128(
+			rte_pktmbuf_mtod_offset(m0, __m128i *,
 				sizeof(struct ether_hdr) +
 				offsetof(struct ipv4_hdr, time_to_live)));
 
-	key[0].xmm = _mm_and_si128(data[0], mask0.x);
-	key[1].xmm = _mm_and_si128(data[1], mask0.x);
-	key[2].xmm = _mm_and_si128(data[2], mask0.x);
-	key[3].xmm = _mm_and_si128(data[3], mask0.x);
-	key[4].xmm = _mm_and_si128(data[4], mask0.x);
-	key[5].xmm = _mm_and_si128(data[5], mask0.x);
-	key[6].xmm = _mm_and_si128(data[6], mask0.x);
-	key[7].xmm = _mm_and_si128(data[7], mask0.x);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
+	key->xmm = _mm_and_si128(tmpdata0, mask0);
 }
 
 static inline void
@@ -159,184 +71,4 @@ static inline __attribute__((always_inline)) void
 	key->xmm[1] = tmpdata1;
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
-{
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
-}
-
-static inline __attribute__((always_inline)) uint16_t
-em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint8_t next_hop;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ipv6_hdr *ipv6_hdr;
-	uint32_t tcp_or_udp;
-	uint32_t l3_ptypes;
-
-	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
-
-	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
-
-		/* Handle IPv4 headers.*/
-		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
-				qconf->ipv4_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
-
-		/* Handle IPv6 headers.*/
-		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
-				qconf->ipv6_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	}
-
-	return portid;
-}
-
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
-static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint8_t portid, struct lcore_conf *qconf)
-{
-	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
-
-	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
-	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
-
-	for (j = 0; j < n; j += 8) {
-
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
-
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
-		}
-	}
-
-	for (; j < nb_rx; j++)
-		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
-
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
-
-}
 #endif /* __L3FWD_EM_SSE_HLM_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v3 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Renaming it because it's only for sequential hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c                                | 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v3 3/7] examples/l3fwd: extract common code from multi packet send
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_common.h | 293 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h    | 255 +-----------------------------------
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 0000000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+	uint8_t ihl;
+
+	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+			dp[0] = BAD_PORT;
+
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destionation port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {             \
+		(lp)[0]++;                                   \
+	} else {                                         \
+		(dlp) = (dcp)[idx];                          \
+		(lp) = (pn) + (idx);                         \
+		(lp)[0] = 1;                                 \
+	}                                                \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+static inline __attribute__((always_inline)) void
+send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
+		uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 3:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 2:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 1:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	qconf->tx_mbufs[port].len = len;
+}
+
+#endif /* _L3FWD_COMMON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 1afa1f0..d99842b 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -32,53 +32,11 @@
  */
 
 
-#ifndef _L3FWD_COMMON_H_
-#define _L3FWD_COMMON_H_
+#ifndef _L3FWD_SSE_H_
+#define _L3FWD_SSE_H_
 
 #include "l3fwd.h"
-
-#ifdef DO_RFC_1812_CHECKS
-
-#define	IPV4_MIN_VER_IHL	0x45
-#define	IPV4_MAX_VER_IHL	0x4f
-#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
-
-/* Minimum value of IPV4 total length (20B) in network byte order. */
-#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
-
-/*
- * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
- * - The IP version number must be 4.
- * - The IP header length field must be large enough to hold the
- *    minimum length legal IP datagram (20 bytes = 5 words).
- * - The IP total length field must be large enough to hold the IP
- *   datagram header, whose length is specified in the IP header length
- *   field.
- * If we encounter invalid IPV4 packet, then set destination port for it
- * to BAD_PORT value.
- */
-static inline __attribute__((always_inline)) void
-rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
-{
-	uint8_t ihl;
-
-	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
-		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
-
-		ipv4_hdr->time_to_live--;
-		ipv4_hdr->hdr_checksum++;
-
-		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
-				((uint8_t)ipv4_hdr->total_length == 0 &&
-				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
-			dp[0] = BAD_PORT;
-
-	}
-}
-
-#else
-#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
-#endif /* DO_RFC_1812_CHECKS */
+#include "l3fwd_common.h"
 
 /*
  * Update source and destination MAC addresses in the ethernet header.
@@ -130,30 +88,6 @@ static inline __attribute__((always_inline)) void
 }
 
 /*
- * We group consecutive packets with the same destionation port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-/*
  * Group consecutive packets with the same destination port in bursts of 4.
  * Suppose we have array of destionation ports:
  * dst_port[] = {a, b, c, d,, e, ... }
@@ -164,109 +98,6 @@ static inline __attribute__((always_inline)) void
 static inline uint16_t *
 port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
 {
-	static const struct {
-		uint64_t pnum; /* prebuild 4 values for pnum[]. */
-		int32_t  idx;  /* index for new last updated elemnet. */
-		uint16_t lpv;  /* add value to the last updated element. */
-	} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-	};
-
 	union {
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
@@ -314,84 +145,6 @@ static inline __attribute__((always_inline)) void
 	_mm_storeu_si128((__m128i *)eth_hdr, te);
 }
 
-static inline __attribute__((always_inline)) void
-send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
-		uint32_t num)
-{
-	uint32_t len, j, n;
-
-	len = qconf->tx_mbufs[port].len;
-
-	/*
-	 * If TX buffer for that queue is empty, and we have enough packets,
-	 * then send them straightway.
-	 */
-	if (num >= MAX_TX_BURST && len == 0) {
-		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
-		if (unlikely(n < num)) {
-			do {
-				rte_pktmbuf_free(m[n]);
-			} while (++n < num);
-		}
-		return;
-	}
-
-	/*
-	 * Put packets into TX buffer for that queue.
-	 */
-
-	n = len + num;
-	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
-
-	j = 0;
-	switch (n % FWDSTEP) {
-	while (j < n) {
-	case 0:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 3:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 2:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 1:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	}
-	}
-
-	len += n;
-
-	/* enough pkts to be sent */
-	if (unlikely(len == MAX_PKT_BURST)) {
-
-		send_burst(qconf, MAX_PKT_BURST, port);
-
-		/* copy rest of the packets into the TX buffer. */
-		len = num - n;
-		j = 0;
-		switch (len % FWDSTEP) {
-		while (j < len) {
-		case 0:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 3:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 2:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 1:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		}
-		}
-	}
-
-	qconf->tx_mbufs[port].len = len;
-}
-
 /**
  * Send packets burst from pkts_burst to the ports in dst_port array
  */
@@ -498,4 +251,4 @@ static inline __attribute__((always_inline)) void
 	}
 }
 
-#endif /* _L3FWD_COMMON_H_ */
+#endif /* _L3FWD_SSE_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v3 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (2 preceding siblings ...)
  2017-05-11  9:25   ` [PATCH v3 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

Some common code can be used by other ARCHs, so move it to l3fwd_lpm.c
---
 examples/l3fwd/l3fwd_lpm.c     | 83 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.h     | 26 +------------
 examples/l3fwd/l3fwd_lpm_sse.h | 66 ---------------------------------
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm *ipv4_l3fwd_lookup_struct =
+		(struct rte_lpm *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+		&next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+		(struct rte_lpm6 *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+			&next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	struct ipv6_hdr *ipv6_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+					     qconf->ipv4_lookup_struct);
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					     qconf->ipv6_lookup_struct);
+	}
+
+	return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint8_t portid)
+{
+	uint32_t next_hop;
+	struct ipv6_hdr *ipv6_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+						   dst_ipv4, &next_hop) == 0)
+				   ? next_hop : portid);
+
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+				ipv6_hdr->dst_addr, &next_hop) == 0)
+				? next_hop : portid);
+
+	}
+
+	return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm *ipv4_l3fwd_lookup_struct =
-		(struct rte_lpm *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-		&next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-		(struct rte_lpm6 *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-			&next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((always_inline)) void
 l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint8_t portid,
 		struct lcore_conf *qconf)
 {
 	struct ether_hdr *eth_hdr;
 	struct ipv4_hdr *ipv4_hdr;
-	uint8_t dst_port;
+	uint16_t dst_port;
 
 	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index aa06b6d..4a9b7ed 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -36,72 +36,6 @@
 
 #include "l3fwd_sse.h"
 
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) (
-			(rte_lpm_lookup(qconf->ipv4_lookup_struct,
-					rte_be_to_cpu_32(ipv4_hdr->dst_addr),
-					&next_hop) == 0) ?
-						next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-}
-
-/*
- * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
- * precalculated. If packet is ipv6 dst_addr is taken directly from packet
- * header and dst_ipv4 value is not used.
- */
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-	uint32_t dst_ipv4, uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
-			&next_hop) == 0) ? next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-
-}
-
 /*
  * Read packet_type and destination IPV4 addresses from 4 mbufs.
  */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (3 preceding siblings ...)
  2017-05-11  9:25   ` [PATCH v3 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  2017-05-11  9:49     ` Sekhar, Ashwin
  2017-05-11  9:25   ` [PATCH v3 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 7/7] examples/l3fwd: change the guard macro name for header file Jianbo Liu
  6 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Use ARM NEON intrinsics to accelerate L3 forwarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c            |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h        |  17 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
 examples/l3fwd/l3fwd_em_sequential.h |  18 ++-
 examples/l3fwd/l3fwd_lpm.c           |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h      | 193 ++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h          | 259 +++++++++++++++++++++++++++++++++++
 7 files changed, 563 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..b9163e3 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j, pos;
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
@@ -247,6 +252,11 @@ static inline __attribute__((always_inline)) uint16_t
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+	for (j = 0; j < 8 && j < nb_rx; j++) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
 	for (j = 0; j < n; j += 8) {
 
 		uint32_t pkt_type =
@@ -263,6 +273,11 @@ static inline __attribute__((always_inline)) uint16_t
 		uint32_t tcp_or_udp = pkt_type &
 			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
 			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv4_hdr, time_to_live)));
+
+	key->xmm = vandq_s32(tmpdata0, mask0);
+}
+
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len)));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 8));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 16));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index c0a9725..2b3ec16 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -43,7 +43,11 @@
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
@@ -101,11 +105,21 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j;
 	uint16_t dst_port[MAX_PKT_BURST];
 
-	for (j = 0; j < nb_rx; j++)
+	if (nb_rx > 0) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (i = 1, j = 0; j < nb_rx; i++, j++) {
+		if (i < nb_rx) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						       struct ether_hdr *) + 1);
+		}
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+	}
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index fc554fc..ddef250 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@ static inline __attribute__((always_inline)) uint16_t
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@ static inline __attribute__((always_inline)) uint16_t
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..baedbfe
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,193 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t i = 0, j = 0;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip;
+	uint32_t ipv4_flag;
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+
+	if (k) {
+		for (i = 0; i < FWDSTEP; i++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						struct ether_hdr *) + 1);
+		}
+
+		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
+			for (i = 0; i < FWDSTEP; i++) {
+				rte_prefetch0(rte_pktmbuf_mtod(
+						pkts_burst[j + i + FWDSTEP],
+						struct ether_hdr *) + 1);
+			}
+
+			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+			processx4_step2(qconf, dip, ipv4_flag, portid,
+					&pkts_burst[j], &dst_port[j]);
+		}
+
+		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
+				&dst_port[j]);
+
+		j += FWDSTEP;
+	}
+
+	if (m) {
+		/* Prefetch last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 2:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 1:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+		}
+
+		j -= m;
+		/* Classify last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 2:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 1:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+		}
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..7a91afc
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into a prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vcopyq_lane_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static inline __attribute__((always_inline)) void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v3 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (4 preceding siblings ...)
  2017-05-11  9:25   ` [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  2017-05-11  9:25   ` [PATCH v3 7/7] examples/l3fwd: change the guard macro name for header file Jianbo Liu
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

New macro to define how many hash lookups are done at one time, and this
makes the code more concise.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +++++++++++++-----------------------------
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index b9163e3..098b396 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-
-	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv4_l3fwd_out_if[ret[i]]);
 
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv6xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv6_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv6_5tuple(m[i], mask1.x, mask2.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
 
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv6_l3fwd_out_if[ret[i]]);
+
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static inline __attribute__((always_inline)) uint16_t
@@ -247,64 +164,48 @@ static inline __attribute__((always_inline)) uint16_t
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
+	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
+	 * in groups of EM_HASH_LOOKUP_COUNT.
 	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < 8 && j < nb_rx; j++) {
+	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
 		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
 					       struct ether_hdr *) + 1);
 	}
 
-	for (j = 0; j < n; j += 8) {
+	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
+
+		uint32_t pkt_type = RTE_PTYPE_L3_MASK |
+				    RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
+		uint32_t l3_type, tcp_or_udp;
 
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+			pkt_type &= pkts_burst[j + i]->packet_type;
 
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
-		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
+		     i < EM_HASH_LOOKUP_COUNT && pos < nb_rx; i++, pos++) {
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
 						       struct ether_hdr *) + 1);
 		}
 
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv4xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
 
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv6xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
-							portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
-							portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
-							portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
-							portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
-							portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
-							portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
-							portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
-							portid);
+			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+				dst_port[j + i] = em_get_dst_port(qconf,
+						pkts_burst[j + i], portid);
 		}
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v3 7/7] examples/l3fwd: change the guard macro name for header file
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (5 preceding siblings ...)
  2017-05-11  9:25   ` [PATCH v3 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
@ 2017-05-11  9:25   ` Jianbo Liu
  6 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11  9:25 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index 2b3ec16..c7d477d 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -123,4 +123,4 @@ static inline __attribute__((always_inline)) uint16_t
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11  9:25   ` [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-11  9:49     ` Sekhar, Ashwin
  2017-05-11 10:01       ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-11  9:49 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

Hi Jianbo,

Thanks for v3. Small compilation error. See inline comment. Otherwise
it looks fine.

On Thu, 2017-05-11 at 17:25 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 forwarding.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
[...]

> +/**
> + * Process one packet:
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> +{
> +	struct ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +
> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> +			pkt->packet_type);
> +
> +	ve = vcopyq_lane_u32(ve, 3, te, 3);
Compilation error here. This should be vcopyq_laneq_u32 (Extra q after
lane)
> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +}
> +
[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11  9:49     ` Sekhar, Ashwin
@ 2017-05-11 10:01       ` Jianbo Liu
  2017-05-11 10:27         ` Sekhar, Ashwin
  0 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-11 10:01 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: tomasz.kantecki, Jacob, Jerin, dev

On 11 May 2017 at 17:49, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> Hi Jianbo,
>
> Thanks for v3. Small compilation error. See inline comment. Otherwise
> it looks fine.
>
> On Thu, 2017-05-11 at 17:25 +0800, Jianbo Liu wrote:
>> Use ARM NEON intrinsics to accelerate l3 forwarding.
>>
>> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> ---
> [...]
>
>> +/**
>> + * Process one packet:
>> + * Update source and destination MAC addresses in the ethernet
>> header.
>> + * Perform RFC1812 checks and updates for IPV4 packets.
>> + */
>> +static inline void
>> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
>> +{
>> +     struct ether_hdr *eth_hdr;
>> +     uint32x4_t te, ve;
>> +
>> +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
>> +
>> +     te = vld1q_u32((uint32_t *)eth_hdr);
>> +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
>> +
>> +
>> +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
>> +                     pkt->packet_type);
>> +
>> +     ve = vcopyq_lane_u32(ve, 3, te, 3);
> Compilation error here. This should be vcopyq_laneq_u32 (Extra q after
> lane)

No vcopyq_laneq_u32 in arm_neon.h of my environment. I thought it's a
typo so I changed.

my gcc version 4.8.5 20150623 (Red Hat 4.8.5-11) (GCC).
What about yours?

>> +     vst1q_u32((uint32_t *)eth_hdr, ve);
>> +}
>> +
> [...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11 10:01       ` Jianbo Liu
@ 2017-05-11 10:27         ` Sekhar, Ashwin
  2017-05-12  2:40           ` Jianbo Liu
  0 siblings, 1 reply; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-11 10:27 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu; +Cc: Jacob,  Jerin, tomasz.kantecki, dev

On Thu, 2017-05-11 at 18:01 +0800, Jianbo Liu wrote:
> On 11 May 2017 at 17:49, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
> wrote:
> > 
> > Hi Jianbo,
> > 
> > Thanks for v3. Small compilation error. See inline comment.
> > Otherwise
> > it looks fine.
> > 
> > On Thu, 2017-05-11 at 17:25 +0800, Jianbo Liu wrote:
> > > 
> > > Use ARM NEON intrinsics to accelerate l3 fowarding.
> > > 
> > > Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> > > ---
> > [...]
> > 
> > > 
> > > +/**
> > > + * Process one packet:
> > > + * Update source and destination MAC addresses in the ethernet
> > > header.
> > > + * Perform RFC1812 checks and updates for IPV4 packets.
> > > + */
> > > +static inline void
> > > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> > > +{
> > > +     struct ether_hdr *eth_hdr;
> > > +     uint32x4_t te, ve;
> > > +
> > > +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> > > +
> > > +     te = vld1q_u32((uint32_t *)eth_hdr);
> > > +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> > > +
> > > +
> > > +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> > > +                     pkt->packet_type);
> > > +
> > > +     ve = vcopyq_lane_u32(ve, 3, te, 3);
> > Compilation error here. This should be vcopyq_laneq_u32 (Extra q
> > after
> > lane)
> No vcopyq_laneq_u32 in arm_neon.h of my environment. I thought it's a
> typo so I changed.
> 
> my gcc version 4.8.5 20150623 (Red Hat 4.8.5-11) (GCC).
> What about yours?
> 
I am using GCC 7.1. No error with this version.

Also to cross check I tried the following versions as well which all
gave compilation errors.
 * gcc (Ubuntu/Linaro 4.9.2-10ubuntu13) 4.9.2
 * gcc 5.3
 * GCC 6.3

So looks like vcopyq_laneq_u32 is not supported in GCC versions < 7.
We can add a wrapper for the same in
./lib/librte_eal/common/include/arch/arm/rte_vect.h for gcc versions <
7.

But I think we can defer this activity. Because I have some other
patches, which moves around the definition of GCC_VERSION, and adds
wrappers for some unsupported intrinsics. Please see below.
http://dpdk.org/dev/patchwork/patch/24161/
http://dpdk.org/dev/patchwork/patch/24162/

I think we can add the vcopyq_laneq_u32 change and the wrapper for the
same after the above patches are merged.

And FYI - Documentation for the vcopyq_laneq_u32 can be found in below
document.
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0
073A_arm_neon_intrinsics_ref.pdf

> > 
> > > 
> > > +     vst1q_u32((uint32_t *)eth_hdr, ve);
> > > +}
> > > +
> > [...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd
  2017-05-11 10:27         ` Sekhar, Ashwin
@ 2017-05-12  2:40           ` Jianbo Liu
  0 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-12  2:40 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: Jacob, Jerin, tomasz.kantecki, dev

On 11 May 2017 at 18:27, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Thu, 2017-05-11 at 18:01 +0800, Jianbo Liu wrote:
>> On 11 May 2017 at 17:49, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
>> wrote:
>> >
>> > Hi Jianbo,
>> >
>> > Thanks for v3. Small compilation error. See inline comment.
>> > Otherwise
>> > it looks fine.
>> >
>> > On Thu, 2017-05-11 at 17:25 +0800, Jianbo Liu wrote:
>> > >
>> > > Use ARM NEON intrinsics to accelerate l3 fowarding.
>> > >
>> > > Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> > > ---
>> > [...]
>> >
>> > >
>> > > +/**
>> > > + * Process one packet:
>> > > + * Update source and destination MAC addresses in the ethernet
>> > > header.
>> > > + * Perform RFC1812 checks and updates for IPV4 packets.
>> > > + */
>> > > +static inline void
>> > > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
>> > > +{
>> > > +     struct ether_hdr *eth_hdr;
>> > > +     uint32x4_t te, ve;
>> > > +
>> > > +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
>> > > +
>> > > +     te = vld1q_u32((uint32_t *)eth_hdr);
>> > > +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
>> > > +
>> > > +
>> > > +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
>> > > +                     pkt->packet_type);
>> > > +
>> > > +     ve = vcopyq_lane_u32(ve, 3, te, 3);
>> > Compilation error here. This should be vcopyq_laneq_u32 (Extra q
>> > after
>> > lane)
>> No vcopyq_laneq_u32 in arm_neon.h of my environment. I thought it's a
>> typo so I changed.
>>
>> my gcc version 4.8.5 20150623 (Red Hat 4.8.5-11) (GCC).
>> What about yours?
>>
> I am using GCC 7.1. No error with this version.
>
> Also to cross check I tried the following versions as well which all
> gave compilation errors.
>  * gcc (Ubuntu/Linaro 4.9.2-10ubuntu13) 4.9.2
>  * gcc 5.3
>  * GCC 6.3
>
> So looks like vcopyq_laneq_u32 is not supported in GCC versions < 7.
> We can add a wrapper for the same in
> ./lib/librte_eal/common/include/arch/arm/rte_vect.h for gcc versions <
> 7.

OK

>
> But I think we can defer this activity. Because I have some other
> patches, which moves around the definition of GCC_VERSION, and adds
> wrappers for some unsupported instrinsics. Please see below.
> http://dpdk.org/dev/patchwork/patch/24161/
> http://dpdk.org/dev/patchwork/patch/24162/
>
> I think we can add the vcopyq_laneq_u32 change and the wrapper for the
> same after the above patches are merged.
>

I'll add that after your patches are ready.

> And FYI - Documentation for the vcopyq_laneq_u32 can be found in below
> document.
> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0
> 073A_arm_neon_intrinsics_ref.pdf
>

Thanks!

>> >
>> > >
>> > > +     vst1q_u32((uint32_t *)eth_hdr, ve);
>> > > +}
>> > > +
>> > [...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                   ` (5 preceding siblings ...)
  2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
@ 2017-05-15  3:34 ` Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                     ` (8 more replies)
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
  7 siblings, 9 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

v4:
  - add vcopyq_laneq_u32 for older version of gcc

v3:
  - remove unnecessary prefetch for rte_mbuf
  - fix typo in git log
  - Ashwin's suggestions for performance on ThunderX

v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_LOOKUP_MULTI ...

Jianbo Liu (8):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  arch/arm: add vcopyq_laneq_u32 for old version of gcc
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h                      | 293 +++++++++++++++++++++
 examples/l3fwd/l3fwd_em.c                          |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h                      | 218 +++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_neon.h                 |  74 ++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h                  | 280 +-------------------
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}      |  24 +-
 examples/l3fwd/l3fwd_lpm.c                         |  87 +++++-
 examples/l3fwd/l3fwd_lpm.h                         |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h                    | 193 ++++++++++++++
 examples/l3fwd/l3fwd_lpm_sse.h                     |  66 -----
 examples/l3fwd/l3fwd_neon.h                        | 259 ++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h                         | 255 +-----------------
 lib/librte_eal/common/include/arch/arm/rte_vect.h  |   9 +
 13 files changed, 1166 insertions(+), 626 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (88%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v4 1/8] examples/l3fwd: extract arch independent code from multi hash lookup
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c         |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h     | 302 ++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +----------------------------------
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 0000000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv4_5tuple_host key[8];
+
+	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+				&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv6_5tuple_host key[8];
+
+	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+			&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static inline __attribute__((always_inline)) uint16_t
+em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	uint8_t next_hop;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	uint32_t tcp_or_udp;
+	uint32_t l3_ptypes;
+
+	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
+
+	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
+
+		/* Handle IPv4 headers.*/
+		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
+				qconf->ipv4_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
+
+		/* Handle IPv6 headers.*/
+		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
+				qconf->ipv6_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	}
+
+	return portid;
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+		uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+
+	/*
+	 * Send nb_rx - nb_rx%8 packets
+	 * in groups of 8.
+	 */
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+
+	for (j = 0; j < n; j += 8) {
+
+		uint32_t pkt_type =
+			pkts_burst[j]->packet_type &
+			pkts_burst[j+1]->packet_type &
+			pkts_burst[j+2]->packet_type &
+			pkts_burst[j+3]->packet_type &
+			pkts_burst[j+4]->packet_type &
+			pkts_burst[j+5]->packet_type &
+			pkts_burst[j+6]->packet_type &
+			pkts_burst[j+7]->packet_type;
+
+		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		uint32_t tcp_or_udp = pkt_type &
+			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+
+		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
+
+			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
+
+			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else {
+			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
+							portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
+							portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
+							portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
+							portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
+							portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
+							portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
+							portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
+							portid);
+		}
+	}
+
+	for (; j < nb_rx; j++)
+		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+
+}
+#endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 7714a20..cb1304f 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,104 +34,16 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-#include "l3fwd_sse.h"
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, __m128i mask0,
+		union ipv4_5tuple_host *key)
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-	__m128i data[8];
-
-	data[0] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[0], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[1] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[1], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[2] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[2], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[3] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[3], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[4] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[4], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[5] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[5], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[6] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[6], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[7] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[7], __m128i *,
+	 __m128i tmpdata0 = _mm_loadu_si128(
+			rte_pktmbuf_mtod_offset(m0, __m128i *,
 				sizeof(struct ether_hdr) +
 				offsetof(struct ipv4_hdr, time_to_live)));
 
-	key[0].xmm = _mm_and_si128(data[0], mask0.x);
-	key[1].xmm = _mm_and_si128(data[1], mask0.x);
-	key[2].xmm = _mm_and_si128(data[2], mask0.x);
-	key[3].xmm = _mm_and_si128(data[3], mask0.x);
-	key[4].xmm = _mm_and_si128(data[4], mask0.x);
-	key[5].xmm = _mm_and_si128(data[5], mask0.x);
-	key[6].xmm = _mm_and_si128(data[6], mask0.x);
-	key[7].xmm = _mm_and_si128(data[7], mask0.x);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
+	key->xmm = _mm_and_si128(tmpdata0, mask0);
 }
 
 static inline void
@@ -159,184 +71,4 @@ static inline __attribute__((always_inline)) void
 	key->xmm[1] = tmpdata1;
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
-
-static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
-{
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
-}
-
-static inline __attribute__((always_inline)) uint16_t
-em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint8_t next_hop;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ipv6_hdr *ipv6_hdr;
-	uint32_t tcp_or_udp;
-	uint32_t l3_ptypes;
-
-	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
-
-	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
-
-		/* Handle IPv4 headers.*/
-		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
-				qconf->ipv4_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
-
-		/* Handle IPv6 headers.*/
-		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
-				qconf->ipv6_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	}
-
-	return portid;
-}
-
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
-static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint8_t portid, struct lcore_conf *qconf)
-{
-	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
-
-	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
-	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
-
-	for (j = 0; j < n; j += 8) {
-
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
-
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
-		}
-	}
-
-	for (; j < nb_rx; j++)
-		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
-
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
-
-}
 #endif /* __L3FWD_EM_SSE_HLM_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

The l3fwd_em_sse.h is enabled by NO_HASH_LOOKUP_MULTI.
Renaming it because it's only for sequential hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c                                | 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 3/8] examples/l3fwd: extract common code from multi packet send
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_common.h | 293 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h    | 255 +-----------------------------------
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 0000000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+	uint8_t ihl;
+
+	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+			dp[0] = BAD_PORT;
+
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destionation port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {             \
+		(lp)[0]++;                                   \
+	} else {                                         \
+		(dlp) = (dcp)[idx];                          \
+		(lp) = (pn) + (idx);                         \
+		(lp)[0] = 1;                                 \
+	}                                                \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+static inline __attribute__((always_inline)) void
+send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
+		uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 3:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 2:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 1:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	qconf->tx_mbufs[port].len = len;
+}
+
+#endif /* _L3FWD_COMMON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 1afa1f0..d99842b 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -32,53 +32,11 @@
  */
 
 
-#ifndef _L3FWD_COMMON_H_
-#define _L3FWD_COMMON_H_
+#ifndef _L3FWD_SSE_H_
+#define _L3FWD_SSE_H_
 
 #include "l3fwd.h"
-
-#ifdef DO_RFC_1812_CHECKS
-
-#define	IPV4_MIN_VER_IHL	0x45
-#define	IPV4_MAX_VER_IHL	0x4f
-#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
-
-/* Minimum value of IPV4 total length (20B) in network byte order. */
-#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
-
-/*
- * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
- * - The IP version number must be 4.
- * - The IP header length field must be large enough to hold the
- *    minimum length legal IP datagram (20 bytes = 5 words).
- * - The IP total length field must be large enough to hold the IP
- *   datagram header, whose length is specified in the IP header length
- *   field.
- * If we encounter invalid IPV4 packet, then set destination port for it
- * to BAD_PORT value.
- */
-static inline __attribute__((always_inline)) void
-rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
-{
-	uint8_t ihl;
-
-	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
-		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
-
-		ipv4_hdr->time_to_live--;
-		ipv4_hdr->hdr_checksum++;
-
-		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
-				((uint8_t)ipv4_hdr->total_length == 0 &&
-				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
-			dp[0] = BAD_PORT;
-
-	}
-}
-
-#else
-#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
-#endif /* DO_RFC_1812_CHECKS */
+#include "l3fwd_common.h"
 
 /*
  * Update source and destination MAC addresses in the ethernet header.
@@ -130,30 +88,6 @@ static inline __attribute__((always_inline)) void
 }
 
 /*
- * We group consecutive packets with the same destionation port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-/*
  * Group consecutive packets with the same destination port in bursts of 4.
  * Suppose we have array of destionation ports:
  * dst_port[] = {a, b, c, d,, e, ... }
@@ -164,109 +98,6 @@ static inline __attribute__((always_inline)) void
 static inline uint16_t *
 port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
 {
-	static const struct {
-		uint64_t pnum; /* prebuild 4 values for pnum[]. */
-		int32_t  idx;  /* index for new last updated elemnet. */
-		uint16_t lpv;  /* add value to the last updated element. */
-	} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-	};
-
 	union {
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
@@ -314,84 +145,6 @@ static inline __attribute__((always_inline)) void
 	_mm_storeu_si128((__m128i *)eth_hdr, te);
 }
 
-static inline __attribute__((always_inline)) void
-send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
-		uint32_t num)
-{
-	uint32_t len, j, n;
-
-	len = qconf->tx_mbufs[port].len;
-
-	/*
-	 * If TX buffer for that queue is empty, and we have enough packets,
-	 * then send them straightway.
-	 */
-	if (num >= MAX_TX_BURST && len == 0) {
-		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
-		if (unlikely(n < num)) {
-			do {
-				rte_pktmbuf_free(m[n]);
-			} while (++n < num);
-		}
-		return;
-	}
-
-	/*
-	 * Put packets into TX buffer for that queue.
-	 */
-
-	n = len + num;
-	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
-
-	j = 0;
-	switch (n % FWDSTEP) {
-	while (j < n) {
-	case 0:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 3:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 2:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	case 1:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	}
-	}
-
-	len += n;
-
-	/* enough pkts to be sent */
-	if (unlikely(len == MAX_PKT_BURST)) {
-
-		send_burst(qconf, MAX_PKT_BURST, port);
-
-		/* copy rest of the packets into the TX buffer. */
-		len = num - n;
-		j = 0;
-		switch (len % FWDSTEP) {
-		while (j < len) {
-		case 0:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 3:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 2:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		case 1:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		}
-		}
-	}
-
-	qconf->tx_mbufs[port].len = len;
-}
-
 /**
  * Send packets burst from pkts_burst to the ports in dst_port array
  */
@@ -498,4 +251,4 @@ static inline __attribute__((always_inline)) void
 	}
 }
 
-#endif /* _L3FWD_COMMON_H_ */
+#endif /* _L3FWD_SSE_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (2 preceding siblings ...)
  2017-05-15  3:34   ` [PATCH v4 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

Some common code can be used by other ARCHs, move to l3fwd_lpm.c
---
 examples/l3fwd/l3fwd_lpm.c     | 83 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.h     | 26 +------------
 examples/l3fwd/l3fwd_lpm_sse.h | 66 ---------------------------------
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm *ipv4_l3fwd_lookup_struct =
+		(struct rte_lpm *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+		&next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+		(struct rte_lpm6 *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+			&next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	struct ipv6_hdr *ipv6_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+					     qconf->ipv4_lookup_struct);
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					     qconf->ipv6_lookup_struct);
+	}
+
+	return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint8_t portid)
+{
+	uint32_t next_hop;
+	struct ipv6_hdr *ipv6_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+						   dst_ipv4, &next_hop) == 0)
+				   ? next_hop : portid);
+
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+				ipv6_hdr->dst_addr, &next_hop) == 0)
+				? next_hop : portid);
+
+	}
+
+	return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm *ipv4_l3fwd_lookup_struct =
-		(struct rte_lpm *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-		&next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-		(struct rte_lpm6 *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-			&next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((always_inline)) void
 l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint8_t portid,
 		struct lcore_conf *qconf)
 {
 	struct ether_hdr *eth_hdr;
 	struct ipv4_hdr *ipv4_hdr;
-	uint8_t dst_port;
+	uint16_t dst_port;
 
 	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index aa06b6d..4a9b7ed 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -36,72 +36,6 @@
 
 #include "l3fwd_sse.h"
 
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) (
-			(rte_lpm_lookup(qconf->ipv4_lookup_struct,
-					rte_be_to_cpu_32(ipv4_hdr->dst_addr),
-					&next_hop) == 0) ?
-						next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-}
-
-/*
- * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
- * precalculated. If packet is ipv6 dst_addr is taken directly from packet
- * header and dst_ipv4 value is not used.
- */
-static inline __attribute__((always_inline)) uint16_t
-lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-	uint32_t dst_ipv4, uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
-			&next_hop) == 0) ? next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-
-}
-
 /*
  * Read packet_type and destination IPV4 addresses from 4 mbufs.
  */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (3 preceding siblings ...)
  2017-05-15  3:34   ` [PATCH v4 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  4:01     ` Jerin Jacob
  2017-05-15  3:34   ` [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
                     ` (3 subsequent siblings)
  8 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Implement vcopyq_laneq_u32 if gcc version is lower than 7.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 lib/librte_eal/common/include/arch/arm/rte_vect.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..d9fb4d0 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -78,6 +78,15 @@
 }
 #endif
 
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
+static inline uint32x4_t
+vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
+		 uint32x4_t b, const int lane_b)
+{
+	return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (4 preceding siblings ...)
  2017-05-15  3:34   ` [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  5:22     ` Sekhar, Ashwin
  2017-05-15  3:34   ` [PATCH v4 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
                     ` (2 subsequent siblings)
  8 siblings, 1 reply; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c            |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h        |  17 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
 examples/l3fwd/l3fwd_em_sequential.h |  18 ++-
 examples/l3fwd/l3fwd_lpm.c           |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h      | 193 ++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h          | 259 +++++++++++++++++++++++++++++++++++
 7 files changed, 563 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..b9163e3 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j, pos;
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
@@ -247,6 +252,11 @@ static inline __attribute__((always_inline)) uint16_t
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+	for (j = 0; j < 8 && j < nb_rx; j++) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
 	for (j = 0; j < n; j += 8) {
 
 		uint32_t pkt_type =
@@ -263,6 +273,11 @@ static inline __attribute__((always_inline)) uint16_t
 		uint32_t tcp_or_udp = pkt_type &
 			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
 			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv4_hdr, time_to_live)));
+
+	key->xmm = vandq_s32(tmpdata0, mask0);
+}
+
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len)));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 8));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 16));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index c0a9725..2b3ec16 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -43,7 +43,11 @@
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
@@ -101,11 +105,21 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j;
 	uint16_t dst_port[MAX_PKT_BURST];
 
-	for (j = 0; j < nb_rx; j++)
+	if (nb_rx > 0) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (i = 1, j = 0; j < nb_rx; i++, j++) {
+		if (i < nb_rx) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						       struct ether_hdr *) + 1);
+		}
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+	}
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index fc554fc..ddef250 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@ static inline __attribute__((always_inline)) uint16_t
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@ static inline __attribute__((always_inline)) uint16_t
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..baedbfe
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,193 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t i = 0, j = 0;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip;
+	uint32_t ipv4_flag;
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+
+	if (k) {
+		for (i = 0; i < FWDSTEP; i++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						struct ether_hdr *) + 1);
+		}
+
+		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
+			for (i = 0; i < FWDSTEP; i++) {
+				rte_prefetch0(rte_pktmbuf_mtod(
+						pkts_burst[j + i + FWDSTEP],
+						struct ether_hdr *) + 1);
+			}
+
+			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+			processx4_step2(qconf, dip, ipv4_flag, portid,
+					&pkts_burst[j], &dst_port[j]);
+		}
+
+		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
+				&dst_port[j]);
+
+		j += FWDSTEP;
+	}
+
+	if (m) {
+		/* Prefetch last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 2:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 1:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+		}
+
+		j -= m;
+		/* Classify last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 2:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 1:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+		}
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..fe351db
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into a prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static inline __attribute__((always_inline)) void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (5 preceding siblings ...)
  2017-05-15  3:34   ` [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-05-15  3:34   ` [PATCH v4 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
  2017-07-03 21:02   ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Add a new macro to define how many hash lookups are performed in one
batch; this makes the code more concise.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +++++++++++++-----------------------------
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index b9163e3..098b396 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-
-	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv4_l3fwd_out_if[ret[i]]);
 
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv6xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv6_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv6_5tuple(m[i], mask1.x, mask2.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
 
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv6_l3fwd_out_if[ret[i]]);
+
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static inline __attribute__((always_inline)) uint16_t
@@ -247,64 +164,48 @@ static inline __attribute__((always_inline)) uint16_t
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
+	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
+	 * in groups of EM_HASH_LOOKUP_COUNT.
 	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < 8 && j < nb_rx; j++) {
+	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
 		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
 					       struct ether_hdr *) + 1);
 	}
 
-	for (j = 0; j < n; j += 8) {
+	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
+
+		uint32_t pkt_type = RTE_PTYPE_L3_MASK |
+				    RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
+		uint32_t l3_type, tcp_or_udp;
 
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+			pkt_type &= pkts_burst[j + i]->packet_type;
 
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
-		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
+		     i < EM_HASH_LOOKUP_COUNT && pos < nb_rx; i++, pos++) {
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
 						       struct ether_hdr *) + 1);
 		}
 
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv4xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
 
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv6xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
-							portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
-							portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
-							portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
-							portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
-							portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
-							portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
-							portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
-							portid);
+			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+				dst_port[j + i] = em_get_dst_port(qconf,
+						pkts_burst[j + i], portid);
 		}
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v4 8/8] examples/l3fwd: change the guard macro name for header file
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (6 preceding siblings ...)
  2017-05-15  3:34   ` [PATCH v4 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
@ 2017-05-15  3:34   ` Jianbo Liu
  2017-07-03 21:02   ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-05-15  3:34 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index 2b3ec16..c7d477d 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -123,4 +123,4 @@ static inline __attribute__((always_inline)) uint16_t
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc
  2017-05-15  3:34   ` [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
@ 2017-05-15  4:01     ` Jerin Jacob
  0 siblings, 0 replies; 62+ messages in thread
From: Jerin Jacob @ 2017-05-15  4:01 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, tomasz.kantecki, ashwin.sekhar

-----Original Message-----
> Date: Mon, 15 May 2017 11:34:53 +0800
> From: Jianbo Liu <jianbo.liu@linaro.org>
> To: dev@dpdk.org, tomasz.kantecki@intel.com,
>  jerin.jacob@caviumnetworks.com, ashwin.sekhar@caviumnetworks.com
> CC: Jianbo Liu <jianbo.liu@linaro.org>
> Subject: [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of
>  gcc
> X-Mailer: git-send-email 1.8.3.1
> 
> Implement vcopyq_laneq_u32 if gcc version is lower than 7.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

> ---
>  lib/librte_eal/common/include/arch/arm/rte_vect.h | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> index 4107c99..d9fb4d0 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> @@ -78,6 +78,15 @@
>  }
>  #endif
>  
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
> +static inline uint32x4_t
> +vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
> +		 uint32x4_t b, const int lane_b)
> +{
> +	return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
> +}
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd
  2017-05-15  3:34   ` [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-05-15  5:22     ` Sekhar, Ashwin
  0 siblings, 0 replies; 62+ messages in thread
From: Sekhar, Ashwin @ 2017-05-15  5:22 UTC (permalink / raw)
  To: tomasz.kantecki, Jacob,  Jerin, jianbo.liu, dev

On Mon, 2017-05-15 at 11:34 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 forwarding.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
Acked-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> ---
>  examples/l3fwd/l3fwd_em.c            |   4 +-
>  examples/l3fwd/l3fwd_em_hlm.h        |  17 ++-
>  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
>  examples/l3fwd/l3fwd_em_sequential.h |  18 ++-
>  examples/l3fwd/l3fwd_lpm.c           |   4 +-
>  examples/l3fwd/l3fwd_lpm_neon.h      | 193
> ++++++++++++++++++++++++++
>  examples/l3fwd/l3fwd_neon.h          | 259
> +++++++++++++++++++++++++++++++++++
>  7 files changed, 563 insertions(+), 6 deletions(-)
>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_neon.h
> 
> diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
> index ba844b2..da96cfd 100644
> --- a/examples/l3fwd/l3fwd_em.c
> +++ b/examples/l3fwd/l3fwd_em.c
> @@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
>  	return (uint8_t)((ret < 0) ? portid :
> ipv6_l3fwd_out_if[ret]);
>  }
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  #if defined(NO_HASH_MULTI_LOOKUP)
>  #include "l3fwd_em_sequential.h"
>  #else
> @@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
>  			if (nb_rx == 0)
>  				continue;
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  			l3fwd_em_send_packets(nb_rx, pkts_burst,
>  							portid,
> qconf);
>  #else
> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
> b/examples/l3fwd/l3fwd_em_hlm.h
> index 636dea4..b9163e3 100644
> --- a/examples/l3fwd/l3fwd_em_hlm.h
> +++ b/examples/l3fwd/l3fwd_em_hlm.h
> @@ -35,8 +35,13 @@
>  #ifndef __L3FWD_EM_HLM_H__
>  #define __L3FWD_EM_HLM_H__
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
>  #include "l3fwd_em_hlm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#include "l3fwd_em_hlm_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) void
>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
> *m[8],
> @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))
> uint16_t
>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>  		uint8_t portid, struct lcore_conf *qconf)
>  {
> -	int32_t j;
> +	int32_t i, j, pos;
>  	uint16_t dst_port[MAX_PKT_BURST];
>  
>  	/*
> @@ -247,6 +252,11 @@ static inline __attribute__((always_inline))
> uint16_t
>  	 */
>  	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
>  
> +	for (j = 0; j < 8 && j < nb_rx; j++) {
> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
> +					       struct ether_hdr *) +
> 1);
> +	}
> +
>  	for (j = 0; j < n; j += 8) {
>  
>  		uint32_t pkt_type =
> @@ -263,6 +273,11 @@ static inline __attribute__((always_inline))
> uint16_t
>  		uint32_t tcp_or_udp = pkt_type &
>  			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
>  
> +		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++,
> pos++) {
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
> s],
> +						       struct
> ether_hdr *) + 1);
> +		}
> +
>  		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
>  
>  			em_get_dst_port_ipv4x8(qconf,
> &pkts_burst[j], portid,
> diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h
> b/examples/l3fwd/l3fwd_em_hlm_neon.h
> new file mode 100644
> index 0000000..dae1acf
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
> @@ -0,0 +1,74 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef __L3FWD_EM_HLM_NEON_H__
> +#define __L3FWD_EM_HLM_NEON_H__
> +
> +#include <arm_neon.h>
> +
> +static inline void
> +get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
> +		union ipv4_5tuple_host *key)
> +{
> +	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0,
> int32_t *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv4_hdr,
> time_to_live)));
> +
> +	key->xmm = vandq_s32(tmpdata0, mask0);
> +}
> +
> +static inline void
> +get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
> +		int32x4_t mask1, union ipv6_5tuple_host *key)
> +{
> +	int32x4_t tmpdata0 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len)));
> +
> +	int32x4_t tmpdata1 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len) + 8));
> +
> +	int32x4_t tmpdata2 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len) + 16));
> +
> +	key->xmm[0] = vandq_s32(tmpdata0, mask0);
> +	key->xmm[1] = tmpdata1;
> +	key->xmm[2] = vandq_s32(tmpdata2, mask1);
> +}
> +#endif /* __L3FWD_EM_HLM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_em_sequential.h
> b/examples/l3fwd/l3fwd_em_sequential.h
> index c0a9725..2b3ec16 100644
> --- a/examples/l3fwd/l3fwd_em_sequential.h
> +++ b/examples/l3fwd/l3fwd_em_sequential.h
> @@ -43,7 +43,11 @@
>   * compilation time.
>   */
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) uint16_t
>  em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf
> *pkt,
> @@ -101,11 +105,21 @@ static inline __attribute__((always_inline))
> uint16_t
>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>  			uint8_t portid, struct lcore_conf *qconf)
>  {
> -	int32_t j;
> +	int32_t i, j;
>  	uint16_t dst_port[MAX_PKT_BURST];
>  
> -	for (j = 0; j < nb_rx; j++)
> +	if (nb_rx > 0) {
> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
> +					       struct ether_hdr *) +
> 1);
> +	}
> +
> +	for (i = 1, j = 0; j < nb_rx; i++, j++) {
> +		if (i < nb_rx) {
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i]
> ,
> +						       struct
> ether_hdr *) + 1);
> +		}
>  		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j],
> portid);
> +	}
>  
>  	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
>  }
> diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
> index fc554fc..ddef250 100644
> --- a/examples/l3fwd/l3fwd_lpm.c
> +++ b/examples/l3fwd/l3fwd_lpm.c
> @@ -189,6 +189,8 @@ static inline __attribute__((always_inline))
> uint16_t
>  
>  #if defined(__SSE4_1__)
>  #include "l3fwd_lpm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_lpm_neon.h"
>  #else
>  #include "l3fwd_lpm.h"
>  #endif
> @@ -261,7 +263,7 @@ static inline __attribute__((always_inline))
> uint16_t
>  			if (nb_rx == 0)
>  				continue;
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
>  						portid, qconf);
>  #else
> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
> b/examples/l3fwd/l3fwd_lpm_neon.h
> new file mode 100644
> index 0000000..baedbfe
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_lpm_neon.h
> @@ -0,0 +1,193 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef __L3FWD_LPM_NEON_H__
> +#define __L3FWD_LPM_NEON_H__
> +
> +#include <arm_neon.h>
> +
> +#include "l3fwd_neon.h"
> +
> +/*
> + * Read packet_type and destination IPV4 addresses from 4 mbufs.
> + */
> +static inline void
> +processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
> +		int32x4_t *dip,
> +		uint32_t *ipv4_flag)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ether_hdr *eth_hdr;
> +	int32_t dst[FWDSTEP];
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[0] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[1] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[1]->packet_type;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[2] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[2]->packet_type;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[3] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[3]->packet_type;
> +
> +	dip[0] = vld1q_s32(dst);
> +}
> +
> +/*
> + * Lookup into LPM for destination port.
> + * If lookup fails, use incoming port (portid) as destination port.
> + */
> +static inline void
> +processx4_step2(const struct lcore_conf *qconf,
> +		int32x4_t dip,
> +		uint32_t ipv4_flag,
> +		uint8_t portid,
> +		struct rte_mbuf *pkt[FWDSTEP],
> +		uint16_t dprt[FWDSTEP])
> +{
> +	rte_xmm_t dst;
> +
> +	dip =
> vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
> +
> +	/* if all 4 packets are IPV4. */
> +	if (likely(ipv4_flag)) {
> +		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip,
> dst.u32,
> +			portid);
> +		/* get rid of unused upper 16 bit for each dport. */
> +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> +	} else {
> +		dst.x = dip;
> +		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
> +						     dst.u32[0],
> portid);
> +		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
> +						     dst.u32[1],
> portid);
> +		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
> +						     dst.u32[2],
> portid);
> +		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
> +						     dst.u32[3],
> portid);
> +	}
> +}
> +
> +/*
> + * Buffer optimized handling of packets, invoked
> + * from main_loop.
> + */
> +static inline void
> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> +			uint8_t portid, struct lcore_conf *qconf)
> +{
> +	int32_t i = 0, j = 0;
> +	uint16_t dst_port[MAX_PKT_BURST];
> +	int32x4_t dip;
> +	uint32_t ipv4_flag;
> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +	const int32_t m = nb_rx % FWDSTEP;
> +
> +	if (k) {
> +		for (i = 0; i < FWDSTEP; i++) {
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i]
> ,
> +						struct ether_hdr *)
> + 1);
> +		}
> +
> +		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
> +			for (i = 0; i < FWDSTEP; i++) {
> +				rte_prefetch0(rte_pktmbuf_mtod(
> +						pkts_burst[j + i +
> FWDSTEP],
> +						struct ether_hdr *)
> + 1);
> +			}
> +
> +			processx4_step1(&pkts_burst[j], &dip,
> &ipv4_flag);
> +			processx4_step2(qconf, dip, ipv4_flag,
> portid,
> +					&pkts_burst[j],
> &dst_port[j]);
> +		}
> +
> +		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
> +		processx4_step2(qconf, dip, ipv4_flag, portid,
> &pkts_burst[j],
> +				&dst_port[j]);
> +
> +		j += FWDSTEP;
> +	}
> +
> +	if (m) {
> +		/* Prefetch last up to 3 packets one by one */
> +		switch (m) {
> +		case 3:
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j]
> ,
> +						struct ether_hdr *)
> + 1);
> +			j++;
> +			/* fallthrough */
> +		case 2:
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j]
> ,
> +						struct ether_hdr *)
> + 1);
> +			j++;
> +			/* fallthrough */
> +		case 1:
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j]
> ,
> +						struct ether_hdr *)
> + 1);
> +			j++;
> +		}
> +
> +		j -= m;
> +		/* Classify last up to 3 packets one by one */
> +		switch (m) {
> +		case 3:
> +			dst_port[j] = lpm_get_dst_port(qconf,
> pkts_burst[j],
> +						       portid);
> +			j++;
> +			/* fallthrough */
> +		case 2:
> +			dst_port[j] = lpm_get_dst_port(qconf,
> pkts_burst[j],
> +						       portid);
> +			j++;
> +			/* fallthrough */
> +		case 1:
> +			dst_port[j] = lpm_get_dst_port(qconf,
> pkts_burst[j],
> +						       portid);
> +		}
> +	}
> +
> +	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> +}
> +
> +#endif /* __L3FWD_LPM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_neon.h
> b/examples/l3fwd/l3fwd_neon.h
> new file mode 100644
> index 0000000..fe351db
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_neon.h
> @@ -0,0 +1,259 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +
> +#ifndef _L3FWD_NEON_H_
> +#define _L3FWD_NEON_H_
> +
> +#include "l3fwd.h"
> +#include "l3fwd_common.h"
> +
> +/*
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t
> dst_port[FWDSTEP])
> +{
> +	uint32x4_t te[FWDSTEP];
> +	uint32x4_t ve[FWDSTEP];
> +	uint32_t *p[FWDSTEP];
> +
> +	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
> +	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
> +	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
> +	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
> +
> +	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +	te[0] = vld1q_u32(p[0]);
> +
> +	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
> +	te[1] = vld1q_u32(p[1]);
> +
> +	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
> +	te[2] = vld1q_u32(p[2]);
> +
> +	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
> +	te[3] = vld1q_u32(p[3]);
> +
> +	/* Update last 4 bytes */
> +	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
> +	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
> +	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
> +	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
> +
> +	vst1q_u32(p[0], ve[0]);
> +	vst1q_u32(p[1], ve[1]);
> +	vst1q_u32(p[2], ve[2]);
> +	vst1q_u32(p[3], ve[3]);
> +
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0]
> + 1),
> +		&dst_port[0], pkt[0]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1]
> + 1),
> +		&dst_port[1], pkt[1]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2]
> + 1),
> +		&dst_port[2], pkt[2]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3]
> + 1),
> +		&dst_port[3], pkt[3]->packet_type);
> +}
> +
> +/*
> + * Group consecutive packets with the same destination port in
> bursts of 4.
> + * Suppose we have an array of destination ports:
> + * dst_port[] = {a, b, c, d, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We are doing 4 comparisons at once and the result is a 4-bit mask.
> + * This mask is used as an index into a prebuilt array of pnum values.
> + */
> +static inline uint16_t *
> +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> +	     uint16x8_t dp2)
> +{
> +	union {
> +		uint16_t u16[FWDSTEP + 1];
> +		uint64_t u64;
> +	} *pnum = (void *)pn;
> +
> +	int32_t v;
> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> +
> +	dp1 = vceqq_u16(dp1, dp2);
> +	dp1 = vandq_u16(dp1, mask);
> +	v = vaddvq_u16(dp1);
> +
> +	/* update last port counter. */
> +	lp[0] += gptbl[v].lpv;
> +
> +	/* if dest port value has changed. */
> +	if (v != GRPMSK) {
> +		pnum->u64 = gptbl[v].pnum;
> +		pnum->u16[FWDSTEP] = 1;
> +		lp = pnum->u16 + gptbl[v].idx;
> +	}
> +
> +	return lp;
> +}
> +
> +/**
> + * Process one packet:
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> +{
> +	struct ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +
> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> +			pkt->packet_type);
> +
> +	ve = vcopyq_laneq_u32(ve, 3, te, 3);
> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +}
> +
> +/**
> + * Send packets burst from pkts_burst to the ports in dst_port array
> + */
> +static inline __attribute__((always_inline)) void
> +send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf
> **pkts_burst,
> +		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
> +{
> +	int32_t k;
> +	int j = 0;
> +	uint16_t dlp;
> +	uint16_t *lp;
> +	uint16_t pnum[MAX_PKT_BURST + 1];
> +
> +	/*
> +	 * Finish packet processing and group consecutive
> +	 * packets with the same destination port.
> +	 */
> +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +	if (k != 0) {
> +		uint16x8_t dp1, dp2;
> +
> +		lp = pnum;
> +		lp[0] = 1;
> +
> +		processx4_step3(pkts_burst, dst_port);
> +
> +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> +		dp1 = vld1q_u16(dst_port);
> +
> +		for (j = FWDSTEP; j != k; j += FWDSTEP) {
> +			processx4_step3(&pkts_burst[j],
> &dst_port[j]);
> +
> +			/*
> +			 * dp2:
> +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> +			 */
> +			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> +			lp  = port_groupx4(&pnum[j - FWDSTEP], lp,
> dp1, dp2);
> +
> +			/*
> +			 * dp1:
> +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> +			 */
> +			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
> +		}
> +
> +		/*
> +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> +		 */
> +		dp2 = vextq_u16(dp1, dp1, 1);
> +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2,
> 3);
> +		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> dp2);
> +
> +		/*
> +		 * remove values added by the last repeated
> +		 * dst port.
> +		 */
> +		lp[0]--;
> +		dlp = dst_port[j - 1];
> +	} else {
> +		/* set dlp and lp to the never used values. */
> +		dlp = BAD_PORT - 1;
> +		lp = pnum + MAX_PKT_BURST;
> +	}
> +
> +	/* Process up to last 3 packets one by one. */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +		/* fallthrough */
> +	case 2:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +		/* fallthrough */
> +	case 1:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +	}
> +
> +	/*
> +	 * Send packets out, through destination port.
> +	 * Consecutive packets with the same destination port
> +	 * are already grouped together.
> +	 * If destination port for the packet equals BAD_PORT,
> +	 * then free the packet without sending it out.
> +	 */
> +	for (j = 0; j < nb_rx; j += k) {
> +
> +		int32_t m;
> +		uint16_t pn;
> +
> +		pn = dst_port[j];
> +		k = pnum[j];
> +
> +		if (likely(pn != BAD_PORT))
> +			send_packetsx4(qconf, pn, pkts_burst + j,
> k);
> +		else
> +			for (m = j; m != j + k; m++)
> +				rte_pktmbuf_free(pkts_burst[m]);
> +
> +	}
> +}
> +
> +#endif /* _L3FWD_NEON_H_ */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
                     ` (7 preceding siblings ...)
  2017-05-15  3:34   ` [PATCH v4 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
@ 2017-07-03 21:02   ` Thomas Monjalon
  8 siblings, 0 replies; 62+ messages in thread
From: Thomas Monjalon @ 2017-07-03 21:02 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar

> Jianbo Liu (8):
>   examples/l3fwd: extract arch independent code from multi hash lookup
>   examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
>   examples/l3fwd: extract common code from multi packet send
>   examples/l3fwd: rearrange the code for lpm_l3fwd
>   arch/arm: add vcopyq_laneq_u32 for old version of gcc
>   examples/l3fwd: add neon support for l3fwd
>   examples/l3fwd: add the times of hash multi-lookup for different Archs
>   examples/l3fwd: change the guard macro name for header file

Please, may I ask you to rebase this series?
Thanks

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform
  2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                   ` (6 preceding siblings ...)
  2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
@ 2017-07-04 10:23 ` Jianbo Liu
  2017-07-04 10:23   ` [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
                     ` (8 more replies)
  7 siblings, 9 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:23 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

v5:
  - rebase to master
  Please apply after "move gcc version definition to	common header"
     http://www.dpdk.org/ml/archives/dev/2017-July/070031.html

v4:
  - add vcopyq_laneq_u32 for older version of gcc

v3:
  - remove unnecessary prefetch for rte_mbuf
  - fix typo in git log
  - Ashwin's suggestions for performance on ThunderX

v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_LOOKUP_MULTI ...

Jianbo Liu (8):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  arch/arm: add vcopyq_laneq_u32 for old version of gcc
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h                      | 293 +++++++++++++++++++++
 examples/l3fwd/l3fwd_em.c                          |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h                      | 218 +++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_neon.h                 |  74 ++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h                  | 276 +------------------
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}      |  24 +-
 examples/l3fwd/l3fwd_lpm.c                         |  87 +++++-
 examples/l3fwd/l3fwd_lpm.h                         |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h                    | 193 ++++++++++++++
 examples/l3fwd/l3fwd_lpm_sse.h                     |  66 -----
 examples/l3fwd/l3fwd_neon.h                        | 259 ++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h                         | 261 +-----------------
 lib/librte_eal/common/include/arch/arm/rte_vect.h  |   9 +
 13 files changed, 1165 insertions(+), 629 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (88%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
@ 2017-07-04 10:23   ` Jianbo Liu
  2017-07-04 10:23   ` [PATCH v5 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:23 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c         |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h     | 302 ++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 276 +---------------------------------
 3 files changed, 308 insertions(+), 272 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 0000000..9fb5ff6
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static __rte_always_inline void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv4_5tuple_host key[8];
+
+	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+				&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv4_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static __rte_always_inline void
+em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+		uint8_t portid, uint16_t dst_port[8])
+{
+	int32_t ret[8];
+	union ipv6_5tuple_host key[8];
+
+	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
+
+	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+			&key[4], &key[5], &key[6], &key[7]};
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
+
+	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[0]]);
+	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[1]]);
+	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[2]]);
+	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[3]]);
+	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[4]]);
+	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[5]]);
+	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[6]]);
+	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+			portid : ipv6_l3fwd_out_if[ret[7]]);
+
+	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[0]) == 0)
+		dst_port[0] = portid;
+
+	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[1]) == 0)
+		dst_port[1] = portid;
+
+	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[2]) == 0)
+		dst_port[2] = portid;
+
+	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[3]) == 0)
+		dst_port[3] = portid;
+
+	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[4]) == 0)
+		dst_port[4] = portid;
+
+	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[5]) == 0)
+		dst_port[5] = portid;
+
+	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[6]) == 0)
+		dst_port[6] = portid;
+
+	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port[7]) == 0)
+		dst_port[7] = portid;
+
+}
+
+static __rte_always_inline uint16_t
+em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	uint8_t next_hop;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	uint32_t tcp_or_udp;
+	uint32_t l3_ptypes;
+
+	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
+
+	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
+
+		/* Handle IPv4 headers.*/
+		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
+				qconf->ipv4_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
+
+		/* Handle IPv6 headers.*/
+		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
+				sizeof(struct ether_hdr));
+
+		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
+				qconf->ipv6_lookup_struct);
+
+		if (next_hop >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << next_hop) == 0)
+			next_hop = portid;
+
+		return next_hop;
+
+	}
+
+	return portid;
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+		uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+
+	/*
+	 * Send nb_rx - nb_rx%8 packets
+	 * in groups of 8.
+	 */
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+
+	for (j = 0; j < n; j += 8) {
+
+		uint32_t pkt_type =
+			pkts_burst[j]->packet_type &
+			pkts_burst[j+1]->packet_type &
+			pkts_burst[j+2]->packet_type &
+			pkts_burst[j+3]->packet_type &
+			pkts_burst[j+4]->packet_type &
+			pkts_burst[j+5]->packet_type &
+			pkts_burst[j+6]->packet_type &
+			pkts_burst[j+7]->packet_type;
+
+		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		uint32_t tcp_or_udp = pkt_type &
+			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+
+		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
+
+			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
+
+			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+					       &dst_port[j]);
+
+		} else {
+			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
+							portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
+							portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
+							portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
+							portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
+							portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
+							portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
+							portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
+							portid);
+		}
+	}
+
+	for (; j < nb_rx; j++)
+		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+
+}
+#endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d272f11..0dd44df 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -37,101 +37,15 @@
 #include "l3fwd_sse.h"
 
 static __rte_always_inline void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+get_ipv4_5tuple(struct rte_mbuf *m0, __m128i mask0,
+		union ipv4_5tuple_host *key)
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-	__m128i data[8];
-
-	data[0] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[0], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[1] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[1], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[2] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[2], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[3] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[3], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[4] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[4], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[5] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[5], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[6] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[6], __m128i *,
-				sizeof(struct ether_hdr) +
-				offsetof(struct ipv4_hdr, time_to_live)));
-	data[7] = _mm_loadu_si128(rte_pktmbuf_mtod_offset(m[7], __m128i *,
+	 __m128i tmpdata0 = _mm_loadu_si128(
+			rte_pktmbuf_mtod_offset(m0, __m128i *,
 				sizeof(struct ether_hdr) +
 				offsetof(struct ipv4_hdr, time_to_live)));
 
-	key[0].xmm = _mm_and_si128(data[0], mask0.x);
-	key[1].xmm = _mm_and_si128(data[1], mask0.x);
-	key[2].xmm = _mm_and_si128(data[2], mask0.x);
-	key[3].xmm = _mm_and_si128(data[3], mask0.x);
-	key[4].xmm = _mm_and_si128(data[4], mask0.x);
-	key[5].xmm = _mm_and_si128(data[5], mask0.x);
-	key[6].xmm = _mm_and_si128(data[6], mask0.x);
-	key[7].xmm = _mm_and_si128(data[7], mask0.x);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
+	key->xmm = _mm_and_si128(tmpdata0, mask0);
 }
 
 static inline void
@@ -159,184 +73,4 @@
 	key->xmm[1] = tmpdata1;
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
-
-static __rte_always_inline void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
-{
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
-
-}
-
-static __rte_always_inline uint16_t
-em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint8_t next_hop;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ipv6_hdr *ipv6_hdr;
-	uint32_t tcp_or_udp;
-	uint32_t l3_ptypes;
-
-	tcp_or_udp = pkt->packet_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-	l3_ptypes = pkt->packet_type & RTE_PTYPE_L3_MASK;
-
-	if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV4)) {
-
-		/* Handle IPv4 headers.*/
-		ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv4_dst_port(ipv4_hdr, portid,
-				qconf->ipv4_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	} else if (tcp_or_udp && (l3_ptypes == RTE_PTYPE_L3_IPV6)) {
-
-		/* Handle IPv6 headers.*/
-		ipv6_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *,
-				sizeof(struct ether_hdr));
-
-		next_hop = em_get_ipv6_dst_port(ipv6_hdr, portid,
-				qconf->ipv6_lookup_struct);
-
-		if (next_hop >= RTE_MAX_ETHPORTS ||
-				(enabled_port_mask & 1 << next_hop) == 0)
-			next_hop = portid;
-
-		return next_hop;
-
-	}
-
-	return portid;
-}
-
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
-static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint8_t portid, struct lcore_conf *qconf)
-{
-	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
-
-	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
-	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
-
-	for (j = 0; j < n; j += 8) {
-
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
-
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid, &dst_port[j]);
-
-		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
-		}
-	}
-
-	for (; j < nb_rx; j++)
-		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
-
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
-
-}
 #endif /* __L3FWD_EM_SSE_HLM_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
  2017-07-04 10:23   ` [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
@ 2017-07-04 10:23   ` Jianbo Liu
  2017-07-04 10:24   ` [PATCH v5 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:23 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Rename it because it is only for sequential hash lookup
and does not include any x86 SSE instructions.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c                                | 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 3/8] examples/l3fwd: extract common code from multi packet send
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
  2017-07-04 10:23   ` [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
  2017-07-04 10:23   ` [PATCH v5 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
@ 2017-07-04 10:24   ` Jianbo Liu
  2017-07-04 10:24   ` [PATCH v5 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:24 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_common.h | 293 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h    | 261 +------------------------------------
 2 files changed, 297 insertions(+), 257 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 0000000..2867365
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static __rte_always_inline void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+	uint8_t ihl;
+
+	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+			dp[0] = BAD_PORT;
+
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destionation port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {             \
+		(lp)[0]++;                                   \
+	} else {                                         \
+		(dlp) = (dcp)[idx];                          \
+		(lp) = (pn) + (idx);                         \
+		(lp)[0] = 1;                                 \
+	}                                                \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+static __rte_always_inline void
+send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
+		uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 3:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 2:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 1:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	qconf->tx_mbufs[port].len = len;
+}
+
+#endif /* _L3FWD_COMMON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 80e2cd1..831760f 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -32,53 +32,11 @@
  */
 
 
-#ifndef _L3FWD_COMMON_H_
-#define _L3FWD_COMMON_H_
+#ifndef _L3FWD_SSE_H_
+#define _L3FWD_SSE_H_
 
 #include "l3fwd.h"
-
-#ifdef DO_RFC_1812_CHECKS
-
-#define	IPV4_MIN_VER_IHL	0x45
-#define	IPV4_MAX_VER_IHL	0x4f
-#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
-
-/* Minimum value of IPV4 total length (20B) in network byte order. */
-#define	IPV4_MIN_LEN_BE	(sizeof(struct ipv4_hdr) << 8)
-
-/*
- * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
- * - The IP version number must be 4.
- * - The IP header length field must be large enough to hold the
- *    minimum length legal IP datagram (20 bytes = 5 words).
- * - The IP total length field must be large enough to hold the IP
- *   datagram header, whose length is specified in the IP header length
- *   field.
- * If we encounter invalid IPV4 packet, then set destination port for it
- * to BAD_PORT value.
- */
-static __rte_always_inline void
-rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
-{
-	uint8_t ihl;
-
-	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
-		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
-
-		ipv4_hdr->time_to_live--;
-		ipv4_hdr->hdr_checksum++;
-
-		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
-				((uint8_t)ipv4_hdr->total_length == 0 &&
-				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
-			dp[0] = BAD_PORT;
-
-	}
-}
-
-#else
-#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
-#endif /* DO_RFC_1812_CHECKS */
+#include "l3fwd_common.h"
 
 /*
  * Update source and destination MAC addresses in the ethernet header.
@@ -130,30 +88,6 @@
 }
 
 /*
- * We group consecutive packets with the same destionation port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-/*
  * Group consecutive packets with the same destination port in bursts of 4.
  * Suppose we have array of destionation ports:
  * dst_port[] = {a, b, c, d,, e, ... }
@@ -164,109 +98,6 @@
 static inline uint16_t *
 port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
 {
-	static const struct {
-		uint64_t pnum; /* prebuild 4 values for pnum[]. */
-		int32_t  idx;  /* index for new last updated elemnet. */
-		uint16_t lpv;  /* add value to the last updated element. */
-	} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-	};
-
 	union {
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
@@ -314,90 +145,6 @@
 	_mm_storeu_si128((__m128i *)eth_hdr, te);
 }
 
-static __rte_always_inline void
-send_packetsx4(struct lcore_conf *qconf, uint8_t port, struct rte_mbuf *m[],
-		uint32_t num)
-{
-	uint32_t len, j, n;
-
-	len = qconf->tx_mbufs[port].len;
-
-	/*
-	 * If TX buffer for that queue is empty, and we have enough packets,
-	 * then send them straightway.
-	 */
-	if (num >= MAX_TX_BURST && len == 0) {
-		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
-		if (unlikely(n < num)) {
-			do {
-				rte_pktmbuf_free(m[n]);
-			} while (++n < num);
-		}
-		return;
-	}
-
-	/*
-	 * Put packets into TX buffer for that queue.
-	 */
-
-	n = len + num;
-	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
-
-	j = 0;
-	switch (n % FWDSTEP) {
-	while (j < n) {
-	case 0:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-		/* fall-through */
-	case 3:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-		/* fall-through */
-	case 2:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-		/* fall-through */
-	case 1:
-		qconf->tx_mbufs[port].m_table[len + j] = m[j];
-		j++;
-	}
-	}
-
-	len += n;
-
-	/* enough pkts to be sent */
-	if (unlikely(len == MAX_PKT_BURST)) {
-
-		send_burst(qconf, MAX_PKT_BURST, port);
-
-		/* copy rest of the packets into the TX buffer. */
-		len = num - n;
-		j = 0;
-		switch (len % FWDSTEP) {
-		while (j < len) {
-		case 0:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-			/* fall-through */
-		case 3:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-			/* fall-through */
-		case 2:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-			/* fall-through */
-		case 1:
-			qconf->tx_mbufs[port].m_table[j] = m[n + j];
-			j++;
-		}
-		}
-	}
-
-	qconf->tx_mbufs[port].len = len;
-}
-
 /**
  * Send packets burst from pkts_burst to the ports in dst_port array
  */
@@ -506,4 +253,4 @@
 	}
 }
 
-#endif /* _L3FWD_COMMON_H_ */
+#endif /* _L3FWD_SSE_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
                     ` (2 preceding siblings ...)
  2017-07-04 10:24   ` [PATCH v5 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
@ 2017-07-04 10:24   ` Jianbo Liu
  2017-07-04 10:24   ` [PATCH v5 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:24 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Some common code can be shared by other architectures, so move it to l3fwd_lpm.c

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_lpm.c     | 83 ++++++++++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.h     | 26 +------------
 examples/l3fwd/l3fwd_lpm_sse.h | 66 ---------------------------------
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..ff8d10b 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm *ipv4_l3fwd_lookup_struct =
+		(struct rte_lpm *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+		&next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+	uint32_t next_hop;
+	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+		(struct rte_lpm6 *)lookup_struct;
+
+	return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+			&next_hop) == 0) ?  next_hop : portid);
+}
+
+static __rte_always_inline uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+		uint8_t portid)
+{
+	struct ipv6_hdr *ipv6_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+					     qconf->ipv4_lookup_struct);
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					     qconf->ipv6_lookup_struct);
+	}
+
+	return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static __rte_always_inline uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint8_t portid)
+{
+	uint32_t next_hop;
+	struct ipv6_hdr *ipv6_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+						   dst_ipv4, &next_hop) == 0)
+				   ? next_hop : portid);
+
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+				ipv6_hdr->dst_addr, &next_hop) == 0)
+				? next_hop : portid);
+
+	}
+
+	return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 4d77b58..55c3e83 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm *ipv4_l3fwd_lookup_struct =
-		(struct rte_lpm *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-		rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-		&next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-	uint32_t next_hop;
-	struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-		(struct rte_lpm6 *)lookup_struct;
-
-	return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-			((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-			&next_hop) == 0) ?  next_hop : portid);
-}
-
 static __rte_always_inline void
 l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint8_t portid,
 		struct lcore_conf *qconf)
 {
 	struct ether_hdr *eth_hdr;
 	struct ipv4_hdr *ipv4_hdr;
-	uint8_t dst_port;
+	uint16_t dst_port;
 
 	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index fa1b902..4e294c8 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -36,72 +36,6 @@
 
 #include "l3fwd_sse.h"
 
-static __rte_always_inline uint16_t
-lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-		uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ipv4_hdr *ipv4_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) (
-			(rte_lpm_lookup(qconf->ipv4_lookup_struct,
-					rte_be_to_cpu_32(ipv4_hdr->dst_addr),
-					&next_hop) == 0) ?
-						next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-}
-
-/*
- * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
- * precalculated. If packet is ipv6 dst_addr is taken directly from packet
- * header and dst_ipv4 value is not used.
- */
-static __rte_always_inline uint16_t
-lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
-	uint32_t dst_ipv4, uint8_t portid)
-{
-	uint32_t next_hop;
-	struct ipv6_hdr *ipv6_hdr;
-	struct ether_hdr *eth_hdr;
-
-	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
-		return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
-			&next_hop) == 0) ? next_hop : portid);
-
-	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
-
-		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
-		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
-
-		return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
-				ipv6_hdr->dst_addr, &next_hop) == 0)
-				? next_hop : portid);
-
-	}
-
-	return portid;
-
-}
-
 /*
  * Read packet_type and destination IPV4 addresses from 4 mbufs.
  */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
                     ` (3 preceding siblings ...)
  2017-07-04 10:24   ` [PATCH v5 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
@ 2017-07-04 10:24   ` Jianbo Liu
  2017-07-04 10:24   ` [PATCH v5 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:24 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Implement vcopyq_laneq_u32 if the GCC version is lower than 7.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 lib/librte_eal/common/include/arch/arm/rte_vect.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..d9fb4d0 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -78,6 +78,15 @@
 }
 #endif
 
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
+static inline uint32x4_t
+vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
+		 uint32x4_t b, const int lane_b)
+{
+	return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 6/8] examples/l3fwd: add neon support for l3fwd
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
                     ` (4 preceding siblings ...)
  2017-07-04 10:24   ` [PATCH v5 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
@ 2017-07-04 10:24   ` Jianbo Liu
  2017-07-04 10:24   ` [PATCH v5 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:24 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

Use ARM NEON intrinsics to accelerate L3 forwarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c            |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h        |  17 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
 examples/l3fwd/l3fwd_em_sequential.h |  18 ++-
 examples/l3fwd/l3fwd_lpm.c           |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h      | 193 ++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h          | 259 +++++++++++++++++++++++++++++++++++
 7 files changed, 563 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 9fb5ff6..aa3e561 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static __rte_always_inline void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j, pos;
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
@@ -247,6 +252,11 @@
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+	for (j = 0; j < 8 && j < nb_rx; j++) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
 	for (j = 0; j < n; j += 8) {
 
 		uint32_t pkt_type =
@@ -263,6 +273,11 @@
 		uint32_t tcp_or_udp = pkt_type &
 			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
 			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv4_hdr, time_to_live)));
+
+	key->xmm = vandq_s32(tmpdata0, mask0);
+}
+
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len)));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 8));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 16));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index 6c794b6..4baccf1 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -43,7 +43,11 @@
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static __rte_always_inline uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
@@ -101,11 +105,21 @@
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j;
 	uint16_t dst_port[MAX_PKT_BURST];
 
-	for (j = 0; j < nb_rx; j++)
+	if (nb_rx > 0) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (i = 1, j = 0; j < nb_rx; i++, j++) {
+		if (i < nb_rx) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						       struct ether_hdr *) + 1);
+		}
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+	}
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index ff8d10b..fe4f9e3 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@ struct ipv6_l3fwd_lpm_route {
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@ struct ipv6_l3fwd_lpm_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..baedbfe
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,193 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t i = 0, j = 0;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip;
+	uint32_t ipv4_flag;
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+
+	if (k) {
+		for (i = 0; i < FWDSTEP; i++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						struct ether_hdr *) + 1);
+		}
+
+		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
+			for (i = 0; i < FWDSTEP; i++) {
+				rte_prefetch0(rte_pktmbuf_mtod(
+						pkts_burst[j + i + FWDSTEP],
+						struct ether_hdr *) + 1);
+			}
+
+			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+			processx4_step2(qconf, dip, ipv4_flag, portid,
+					&pkts_burst[j], &dst_port[j]);
+		}
+
+		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
+				&dst_port[j]);
+
+		j += FWDSTEP;
+	}
+
+	if (m) {
+		/* Prefetch last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 2:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 1:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct ether_hdr *) + 1);
+			j++;
+		}
+
+		j -= m;
+		/* Classify last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 2:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 1:
+			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
+						       portid);
+		}
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..42d50d3
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We are doing 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into the prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
                     ` (5 preceding siblings ...)
  2017-07-04 10:24   ` [PATCH v5 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
@ 2017-07-04 10:24   ` Jianbo Liu
  2017-07-04 10:24   ` [PATCH v5 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
  2017-07-04 15:11   ` [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:24 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

New macro to define how many hash lookups are done in one batch, and
this makes the code more concise.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +++++++++++++-----------------------------
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index aa3e561..707c7fc 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static __rte_always_inline void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv4_5tuple_host key[8];
-
-	get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-	get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-	get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-	get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-	get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-	get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-	get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-	get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-				&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv4_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv4_l3fwd_out_if[ret[i]]);
 
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static __rte_always_inline void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv6xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+		uint8_t portid, uint16_t dst_port[])
 {
-	int32_t ret[8];
-	union ipv6_5tuple_host key[8];
-
-	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
-	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
-	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
-	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
-	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
-	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
-	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
-	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
-
-	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-			&key[4], &key[5], &key[6], &key[7]};
-
-	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0], 8, ret);
-
-	dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[0]]);
-	dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[1]]);
-	dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[2]]);
-	dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[3]]);
-	dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[4]]);
-	dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[5]]);
-	dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[6]]);
-	dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-			portid : ipv6_l3fwd_out_if[ret[7]]);
-
-	if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[0]) == 0)
-		dst_port[0] = portid;
-
-	if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[1]) == 0)
-		dst_port[1] = portid;
-
-	if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[2]) == 0)
-		dst_port[2] = portid;
-
-	if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[3]) == 0)
-		dst_port[3] = portid;
-
-	if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[4]) == 0)
-		dst_port[4] = portid;
-
-	if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[5]) == 0)
-		dst_port[5] = portid;
-
-	if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[6]) == 0)
-		dst_port[6] = portid;
-
-	if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-			(enabled_port_mask & 1 << dst_port[7]) == 0)
-		dst_port[7] = portid;
+	int i;
+	int32_t ret[EM_HASH_LOOKUP_COUNT];
+	union ipv6_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+	const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		get_ipv6_5tuple(m[i], mask1.x, mask2.x, &key[i]);
+		key_array[i] = &key[i];
+	}
+
+	rte_hash_lookup_bulk(qconf->ipv6_lookup_struct, &key_array[0],
+			     EM_HASH_LOOKUP_COUNT, ret);
 
+	for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+		dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+				portid : ipv6_l3fwd_out_if[ret[i]]);
+
+		if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port[i]) == 0)
+			dst_port[i] = portid;
+	}
 }
 
 static __rte_always_inline uint16_t
@@ -247,64 +164,48 @@
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
-	 * Send nb_rx - nb_rx%8 packets
-	 * in groups of 8.
+	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
+	 * in groups of EM_HASH_LOOKUP_COUNT.
 	 */
-	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
+	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < 8 && j < nb_rx; j++) {
+	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
 		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
 					       struct ether_hdr *) + 1);
 	}
 
-	for (j = 0; j < n; j += 8) {
+	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
+
+		uint32_t pkt_type = RTE_PTYPE_L3_MASK |
+				    RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
+		uint32_t l3_type, tcp_or_udp;
 
-		uint32_t pkt_type =
-			pkts_burst[j]->packet_type &
-			pkts_burst[j+1]->packet_type &
-			pkts_burst[j+2]->packet_type &
-			pkts_burst[j+3]->packet_type &
-			pkts_burst[j+4]->packet_type &
-			pkts_burst[j+5]->packet_type &
-			pkts_burst[j+6]->packet_type &
-			pkts_burst[j+7]->packet_type;
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+			pkt_type &= pkts_burst[j + i]->packet_type;
 
-		uint32_t l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		uint32_t tcp_or_udp = pkt_type &
-			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
+		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
+		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
-		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
+		     i < EM_HASH_LOOKUP_COUNT && pos < nb_rx; i++, pos++) {
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
 						       struct ether_hdr *) + 1);
 		}
 
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
-			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv4xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
 
-			em_get_dst_port_ipv6x8(qconf, &pkts_burst[j], portid,
+			em_get_dst_port_ipv6xN(qconf, &pkts_burst[j], portid,
 					       &dst_port[j]);
 
 		} else {
-			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j],
-							portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1],
-							portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2],
-							portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3],
-							portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4],
-							portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5],
-							portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6],
-							portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7],
-							portid);
+			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
+				dst_port[j + i] = em_get_dst_port(qconf,
+						pkts_burst[j + i], portid);
 		}
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH v5 8/8] examples/l3fwd: change the guard macro name for header file
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
                     ` (6 preceding siblings ...)
  2017-07-04 10:24   ` [PATCH v5 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
@ 2017-07-04 10:24   ` Jianbo Liu
  2017-07-04 15:11   ` [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon
  8 siblings, 0 replies; 62+ messages in thread
From: Jianbo Liu @ 2017-07-04 10:24 UTC (permalink / raw)
  To: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar; +Cc: Jianbo Liu

As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index 4baccf1..6b34733 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -123,4 +123,4 @@
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform
  2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
                     ` (7 preceding siblings ...)
  2017-07-04 10:24   ` [PATCH v5 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
@ 2017-07-04 15:11   ` Thomas Monjalon
  8 siblings, 0 replies; 62+ messages in thread
From: Thomas Monjalon @ 2017-07-04 15:11 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, tomasz.kantecki, jerin.jacob, ashwin.sekhar

> Jianbo Liu (8):
>   examples/l3fwd: extract arch independent code from multi hash lookup
>   examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
>   examples/l3fwd: extract common code from multi packet send
>   examples/l3fwd: rearrange the code for lpm_l3fwd
>   arch/arm: add vcopyq_laneq_u32 for old version of gcc
>   examples/l3fwd: add neon support for l3fwd
>   examples/l3fwd: add the times of hash multi-lookup for different Archs
>   examples/l3fwd: change the guard macro name for header file

Applied and rebased, thanks

^ permalink raw reply	[flat|nested] 62+ messages in thread

end of thread, other threads:[~2017-07-04 15:11 UTC | newest]

Thread overview: 62+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-02  7:14 ` [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h Jianbo Liu
2017-05-02  9:40   ` Sekhar, Ashwin
2017-05-02  7:14 ` [PATCH 3/5] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-02  7:14 ` [PATCH 4/5] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-02  7:14 ` [PATCH 5/5] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-02 11:20   ` Sekhar, Ashwin
2017-05-02 11:47   ` Sekhar, Ashwin
2017-05-03  5:24     ` Jianbo Liu
2017-05-04  8:42       ` Jianbo Liu
2017-05-05  4:24         ` Sekhar, Ashwin
2017-05-05  5:43           ` Jianbo Liu
2017-05-09  8:10             ` Sekhar, Ashwin
2017-05-10  2:39               ` Jianbo Liu
2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-10 15:00     ` Sekhar, Ashwin
2017-05-11  3:16       ` Jianbo Liu
2017-05-11  4:14         ` Sekhar, Ashwin
2017-05-11  4:27           ` Sekhar, Ashwin
2017-05-11  6:11             ` Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file Jianbo Liu
2017-05-10 11:57     ` Sekhar, Ashwin
2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-11  9:49     ` Sekhar, Ashwin
2017-05-11 10:01       ` Jianbo Liu
2017-05-11 10:27         ` Sekhar, Ashwin
2017-05-12  2:40           ` Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 7/7] examples/l3fwd: change the guard macro name for header file Jianbo Liu
2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
2017-05-15  4:01     ` Jerin Jacob
2017-05-15  3:34   ` [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-15  5:22     ` Sekhar, Ashwin
2017-05-15  3:34   ` [PATCH v4 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
2017-07-03 21:02   ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon
2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
2017-07-04 10:23   ` [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-07-04 10:23   ` [PATCH v5 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
2017-07-04 15:11   ` [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.