All of lore.kernel.org
 help / color / mirror / Atom feed
From: Rahul Bhansali <rbhansali@marvell.com>
To: <dev@dpdk.org>, Radu Nicolau <radu.nicolau@intel.com>,
	Akhil Goyal <gakhil@marvell.com>,
	Ruifeng Wang <ruifeng.wang@arm.com>
Cc: <jerinj@marvell.com>, Rahul Bhansali <rbhansali@marvell.com>
Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode
Date: Fri, 17 Jun 2022 13:12:41 +0530	[thread overview]
Message-ID: <20220617074241.3260496-2-rbhansali@marvell.com> (raw)
In-Reply-To: <20220617074241.3260496-1-rbhansali@marvell.com>

This adds the support of NEON based lpm lookup along with
multi packet processing for burst send in packets routing.

Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by upto ~8% and inbound performance increased by
upto ~6%.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: Removed Neon packet grouping function and used
the common one.

 examples/ipsec-secgw/Makefile         |   5 +-
 examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
 examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++++++++
 examples/ipsec-secgw/ipsec_neon.h     | 321 ++++++++++++++++++++++++++
 examples/ipsec-secgw/ipsec_worker.c   |   9 +
 5 files changed, 571 insertions(+), 2 deletions(-)
 create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
 create mode 100644 examples/ipsec-secgw/ipsec_neon.h

diff --git a/examples/ipsec-secgw/Makefile b/examples/ipsec-secgw/Makefile
index 89af54bd37..ffe232774d 100644
--- a/examples/ipsec-secgw/Makefile
+++ b/examples/ipsec-secgw/Makefile
@@ -36,6 +36,7 @@ shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
@@ -53,10 +54,10 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -Wno-address-of-packed-member

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 4d8a4a71b8..b650668305 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@
 #include "parser.h"
 #include "sad.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;

 #define MAX_JUMBO_PKT_LEN  9600
@@ -100,6 +104,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
 	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };

+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];

 #define CMD_LINE_OPT_CONFIG		"config"
@@ -568,9 +578,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
 			process_pkts_outbound(&qconf->outbound, &traffic);
 	}

+#if defined __ARM_NEON
+	/* Neon optimized packet routing */
+	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+			 qconf->outbound.ipv4_offloads, true);
+	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
 	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
 		    qconf->outbound.ipv4_offloads, true);
 	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }

 static inline void
@@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
 		return -EINVAL;

 	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+			    (struct rte_ether_addr *)(val_eth + port));
 	return 0;
 }

@@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 			portid, rte_strerror(-ret));

 	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+			    (struct rte_ether_addr *)(val_eth + portid));
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+			    (struct rte_ether_addr *)(val_eth + portid) + 1);
+
 	print_ethaddr("Address: ", &ethaddr);
 	printf("\n");

diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+		uint64_t *inline_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+							RTE_ETHER_HDR_LEN);
+		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		/* Fetch destination IPv4 address */
+		dst[i] = ipv4_hdr->dst_addr;
+		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+	}
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	uint32_t next_hop;
+	rte_xmm_t dst;
+	uint8_t i;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* If all 4 packets are non-inline */
+	if (!inline_flag) {
+		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+				 BAD_PORT);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+		return;
+	}
+
+	/* Inline and non-inline packets */
+	dst.x = dip;
+	for (i = 0; i < FWDSTEP; i++) {
+		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+			dprt[i] = (uint16_t) (((next_hop &
+						RTE_LPM_LOOKUP_SUCCESS) != 0)
+						? next_hop : BAD_PORT);
+
+		} else {
+			dprt[i] = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						 dst.u32[i], &next_hop) == 0)
+						? next_hop : BAD_PORT);
+		}
+	}
+}
+
+/*
+ * Process single packets for destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+		   uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint32_t next_hop;
+	uint32_t dst_ip;
+
+	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+		next_hop = get_hop_for_offload_pkt(pkt, 0);
+		*dst_port = (uint16_t) (((next_hop &
+					  RTE_LPM_LOOKUP_SUCCESS) != 0)
+					  ? next_hop : BAD_PORT);
+	} else {
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+		*dst_port = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						dst_ip, &next_hop) == 0)
+						? next_hop : BAD_PORT);
+	}
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+	uint8_t dst_ip6[MAX_PKT_BURST][16];
+	int32_t dst_port[MAX_PKT_BURST];
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	int32_t hop[MAX_PKT_BURST];
+	struct rte_mbuf *pkt;
+	uint8_t lpm_pkts = 0;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	/* Need to do an LPM lookup for non-inline packets. Inline packets will
+	 * have port ID in the SA
+	 */
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+		pkt->l2_len = RTE_ETHER_HDR_LEN;
+		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+			/* Security offload not enabled. So an LPM lookup is
+			 * required to get the hop
+			 */
+			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+			memcpy(&dst_ip6[lpm_pkts][0],
+					ipv6_hdr->dst_addr, 16);
+			lpm_pkts++;
+		}
+	}
+
+	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+				  hop, lpm_pkts);
+
+	lpm_pkts = 0;
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			/* Read hop from the SA */
+			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+		} else {
+			/* Need to use hop returned by lookup */
+			dst_port[i] = hop[lpm_pkts++];
+		}
+		if (dst_port[i] == -1)
+			dst_port[i] = BAD_PORT;
+	}
+
+	/* Send packets */
+	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+		 uint64_t tx_offloads, bool ip_cksum)
+{
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+	uint16_t dst_port[MAX_PKT_BURST];
+	uint64_t inline_flag = 0;
+	int32x4_t dip;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	for (i = 0; i != k; i += FWDSTEP) {
+		processx4_step1(&pkts[i], &dip, &inline_flag);
+		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+				&dst_port[i]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (m) {
+	case 3:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+	}
+
+	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..0f72219ed0
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+#include "neon_common.h"
+
+#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
+#define BAD_PORT	((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+	struct rte_mbuf *pkt;
+	uint8_t i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		pkt = pkts[i];
+
+		/* Check if it is a large packet */
+		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+			*l_pkt |= 1;
+
+		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		te[i] = vld1q_u32(p[i]);
+
+		/* Update last 4 bytes */
+		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+		vst1q_u32(p[i], ve[i]);
+
+		if (ip_cksum) {
+			struct rte_ipv4_hdr *ip;
+
+			pkt->ol_flags |= tx_offloads;
+
+			ip = (struct rte_ipv4_hdr *)
+				(p[i] + RTE_ETHER_HDR_LEN + 1);
+			ip->hdr_checksum = 0;
+
+			/* calculate IPv4 cksum in SW */
+			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+				ip->hdr_checksum = rte_ipv4_cksum(ip);
+		}
+
+	}
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+	       bool ip_cksum, uint8_t *l_pkt)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	/* Check if it is a large packet */
+	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+		*l_pkt |= 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+
+	if (ip_cksum) {
+		struct rte_ipv4_hdr *ip;
+
+		pkt->ol_flags |= tx_offloads;
+
+		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip->hdr_checksum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+			ip->hdr_checksum = rte_ipv4_cksum(ip);
+	}
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+	uint8_t proto;
+	uint32_t i;
+
+	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+	for (i = 0; i < num; i++)
+		send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	struct lcore_conf *qconf;
+	uint32_t len, j, n;
+
+	qconf = &lcore_conf[lcoreid];
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		core_stats_update_tx(n);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+		}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		if (len == 0)
+			goto exit;
+
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+			case 0:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 3:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 2:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 1:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+		}
+		}
+	}
+
+exit:
+	qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	uint16_t pnum[MAX_PKT_BURST + 1];
+	uint8_t l_pkt = 0;
+	uint16_t dlp, *lp;
+	int i = 0, k;
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (i = FWDSTEP; i != k; i += FWDSTEP) {
+			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+					ip_cksum, &l_pkt);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+			lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[i - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (i = 0; i < nb_rx; i += k) {
+
+		uint16_t pn;
+
+		pn = dst_port[i];
+		k = pnum[i];
+
+		if (likely(pn != BAD_PORT)) {
+			if (l_pkt)
+				/* Large packet is present, need to send
+				 * individual packets with fragment
+				 */
+				send_packets(pkts + i, pn, k, is_ipv4);
+			else
+				send_packetsx4(pkts + i, pn, k);
+
+		} else {
+			free_pkts(&pkts[i], k);
+			if (is_ipv4)
+				core_statistics[lcoreid].lpm4.miss++;
+			else
+				core_statistics[lcoreid].lpm6.miss++;
+		}
+	}
+}
+
+#endif /* _IPSEC_NEON_H_ */
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
 	struct rte_security_session *sess;
 	struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
 				v6_num = ip6.num;
 			}

+#if defined __ARM_NEON
+			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+			route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
 			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
 			route6_pkts(rt6_ctx, v6, v6_num);
+#endif
 		}
 	}
 }
--
2.25.1


  reply	other threads:[~2022-06-17  7:43 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-24  9:57 [PATCH] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-05-24 23:00 ` Konstantin Ananyev
2022-05-25 11:03   ` [EXT] " Rahul Bhansali
2022-05-27 11:44     ` Konstantin Ananyev
2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
2022-06-17  7:42   ` Rahul Bhansali [this message]
2022-06-17  7:51     ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-06-21 12:55     ` Akhil Goyal
2022-06-23  8:46     ` Zhang, Roy Fan
2022-06-23  9:37       ` Rahul Bhansali
2022-06-17  7:50   ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
2022-06-20 23:13     ` Konstantin Ananyev
2022-06-21 16:50       ` [EXT] " Rahul Bhansali
2022-06-22 23:25         ` Konstantin Ananyev
2022-06-20  7:49   ` [EXT] " Akhil Goyal
2022-06-20 10:45     ` Thomas Monjalon
2022-06-21 12:56     ` Akhil Goyal
2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
2022-06-23  9:38   ` [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-06-26 19:00   ` [PATCH v3 1/2] examples/l3fwd: common packet group functionality Konstantin Ananyev
2022-06-28  8:54     ` [EXT] " Akhil Goyal
2022-07-03 21:40   ` Thomas Monjalon
2022-07-04 12:49     ` [EXT] " Rahul Bhansali
2022-07-04 14:04       ` Thomas Monjalon
2022-07-04 14:48   ` Thomas Monjalon
2022-07-05 16:11     ` [EXT] " Rahul Bhansali

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220617074241.3260496-2-rbhansali@marvell.com \
    --to=rbhansali@marvell.com \
    --cc=dev@dpdk.org \
    --cc=gakhil@marvell.com \
    --cc=jerinj@marvell.com \
    --cc=radu.nicolau@intel.com \
    --cc=ruifeng.wang@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.