* [RFC] net/memif: add a fast path for Rx
@ 2022-04-12  9:32 Joyce Kong
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-09-15  6:58 ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Joyce Kong
  0 siblings, 2 replies; 26+ messages in thread
From: Joyce Kong @ 2022-04-12  9:32 UTC (permalink / raw)
  To: Jakub Grajciar; +Cc: dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Add
a fast memory copy path by removing this branch with mbuf
and memif buffer size defined at compile time. The removal
of the branch leads to performance uplift.
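
For illustration, a minimal stand-alone sketch of the difference (simplified
stand-in types rather than the driver's, functions named demo_*; see the diff
below for the real implementation): the original path copies in chunks bounded
by the remaining mbuf room, while the fast path, knowing the mbuf can hold any
descriptor, does one copy per memif descriptor.

#include <stdint.h>
#include <string.h>

struct demo_desc { uint8_t *buf; uint32_t len; };       /* stands in for memif_desc_t */
struct demo_mbuf { uint8_t *data; uint32_t data_len; }; /* stands in for struct rte_mbuf */

/* Original path: copy in chunks, re-checking the remaining mbuf room on
 * every iteration (the branch this patch removes). */
static void demo_copy_generic(struct demo_mbuf *m, uint32_t mbuf_size,
			      const struct demo_desc *d)
{
	uint32_t src_off = 0, dst_off = 0, src_len = d->len;

	while (src_len) {
		uint32_t dst_len = mbuf_size - dst_off;
		uint32_t cp_len;

		if (dst_len == 0)
			break;	/* the real driver chains a new mbuf here */
		cp_len = dst_len < src_len ? dst_len : src_len;
		memcpy(m->data + dst_off, d->buf + src_off, cp_len);
		src_off += cp_len;
		dst_off += cp_len;
		src_len -= cp_len;
	}
	m->data_len = dst_off;
}

/* Fast path: mbuf size >= memif buffer size is guaranteed, so one
 * descriptor maps to exactly one copy, with no per-chunk size check. */
static void demo_copy_fast(struct demo_mbuf *m, const struct demo_desc *d)
{
	memcpy(m->data, d->buf, d->len);
	m->data_len = d->len;
}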

When the mbuf size >= memif buffer size, Rx chooses the fast
memcpy path. Testing with 1p1q on an Ampere Altra AArch64 server
shows a 2.6% perf gain with non-zero-copy mode and a 1.36% perf
gain with zero-copy mode. Testing with 1p1q on a Cascade Lake
Xeon X86 server shows a 3.04% perf gain with non-zero-copy
mode and a 0.27% perf gain with zero-copy mode.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 587ad45576..f55776ca46 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		goto refill;
 	n_slots = last_slot - cur_slot;
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot1:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			cp_len = d0->length;
 
-		src_len = d0->length;
-		dst_off = 0;
-		src_off = 0;
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
+
+			cur_slot++;
+			n_slots--;
 
-				/* store pointer to tail */
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
 				if (unlikely(mbuf == NULL))
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		cur_slot++;
-		n_slots--;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +739,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
-- 
2.25.1



* [PATCH v1 0/2] add a fast path for memif Rx/Tx
  2022-04-12  9:32 [RFC] net/memif: add a fast path for Rx Joyce Kong
@ 2022-05-17 10:51 ` Joyce Kong
  2022-05-17 10:51   ` [PATCH v1 1/2] net/memif: add a Rx fast path Joyce Kong
                     ` (4 more replies)
  2022-09-15  6:58 ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Joyce Kong
  1 sibling, 5 replies; 26+ messages in thread
From: Joyce Kong @ 2022-05-17 10:51 UTC (permalink / raw)
  Cc: ruifeng.wang, dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Add
a fast memory copy path by removing this branch with mbuf
and memif buffer size defined at compile time. And for Tx
fast path, bulk free the mbufs which come from the same
mempool.

When mbuf == memif buffer size, both Rx/Tx would choose
the fast memcpy path. When mbuf < memif buffer size, the
Rx chooses previous memcpy path while Tx chooses fast
memcpy path. When mbuf > memif buffer size, the Rx chooses
fast memcpy path while Tx chooses previous memcpy path.
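
A rough sketch of that selection as two predicates (the names here are
illustrative, not the driver's; mbuf_size stands for the mbuf data room and
pkt_buffer_size for the memif buffer size):

#include <stdint.h>

/* Rx takes the fast path when any memif descriptor fits in one mbuf. */
static int rx_use_fast_path(uint16_t mbuf_size, uint16_t pkt_buffer_size)
{
	return mbuf_size >= pkt_buffer_size;
}

/* Tx takes the fast path when the whole burst shares one mempool and a
 * memif buffer can hold a full mbuf; the burst is then also freed in bulk. */
static int tx_use_fast_path(int same_mempool, uint16_t mbuf_size,
			    uint16_t pkt_buffer_size)
{
	return same_mempool && pkt_buffer_size >= mbuf_size;
}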

Test with 1p1q on Ampere Altra AArch64 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    16.95%    |     3.28%    |    13.29%
---------------------------------------------------------
   zc gain  |    19.43%    |     4.62%    |    18.14%
---------------------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    19.97%    |     2.35%    |    21.43%
---------------------------------------------------------
   zc gain  |    14.30%    |    -1.21%    |    11.98%
---------------------------------------------------------

Joyce Kong (2):
  net/memif: add a Rx fast path
  net/memif: add a Tx fast path

 drivers/net/memif/rte_eth_memif.c | 258 ++++++++++++++++++++----------
 1 file changed, 176 insertions(+), 82 deletions(-)

-- 
2.25.1



* [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
@ 2022-05-17 10:51   ` Joyce Kong
  2022-05-18 16:53     ` Ferruh Yigit
  2022-05-18 17:06     ` Ferruh Yigit
  2022-05-17 10:51   ` [PATCH v1 2/2] net/memif: add a Tx " Joyce Kong
                     ` (3 subsequent siblings)
  4 siblings, 2 replies; 26+ messages in thread
From: Joyce Kong @ 2022-05-17 10:51 UTC (permalink / raw)
  To: Jakub Grajciar; +Cc: ruifeng.wang, dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Add
a fast memory copy path by removing this branch with mbuf
and memif buffer size defined at compile time. The removal
of the branch leads to considerable performance uplift.

When the memif buffer size <= mbuf size, Rx chooses the fast memcpy
path; otherwise it chooses the original path.

Test with 1p1q on Ampere Altra AArch64 server,
--------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
--------------------------------------------
non-zc gain |     4.30%     |    -0.52%    |
--------------------------------------------
   zc gain  |     2.46%     |     0.70%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
-------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
-------------------------------------------
non-zc gain |     2.13%     |    -1.40%    |
-------------------------------------------
   zc gain  |     0.18%     |     0.48%    |
-------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 587ad45576..f55776ca46 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		goto refill;
 	n_slots = last_slot - cur_slot;
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot1:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			cp_len = d0->length;
 
-		src_len = d0->length;
-		dst_off = 0;
-		src_off = 0;
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
+
+			cur_slot++;
+			n_slots--;
 
-				/* store pointer to tail */
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
 				if (unlikely(mbuf == NULL))
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		cur_slot++;
-		n_slots--;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +739,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
-- 
2.25.1



* [PATCH v1 2/2] net/memif: add a Tx fast path
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-05-17 10:51   ` [PATCH v1 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-05-17 10:51   ` Joyce Kong
  2022-05-17 13:59   ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Morten Brørup
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-05-17 10:51 UTC (permalink / raw)
  To: Jakub Grajciar; +Cc: ruifeng.wang, dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. If all
mbufs come from the same mempool and the memif buffer size >=
mbuf size, add a fast Tx memory copy path without the comparison
branch and with bulk mbuf freeing; otherwise still run the
original Tx path.
The removal of the branch and the bulk free lead to considerable
performance uplift.
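
As a rough sketch of the gating check described above (illustrative names,
assuming nb_pkts >= 1; the actual change is in the diff below): the burst
qualifies for the fast path only if every mbuf shares one mempool and the
memif buffer can hold a full mbuf, and the whole burst can then be released
with rte_pktmbuf_free_bulk() after copying.

#include <rte_mbuf.h>

static int
demo_tx_can_use_fast_path(struct rte_mbuf **bufs, uint16_t nb_pkts,
			  uint16_t pkt_buffer_size)
{
	struct rte_mempool *mp = bufs[0]->pool;
	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) -
			     RTE_PKTMBUF_HEADROOM;
	uint16_t i;

	for (i = 1; i < nb_pkts; i++) {
		if (bufs[i]->pool != mp)
			return 0;	/* mixed mempools: keep the original path */
	}
	return pkt_buffer_size >= mbuf_size;
}

/* After the fast path has copied n_tx_pkts packets:
 *	rte_pktmbuf_free_bulk(bufs, n_tx_pkts);
 */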

Test with 1p1q on Ampere Altra AArch64 server,
--------------------------------------------
  buf size  | memif >= mbuf | memif < mbuf |
--------------------------------------------
non-zc gain |    13.35%     |    -0.77%    |
--------------------------------------------
   zc gain  |    17.15%     |    -0.47%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
--------------------------------------------
  buf size  | memif >= mbuf | memif < mbuf |
--------------------------------------------
non-zc gain |    10.10%     |    -0.29%    |
--------------------------------------------
   zc gain  |     8.87%     |    -0.99%    |
--------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/memif/rte_eth_memif.c | 134 ++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 42 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index f55776ca46..f6ef7c6e93 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -660,62 +660,112 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
 	}
 
-	while (n_tx_pkts < nb_pkts && n_free) {
-		mbuf_head = *bufs++;
-		nb_segs = mbuf_head->nb_segs;
-		mbuf = mbuf_head;
+	uint8_t i;
+	struct rte_mbuf **buf_tmp = bufs;
+	mbuf_head = *buf_tmp++;
+	struct rte_mempool *mp = mbuf_head->pool;
+
+	for (i = 1; i < nb_pkts; i++) {
+		mbuf_head = *buf_tmp++;
+		if (mbuf_head->pool != mp)
+			break;
+	}
+
+	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
+	if (i == nb_pkts && pmd->cfg.pkt_buffer_size >= mbuf_size) {
+		buf_tmp = bufs;
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-		saved_slot = slot;
-		d0 = &ring->desc[slot & mask];
-		dst_off = 0;
-		dst_len = (type == MEMIF_RING_C2S) ?
-			pmd->run.pkt_buffer_size : d0->length;
+			saved_slot = slot;
 
-next_in_chain:
-		src_off = 0;
-		src_len = rte_pktmbuf_data_len(mbuf);
+next_in_chain1:
+			d0 = &ring->desc[slot & mask];
+			cp_len = rte_pktmbuf_data_len(mbuf);
 
-		while (src_len) {
-			if (dst_len == 0) {
+			rte_memcpy((uint8_t *)memif_get_buffer(proc_private, d0),
+				rte_pktmbuf_mtod(mbuf, void *), cp_len);
+
+			d0->length = cp_len;
+			mq->n_bytes += cp_len;
+			slot++;
+			n_free--;
+
+			if (--nb_segs > 0) {
 				if (n_free) {
-					slot++;
-					n_free--;
 					d0->flags |= MEMIF_DESC_FLAG_NEXT;
-					d0 = &ring->desc[slot & mask];
-					dst_off = 0;
-					dst_len = (type == MEMIF_RING_C2S) ?
-					    pmd->run.pkt_buffer_size : d0->length;
-					d0->flags = 0;
+					mbuf = mbuf->next;
+					goto next_in_chain1;
 				} else {
 					slot = saved_slot;
-					goto no_free_slots;
+					goto free_mbufs;
 				}
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
-							       d0) + dst_off,
-				rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
-				cp_len);
+			n_tx_pkts++;
+		}
+free_mbufs:
+		rte_pktmbuf_free_bulk(buf_tmp, n_tx_pkts);
+	} else {
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-			mq->n_bytes += cp_len;
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-			dst_len -= cp_len;
+			saved_slot = slot;
+			d0 = &ring->desc[slot & mask];
+			dst_off = 0;
+			dst_len = (type == MEMIF_RING_C2S) ?
+				pmd->run.pkt_buffer_size : d0->length;
 
-			d0->length = dst_off;
-		}
+next_in_chain2:
+			src_off = 0;
+			src_len = rte_pktmbuf_data_len(mbuf);
 
-		if (--nb_segs > 0) {
-			mbuf = mbuf->next;
-			goto next_in_chain;
-		}
+			while (src_len) {
+				if (dst_len == 0) {
+					if (n_free) {
+						slot++;
+						n_free--;
+						d0->flags |= MEMIF_DESC_FLAG_NEXT;
+						d0 = &ring->desc[slot & mask];
+						dst_off = 0;
+						dst_len = (type == MEMIF_RING_C2S) ?
+						    pmd->run.pkt_buffer_size : d0->length;
+						d0->flags = 0;
+					} else {
+						slot = saved_slot;
+						goto no_free_slots;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		n_tx_pkts++;
-		slot++;
-		n_free--;
-		rte_pktmbuf_free(mbuf_head);
+				rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
+								       d0) + dst_off,
+					rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
+					cp_len);
+
+				mq->n_bytes += cp_len;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+				dst_len -= cp_len;
+
+				d0->length = dst_off;
+			}
+
+			if (--nb_segs > 0) {
+				mbuf = mbuf->next;
+				goto next_in_chain2;
+			}
+
+			n_tx_pkts++;
+			slot++;
+			n_free--;
+			rte_pktmbuf_free(mbuf_head);
+		}
 	}
 
 no_free_slots:
-- 
2.25.1



* RE: [PATCH v1 0/2] add a fast path for memif Rx/Tx
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-05-17 10:51   ` [PATCH v1 1/2] net/memif: add a Rx fast path Joyce Kong
  2022-05-17 10:51   ` [PATCH v1 2/2] net/memif: add a Tx " Joyce Kong
@ 2022-05-17 13:59   ` Morten Brørup
  2022-05-18  2:48   ` Ruifeng Wang
  2022-07-01 10:28   ` [PATCH v2 " Joyce Kong
  4 siblings, 0 replies; 26+ messages in thread
From: Morten Brørup @ 2022-05-17 13:59 UTC (permalink / raw)
  To: Joyce Kong; +Cc: ruifeng.wang, dev, nd

> From: Joyce Kong [mailto:joyce.kong@arm.com]
> Sent: Tuesday, 17 May 2022 12.51
> 
> For memif non-zero-copy mode, there is a branch to compare
> the mbuf and memif buffer size during memory copying. Add
> a fast memory copy path by removing this branch with mbuf
> and memif buffer size defined at compile time. And for Tx
> fast path, bulk free the mbufs which come from the same
> mempool.
> 
> When mbuf == memif buffer size, both Rx/Tx would choose
> the fast memcpy path. When mbuf < memif buffer size, the
> Rx chooses previous memcpy path while Tx chooses fast
> memcpy path. When mbuf > memif buffer size, the Rx chooses
> fast memcpy path while Tx chooses previous memcpy path.
> 
> Test with 1p1q on Ampere Altra AArch64 server,
> ---------------------------------------------------------
>   buf size  | memif = mbuf | memif < mbuf | memif > mbuf
> ---------------------------------------------------------
> non-zc gain |    16.95%    |     3.28%    |    13.29%
> ---------------------------------------------------------
>    zc gain  |    19.43%    |     4.62%    |    18.14%
> ---------------------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86server,
> ---------------------------------------------------------
>   buf size  | memif = mbuf | memif < mbuf | memif > mbuf
> ---------------------------------------------------------
> non-zc gain |    19.97%    |     2.35%    |    21.43%
> ---------------------------------------------------------
>    zc gain  |    14.30%    |    -1.21%    |    11.98%
> ---------------------------------------------------------
> 
> Joyce Kong (2):
>   net/memif: add a Rx fast path
>   net/memif: add a Tx fast path
> 
>  drivers/net/memif/rte_eth_memif.c | 258 ++++++++++++++++++++----------
>  1 file changed, 176 insertions(+), 82 deletions(-)
> 
> --
> 2.25.1
> 

Series-Acked-by: Morten Brørup <mb@smartsharesystems.com>



* RE: [PATCH v1 0/2] add a fast path for memif Rx/Tx
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
                     ` (2 preceding siblings ...)
  2022-05-17 13:59   ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Morten Brørup
@ 2022-05-18  2:48   ` Ruifeng Wang
  2022-07-01 10:28   ` [PATCH v2 " Joyce Kong
  4 siblings, 0 replies; 26+ messages in thread
From: Ruifeng Wang @ 2022-05-18  2:48 UTC (permalink / raw)
  To: Joyce Kong; +Cc: dev, nd, Joyce Kong, nd

> -----Original Message-----
> From: Joyce Kong <joyce.kong@arm.com>
> Sent: Tuesday, May 17, 2022 6:51 PM
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>; Joyce Kong <Joyce.Kong@arm.com>
> Subject: [PATCH v1 0/2] add a fast path for memif Rx/Tx
> 
> For memif non-zero-copy mode, there is a branch to compare the mbuf and
> memif buffer size during memory copying. Add a fast memory copy path by
> removing this branch with mbuf and memif buffer size defined at compile
> time. And for Tx fast path, bulk free the mbufs which come from the same
> mempool.
> 
> When mbuf == memif buffer size, both Rx/Tx would choose the fast
> memcpy path. When mbuf < memif buffer size, the Rx chooses previous
> memcpy path while Tx chooses fast memcpy path. When mbuf > memif
> buffer size, the Rx chooses fast memcpy path while Tx chooses previous
> memcpy path.
> 
> Test with 1p1q on Ampere Altra AArch64 server,
> ---------------------------------------------------------
>   buf size  | memif = mbuf | memif < mbuf | memif > mbuf
> ---------------------------------------------------------
> non-zc gain |    16.95%    |     3.28%    |    13.29%
> ---------------------------------------------------------
>    zc gain  |    19.43%    |     4.62%    |    18.14%
> ---------------------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86server,
> ---------------------------------------------------------
>   buf size  | memif = mbuf | memif < mbuf | memif > mbuf
> ---------------------------------------------------------
> non-zc gain |    19.97%    |     2.35%    |    21.43%
> ---------------------------------------------------------
>    zc gain  |    14.30%    |    -1.21%    |    11.98%
> ---------------------------------------------------------
> 
> Joyce Kong (2):
>   net/memif: add a Rx fast path
>   net/memif: add a Tx fast path
> 
>  drivers/net/memif/rte_eth_memif.c | 258 ++++++++++++++++++++--------
> --
>  1 file changed, 176 insertions(+), 82 deletions(-)
> 
> --
> 2.25.1
Series-reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>



* Re: [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-17 10:51   ` [PATCH v1 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-05-18 16:53     ` Ferruh Yigit
  2022-05-19  7:00       ` Joyce Kong
  2022-05-18 17:06     ` Ferruh Yigit
  1 sibling, 1 reply; 26+ messages in thread
From: Ferruh Yigit @ 2022-05-18 16:53 UTC (permalink / raw)
  To: Joyce Kong, Jakub Grajciar; +Cc: ruifeng.wang, dev, nd

On 5/17/2022 11:51 AM, Joyce Kong wrote:
> For memif non-zero-copy mode, there is a branch to compare
> the mbuf and memif buffer size during memory copying. Add
> a fast memory copy path by removing this branch with mbuf
> and memif buffer size defined at compile time. The removal
> of the branch leads to considerable performance uplift.
> 
> When memif <= buffer size, Rx chooses the fast memcpy path,
> otherwise it would choose the original path.
> 
> Test with 1p1q on Ampere Altra AArch64 server,
> --------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> --------------------------------------------
> non-zc gain |     4.30%     |    -0.52%    |
> --------------------------------------------
>     zc gain  |     2.46%     |     0.70%    |
> --------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86server,
> -------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> -------------------------------------------
> non-zc gain |     2.13%     |    -1.40%    |
> -------------------------------------------
>     zc gain  |     0.18%     |     0.48%    |
> -------------------------------------------
> 


Hi Joyce,

I have multiple questions,

1) The patch updates only the non-zero-copy mode Rx path ('eth_memif_rx'), 
why is zero-copy path performance also impacted?

2) As far as I can see there is a behavior change, more details below

3) The patch talks about the memif buffer size being defined at compile 
time; is the big "memif <= mbuf" if block optimized out?
Since 'pkt_buffer_size' is a devarg, it can change from run to run and 
is not known at compile time, so I doubt that it is optimized out.
Does having 'pkt_buffer_size' as a devarg break your logic?

4) One option gains performance and the other loses performance; do you 
think the gain case is the more common use case? Is there any data 
around it?


Jakub,

Do you want to test this patch first before progressing with it?

> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> ---
>   drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
>   1 file changed, 84 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
> index 587ad45576..f55776ca46 100644
> --- a/drivers/net/memif/rte_eth_memif.c
> +++ b/drivers/net/memif/rte_eth_memif.c
> @@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>   		goto refill;
>   	n_slots = last_slot - cur_slot;
>   
> -	while (n_slots && n_rx_pkts < nb_pkts) {
> -		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> -		if (unlikely(mbuf_head == NULL))
> -			goto no_free_bufs;
> -		mbuf = mbuf_head;
> -		mbuf->port = mq->in_port;
> +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> +		while (n_slots && n_rx_pkts < nb_pkts) {
> +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> +			if (unlikely(mbuf_head == NULL))
> +				goto no_free_bufs;
> +			mbuf = mbuf_head;
> +			mbuf->port = mq->in_port;
> +
> +next_slot1:
> +			s0 = cur_slot & mask;
> +			d0 = &ring->desc[s0];
>   
> -next_slot:
> -		s0 = cur_slot & mask;
> -		d0 = &ring->desc[s0];
> +			cp_len = d0->length;
>   
> -		src_len = d0->length;
> -		dst_off = 0;
> -		src_off = 0;
> +			rte_pktmbuf_data_len(mbuf) = cp_len;
> +			rte_pktmbuf_pkt_len(mbuf) = cp_len;
> +			if (mbuf != mbuf_head)
> +				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
>   
> -		do {
> -			dst_len = mbuf_size - dst_off;
> -			if (dst_len == 0) {
> -				dst_off = 0;
> -				dst_len = mbuf_size;
> +			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
> +				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
> +
> +			cur_slot++;
> +			n_slots--;
>   
> -				/* store pointer to tail */
> +			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
>   				mbuf_tail = mbuf;
>   				mbuf = rte_pktmbuf_alloc(mq->mempool);
>   				if (unlikely(mbuf == NULL))
>   					goto no_free_bufs;
> -				mbuf->port = mq->in_port;
>   				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
>   				if (unlikely(ret < 0)) {
>   					MIF_LOG(ERR, "number-of-segments-overflow");
>   					rte_pktmbuf_free(mbuf);
>   					goto no_free_bufs;
>   				}
> +				goto next_slot1;
>   			}

It is very hard to comment on the correct part of the patch, since it is 
heavily intermixed, but:
- previously, when the memif buffer is segmented and its size is less than 
the mbuf's, each mbuf is filled with as much memif data as possible before 
switching to the next mbuf, like:

   memif buffer
+-+  +-+  +-+  +-+
|a|->|b|->|c|->|d|
+-+  +-+  +-+  +-+

+---+  +---+
|abc|->|d  |
+---+  +---+
   mbuf


- Now each memif segment is placed in its own mbuf,

   memif buffer
+-+  +-+  +-+  +-+
|a|->|b|->|c|->|d|
+-+  +-+  +-+  +-+

+---+  +---+  +---+  +---+
|a  |->|b  |->|c  |->|d  |
+---+  +---+  +---+  +---+
   mbuf

Can you please confirm this behavior change? If so, can you please 
highlight it more in the commit log?
And is this tradeoff something preferred?


* Re: [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-17 10:51   ` [PATCH v1 1/2] net/memif: add a Rx fast path Joyce Kong
  2022-05-18 16:53     ` Ferruh Yigit
@ 2022-05-18 17:06     ` Ferruh Yigit
  2022-05-19 15:09       ` Joyce Kong
  1 sibling, 1 reply; 26+ messages in thread
From: Ferruh Yigit @ 2022-05-18 17:06 UTC (permalink / raw)
  To: Joyce Kong, Jakub Grajciar; +Cc: ruifeng.wang, dev, nd

On 5/17/2022 11:51 AM, Joyce Kong wrote:
> For memif non-zero-copy mode, there is a branch to compare
> the mbuf and memif buffer size during memory copying. Add
> a fast memory copy path by removing this branch with mbuf
> and memif buffer size defined at compile time. The removal
> of the branch leads to considerable performance uplift.
> 
> When memif <= buffer size, Rx chooses the fast memcpy path,
> otherwise it would choose the original path.
> 
> Test with 1p1q on Ampere Altra AArch64 server,
> --------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> --------------------------------------------
> non-zc gain |     4.30%     |    -0.52%    |
> --------------------------------------------
>     zc gain  |     2.46%     |     0.70%    |
> --------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86server,
> -------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> -------------------------------------------
> non-zc gain |     2.13%     |    -1.40%    |
> -------------------------------------------
>     zc gain  |     0.18%     |     0.48%    |
> -------------------------------------------
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>

<...>

> +	} else {
> +		while (n_slots && n_rx_pkts < nb_pkts) {
> +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> +			if (unlikely(mbuf_head == NULL))
> +				goto no_free_bufs;
> +			mbuf = mbuf_head;
> +			mbuf->port = mq->in_port;
> +
> +next_slot2:
> +			s0 = cur_slot & mask;
> +			d0 = &ring->desc[s0];
>   
> -			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
> -							   dst_off),
> -				(uint8_t *)memif_get_buffer(proc_private, d0) +
> -				src_off, cp_len);
> +			src_len = d0->length;
> +			dst_off = 0;
> +			src_off = 0;

Hi Joyce, Jakub,

Something doesn't look right in the original code (not in this patch), 
can you please help me check if I am missing something?

For the segmented memif buffer case, the first buffer is copied to the 
mbuf, 'dst_off' is increased, and the code jumps back to process the 
next memif segment:

  + d0
  |
  v
+++  +-+
|a+->+b|
+-+  +-+

+---+
|a  |
+-+-+
   ^
   |
   + dst_off

"
     if (d0->flags & MEMIF_DESC_FLAG_NEXT)
          goto next_slot;
"

But here 'dst_off' is set back to '0'; won't this cause the next memif 
buffer segment to be written to the beginning of the mbuf, overwriting 
the previous data?

Thanks,
Ferruh


* RE: [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-18 16:53     ` Ferruh Yigit
@ 2022-05-19  7:00       ` Joyce Kong
  2022-05-19  8:44         ` Joyce Kong
  0 siblings, 1 reply; 26+ messages in thread
From: Joyce Kong @ 2022-05-19  7:00 UTC (permalink / raw)
  To: Ferruh Yigit, Jakub Grajciar; +Cc: Ruifeng Wang, dev, nd

Hi Ferruh,

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> Sent: Thursday, May 19, 2022 12:53 AM
> To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar <jgrajcia@cisco.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
> 
> On 5/17/2022 11:51 AM, Joyce Kong wrote:
> > For memif non-zero-copy mode, there is a branch to compare the mbuf
> > and memif buffer size during memory copying. Add a fast memory copy
> > path by removing this branch with mbuf and memif buffer size defined
> > at compile time. The removal of the branch leads to considerable
> > performance uplift.
> >
> > When memif <= buffer size, Rx chooses the fast memcpy path, otherwise
> > it would choose the original path.
> >
> > Test with 1p1q on Ampere Altra AArch64 server,
> > --------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > --------------------------------------------
> > non-zc gain |     4.30%     |    -0.52%    |
> > --------------------------------------------
> >     zc gain  |     2.46%     |     0.70%    |
> > --------------------------------------------
> >
> > Test with 1p1q on Cascade Lake Xeon X86server,
> > -------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > -------------------------------------------
> > non-zc gain |     2.13%     |    -1.40%    |
> > -------------------------------------------
> >     zc gain  |     0.18%     |     0.48%    |
> > -------------------------------------------
> >
> 
> 
> Hi Joyce,
> 
> I have multiple questions,
> 
> 1) The patch updates only non-zero-copy mode Rx path ('eth_memif_rx'), why
> zero-copy path performance also impacted?
> 
For memif zero-copy mode, only the client runs 'eth_memif_rx_zc', and the server still
runs 'eth_memif_rx', so the patch also impacts zero-copy mode.

> 2) As far as I can see there is a behavior change, more details below
> 
> 3) patch talking about memif buffer size being defined in compile time, is the
> big "memif <= mbuf" if block optimized out?
> Since 'pkt_buffer_size' is a devarg, so it can change from run to run and it is not
> known in compile time, I doubt that it is optimized out.
> Is having  'pkt_buffer_size' as devarg breaks your logic?
> 
From run to run, run.pkt_buffer_size may change, but cfg.pkt_buffer_size,
which is the reserved maximum buffer size, does not change. That is why the patch
uses cfg.pkt_buffer_size to implement the logic.

> 4) One option gains performance and other loose performance, do you think
> gain performance case is more common use case? Is there any data around it?
> 
Yes, I think the performance-gain case is the more common case, as the default memif
buffer size equals the mbuf size. In theory, when the memif buffer size >= mbuf size, Rx
runs the original path, so it would not lead to an obvious impact.

> 
> Jakub,
> 
> Do you want to test this patch first before progressing with it?
> 
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > ---
> >   drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
> >   1 file changed, 84 insertions(+), 40 deletions(-)
> >
> > diff --git a/drivers/net/memif/rte_eth_memif.c
> > b/drivers/net/memif/rte_eth_memif.c
> > index 587ad45576..f55776ca46 100644
> > --- a/drivers/net/memif/rte_eth_memif.c
> > +++ b/drivers/net/memif/rte_eth_memif.c
> > @@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> >   		goto refill;
> >   	n_slots = last_slot - cur_slot;
> >
> > -	while (n_slots && n_rx_pkts < nb_pkts) {
> > -		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > -		if (unlikely(mbuf_head == NULL))
> > -			goto no_free_bufs;
> > -		mbuf = mbuf_head;
> > -		mbuf->port = mq->in_port;
> > +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> > +		while (n_slots && n_rx_pkts < nb_pkts) {
> > +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > +			if (unlikely(mbuf_head == NULL))
> > +				goto no_free_bufs;
> > +			mbuf = mbuf_head;
> > +			mbuf->port = mq->in_port;
> > +
> > +next_slot1:
> > +			s0 = cur_slot & mask;
> > +			d0 = &ring->desc[s0];
> >
> > -next_slot:
> > -		s0 = cur_slot & mask;
> > -		d0 = &ring->desc[s0];
> > +			cp_len = d0->length;
> >
> > -		src_len = d0->length;
> > -		dst_off = 0;
> > -		src_off = 0;
> > +			rte_pktmbuf_data_len(mbuf) = cp_len;
> > +			rte_pktmbuf_pkt_len(mbuf) = cp_len;
> > +			if (mbuf != mbuf_head)
> > +				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
> >
> > -		do {
> > -			dst_len = mbuf_size - dst_off;
> > -			if (dst_len == 0) {
> > -				dst_off = 0;
> > -				dst_len = mbuf_size;
> > +			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
> > +				(uint8_t *)memif_get_buffer(proc_private, d0),
> cp_len);
> > +
> > +			cur_slot++;
> > +			n_slots--;
> >
> > -				/* store pointer to tail */
> > +			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
> >   				mbuf_tail = mbuf;
> >   				mbuf = rte_pktmbuf_alloc(mq->mempool);
> >   				if (unlikely(mbuf == NULL))
> >   					goto no_free_bufs;
> > -				mbuf->port = mq->in_port;
> >   				ret = memif_pktmbuf_chain(mbuf_head,
> mbuf_tail, mbuf);
> >   				if (unlikely(ret < 0)) {
> >   					MIF_LOG(ERR, "number-of-segments-
> overflow");
> >   					rte_pktmbuf_free(mbuf);
> >   					goto no_free_bufs;
> >   				}
> > +				goto next_slot1;
> >   			}
> 
> It is very hard to comment on the correct part of the patch, since it is mixed a
> lot, but
> - previously when memif buffer is segmented, and its size is less than mbuf;
> mbuf is filled with as much memif data as possible and later switched to next
> mbuf, like:
> 
>    memif buffer
> +-+  +-+  +-+  +-+
> |a|->|b|->|c|->|d|
> +-+  +-+  +-+  +-+
> 
> +---+  +---+
> |abc|->|d  |
> +---+  +---+
>    mbuf
> 
> 
> - Now each memif segment is a mbuf,
> 
>    memif buffer
> +-+  +-+  +-+  +-+
> |a|->|b|->|c|->|d|
> +-+  +-+  +-+  +-+
> 
> +---+  +---+  +---+  +---+
> |a  |->|b  |->|c  |->|d  |
> +---+  +---+  +---+  +---+
>    mbuf
> 
> Can you please confirm this behavior change? If so can you please highlight is
> more in the commit log?
> And is this tradeoff something preferred?


* RE: [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-19  7:00       ` Joyce Kong
@ 2022-05-19  8:44         ` Joyce Kong
  0 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-05-19  8:44 UTC (permalink / raw)
  To: Ferruh Yigit, Jakub Grajciar; +Cc: Ruifeng Wang, dev, nd



> -----Original Message-----
> From: Joyce Kong
> Sent: Thursday, May 19, 2022 3:00 PM
> To: Ferruh Yigit <ferruh.yigit@xilinx.com>; Jakub Grajciar <jgrajcia@cisco.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> Subject: RE: [PATCH v1 1/2] net/memif: add a Rx fast path
> 
> Hi Ferruh,
> 
> > -----Original Message-----
> > From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> > Sent: Thursday, May 19, 2022 12:53 AM
> > To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar
> > <jgrajcia@cisco.com>
> > Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> > Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
> >
> > On 5/17/2022 11:51 AM, Joyce Kong wrote:
> > > For memif non-zero-copy mode, there is a branch to compare the mbuf
> > > and memif buffer size during memory copying. Add a fast memory copy
> > > path by removing this branch with mbuf and memif buffer size defined
> > > at compile time. The removal of the branch leads to considerable
> > > performance uplift.
> > >
> > > When memif <= buffer size, Rx chooses the fast memcpy path,
> > > otherwise it would choose the original path.
> > >
> > > Test with 1p1q on Ampere Altra AArch64 server,
> > > --------------------------------------------
> > >    buf size  | memif <= mbuf | memif > mbuf |
> > > --------------------------------------------
> > > non-zc gain |     4.30%     |    -0.52%    |
> > > --------------------------------------------
> > >     zc gain  |     2.46%     |     0.70%    |
> > > --------------------------------------------
> > >
> > > Test with 1p1q on Cascade Lake Xeon X86server,
> > > -------------------------------------------
> > >    buf size  | memif <= mbuf | memif > mbuf |
> > > -------------------------------------------
> > > non-zc gain |     2.13%     |    -1.40%    |
> > > -------------------------------------------
> > >     zc gain  |     0.18%     |     0.48%    |
> > > -------------------------------------------
> > >
> >
> >
> > Hi Joyce,
> >
> > I have multiple questions,
> >
> > 1) The patch updates only non-zero-copy mode Rx path ('eth_memif_rx'),
> > why zero-copy path performance also impacted?
> >
> For memif zero-copy mode, only client runs 'eth_memif_rx_zc', and server still
> runs 'eth_memif_rx', so the patch would impacts zero-copy mode.
> 
> > 2) As far as I can see there is a behavior change, more details below
> >
> > 3) patch talking about memif buffer size being defined in compile
> > time, is the big "memif <= mbuf" if block optimized out?
> > Since 'pkt_buffer_size' is a devarg, so it can change from run to run
> > and it is not known in compile time, I doubt that it is optimized out.
> > Is having  'pkt_buffer_size' as devarg breaks your logic?
> >
> From memif run to run, run.pkt_buffer_size would change, and
> cfg.pkt_buffer_size which is the reserved max buffer size would not change.
> For patch details, I use cfg.pkt_buffer_size to implement the logic.
> 
> > 4) One option gains performance and other loose performance, do you
> > think gain performance case is more common use case? Is there any data
> around it?
> >
> Yes, I think the gain performance case is more common case, as the default
> memif buffer size equals to mbuf size. In theory, when memif buf size >= mbuf
> size, the Rx runs the original path, it would not lead to obvious impact.
> 
> >
> > Jakub,
> >
> > Do you want to test this patch first before progressing with it?
> >
> > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > ---
> > >   drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
> > >   1 file changed, 84 insertions(+), 40 deletions(-)
> > >
> > > diff --git a/drivers/net/memif/rte_eth_memif.c
> > > b/drivers/net/memif/rte_eth_memif.c
> > > index 587ad45576..f55776ca46 100644
> > > --- a/drivers/net/memif/rte_eth_memif.c
> > > +++ b/drivers/net/memif/rte_eth_memif.c
> > > @@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf
> > **bufs, uint16_t nb_pkts)
> > >   		goto refill;
> > >   	n_slots = last_slot - cur_slot;
> > >
> > > -	while (n_slots && n_rx_pkts < nb_pkts) {
> > > -		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > > -		if (unlikely(mbuf_head == NULL))
> > > -			goto no_free_bufs;
> > > -		mbuf = mbuf_head;
> > > -		mbuf->port = mq->in_port;
> > > +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> > > +		while (n_slots && n_rx_pkts < nb_pkts) {
> > > +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > > +			if (unlikely(mbuf_head == NULL))
> > > +				goto no_free_bufs;
> > > +			mbuf = mbuf_head;
> > > +			mbuf->port = mq->in_port;
> > > +
> > > +next_slot1:
> > > +			s0 = cur_slot & mask;
> > > +			d0 = &ring->desc[s0];
> > >
> > > -next_slot:
> > > -		s0 = cur_slot & mask;
> > > -		d0 = &ring->desc[s0];
> > > +			cp_len = d0->length;
> > >
> > > -		src_len = d0->length;
> > > -		dst_off = 0;
> > > -		src_off = 0;
> > > +			rte_pktmbuf_data_len(mbuf) = cp_len;
> > > +			rte_pktmbuf_pkt_len(mbuf) = cp_len;
> > > +			if (mbuf != mbuf_head)
> > > +				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
> > >
> > > -		do {
> > > -			dst_len = mbuf_size - dst_off;
> > > -			if (dst_len == 0) {
> > > -				dst_off = 0;
> > > -				dst_len = mbuf_size;
> > > +			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
> > > +				(uint8_t *)memif_get_buffer(proc_private, d0),
> > cp_len);
> > > +
> > > +			cur_slot++;
> > > +			n_slots--;
> > >
> > > -				/* store pointer to tail */
> > > +			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
> > >   				mbuf_tail = mbuf;
> > >   				mbuf = rte_pktmbuf_alloc(mq->mempool);
> > >   				if (unlikely(mbuf == NULL))
> > >   					goto no_free_bufs;
> > > -				mbuf->port = mq->in_port;
> > >   				ret = memif_pktmbuf_chain(mbuf_head,
> > mbuf_tail, mbuf);
> > >   				if (unlikely(ret < 0)) {
> > >   					MIF_LOG(ERR, "number-of-segments-
> > overflow");
> > >   					rte_pktmbuf_free(mbuf);
> > >   					goto no_free_bufs;
> > >   				}
> > > +				goto next_slot1;
> > >   			}
> >
> > It is very hard to comment on the correct part of the patch, since it
> > is mixed a lot, but
> > - previously when memif buffer is segmented, and its size is less than
> > mbuf; mbuf is filled with as much memif data as possible and later
> > switched to next mbuf, like:
> >
> >    memif buffer
> > +-+  +-+  +-+  +-+
> > |a|->|b|->|c|->|d|
> > +-+  +-+  +-+  +-+
> >
> > +---+  +---+
> > |abc|->|d  |
> > +---+  +---+
> >    mbuf
> >
> >
> > - Now each memif segment is a mbuf,
> >
> >    memif buffer
> > +-+  +-+  +-+  +-+
> > |a|->|b|->|c|->|d|
> > +-+  +-+  +-+  +-+
> >
> > +---+  +---+  +---+  +---+
> > |a  |->|b  |->|c  |->|d  |
> > +---+  +---+  +---+  +---+
> >    mbuf
> >
> > Can you please confirm this behavior change? If so can you please
> > highlight is more in the commit log?
> > And is this tradeoff something preferred?

Yes, the patch leads to the behavior change, and I will highlight it more in the commit
log for the next version.
This change is the same as what zero-copy mode does, trading more memory space for
removing the complex comparison. I am also looking forward to getting some feedback
from the community about the tradeoff.

Thanks,
Joyce



* RE: [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-18 17:06     ` Ferruh Yigit
@ 2022-05-19 15:09       ` Joyce Kong
  2022-05-19 16:38         ` Ferruh Yigit
  0 siblings, 1 reply; 26+ messages in thread
From: Joyce Kong @ 2022-05-19 15:09 UTC (permalink / raw)
  To: Ferruh Yigit, Jakub Grajciar; +Cc: Ruifeng Wang, dev, nd

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> Sent: Thursday, May 19, 2022 1:06 AM
> To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar <jgrajcia@cisco.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
> 
> On 5/17/2022 11:51 AM, Joyce Kong wrote:
> > For memif non-zero-copy mode, there is a branch to compare
> > the mbuf and memif buffer size during memory copying. Add
> > a fast memory copy path by removing this branch with mbuf
> > and memif buffer size defined at compile time. The removal
> > of the branch leads to considerable performance uplift.
> >
> > When memif <= buffer size, Rx chooses the fast memcpy path,
> > otherwise it would choose the original path.
> >
> > Test with 1p1q on Ampere Altra AArch64 server,
> > --------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > --------------------------------------------
> > non-zc gain |     4.30%     |    -0.52%    |
> > --------------------------------------------
> >     zc gain  |     2.46%     |     0.70%    |
> > --------------------------------------------
> >
> > Test with 1p1q on Cascade Lake Xeon X86server,
> > -------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > -------------------------------------------
> > non-zc gain |     2.13%     |    -1.40%    |
> > -------------------------------------------
> >     zc gain  |     0.18%     |     0.48%    |
> > -------------------------------------------
> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> 
> <...>
> 
> > +	} else {
> > +		while (n_slots && n_rx_pkts < nb_pkts) {
> > +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > +			if (unlikely(mbuf_head == NULL))
> > +				goto no_free_bufs;
> > +			mbuf = mbuf_head;
> > +			mbuf->port = mq->in_port;
> > +
> > +next_slot2:
> > +			s0 = cur_slot & mask;
> > +			d0 = &ring->desc[s0];
> >
> > -			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
> > -							   dst_off),
> > -				(uint8_t *)memif_get_buffer(proc_private, d0)
> +
> > -				src_off, cp_len);
> > +			src_len = d0->length;
> > +			dst_off = 0;
> > +			src_off = 0;
> 
> Hi Joyce, Jakub,
> 
> Something doesn't look right in the original code (not in this patch),
> can you please help me check if I am missing something?
> 
> For the memif buffer segmented case, first buffer will be copied to
> mbuf, 'dst_off' increased and jump back to process next memif segment:
> 
>   + d0
>   |
>   v
> +++  +-+
> |a+->+b|
> +-+  +-+
> 
> +---+
> |a  |
> +-+-+
>    ^
>    |
>    + dst_off
> 
> "
>      if (d0->flags & MEMIF_DESC_FLAG_NEXT)
>           goto next_slot;
> "
> 
> But here 'dst_off' set back to '0', wont this cause next memif buffer
> segment to write to beginning of mbuf overwriting previous data?
> 
> Thanks,
> Ferruh

Hi Ferruh,

I agree with you here, and sorry I didn't notice it before. Perhaps moving
'dst_off = 0' to the line above 'next_slot' would solve the overwriting?
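
Something like the following (a minimal stand-alone illustration of the
reordering, with simplified stand-in types, the destination assumed large
enough, and chaining of extra mbufs left out; not a tested driver change):

#include <stdint.h>
#include <string.h>

struct demo_desc { const uint8_t *buf; uint32_t len; };

/* Current ordering: the offset is reset for every descriptor, so later
 * segments overwrite the data already copied into the mbuf. */
static uint32_t demo_copy_chain_reset_per_desc(uint8_t *dst,
					       const struct demo_desc *d, int n)
{
	uint32_t dst_off = 0;
	int i;

	for (i = 0; i < n; i++) {
		dst_off = 0;			/* <-- the problem */
		memcpy(dst + dst_off, d[i].buf, d[i].len);
		dst_off += d[i].len;
	}
	return dst_off;	/* only the last segment survives */
}

/* Suggested ordering: the offset is initialised once per packet, so each
 * segment appends after the data already copied. */
static uint32_t demo_copy_chain_reset_per_pkt(uint8_t *dst,
					      const struct demo_desc *d, int n)
{
	uint32_t dst_off = 0;			/* moved above the loop */
	int i;

	for (i = 0; i < n; i++) {
		memcpy(dst + dst_off, d[i].buf, d[i].len);
		dst_off += d[i].len;
	}
	return dst_off;
}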

Best Regards,
Joyce


* Re: [PATCH v1 1/2] net/memif: add a Rx fast path
  2022-05-19 15:09       ` Joyce Kong
@ 2022-05-19 16:38         ` Ferruh Yigit
  0 siblings, 0 replies; 26+ messages in thread
From: Ferruh Yigit @ 2022-05-19 16:38 UTC (permalink / raw)
  To: Joyce Kong, Jakub Grajciar; +Cc: Ruifeng Wang, dev, nd

On 5/19/2022 4:09 PM, Joyce Kong wrote:
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
>> Sent: Thursday, May 19, 2022 1:06 AM
>> To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar <jgrajcia@cisco.com>
>> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
>> <nd@arm.com>
>> Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
>>
>> On 5/17/2022 11:51 AM, Joyce Kong wrote:
>>> For memif non-zero-copy mode, there is a branch to compare
>>> the mbuf and memif buffer size during memory copying. Add
>>> a fast memory copy path by removing this branch with mbuf
>>> and memif buffer size defined at compile time. The removal
>>> of the branch leads to considerable performance uplift.
>>>
>>> When memif <= buffer size, Rx chooses the fast memcpy path,
>>> otherwise it would choose the original path.
>>>
>>> Test with 1p1q on Ampere Altra AArch64 server,
>>> --------------------------------------------
>>>     buf size  | memif <= mbuf | memif > mbuf |
>>> --------------------------------------------
>>> non-zc gain |     4.30%     |    -0.52%    |
>>> --------------------------------------------
>>>      zc gain  |     2.46%     |     0.70%    |
>>> --------------------------------------------
>>>
>>> Test with 1p1q on Cascade Lake Xeon X86server,
>>> -------------------------------------------
>>>     buf size  | memif <= mbuf | memif > mbuf |
>>> -------------------------------------------
>>> non-zc gain |     2.13%     |    -1.40%    |
>>> -------------------------------------------
>>>      zc gain  |     0.18%     |     0.48%    |
>>> -------------------------------------------
>>>
>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>
>> <...>
>>
>>> +   } else {
>>> +           while (n_slots && n_rx_pkts < nb_pkts) {
>>> +                   mbuf_head = rte_pktmbuf_alloc(mq->mempool);
>>> +                   if (unlikely(mbuf_head == NULL))
>>> +                           goto no_free_bufs;
>>> +                   mbuf = mbuf_head;
>>> +                   mbuf->port = mq->in_port;
>>> +
>>> +next_slot2:
>>> +                   s0 = cur_slot & mask;
>>> +                   d0 = &ring->desc[s0];
>>>
>>> -                   rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
>>> -                                                      dst_off),
>>> -                           (uint8_t *)memif_get_buffer(proc_private, d0)
>> +
>>> -                           src_off, cp_len);
>>> +                   src_len = d0->length;
>>> +                   dst_off = 0;
>>> +                   src_off = 0;
>>
>> Hi Joyce, Jakub,
>>
>> Something doesn't look right in the original code (not in this patch),
>> can you please help me check if I am missing something?
>>
>> For the memif buffer segmented case, first buffer will be copied to
>> mbuf, 'dst_off' increased and jump back to process next memif segment:
>>
>>    + d0
>>    |
>>    v
>> +++  +-+
>> |a+->+b|
>> +-+  +-+
>>
>> +---+
>> |a  |
>> +-+-+
>>     ^
>>     |
>>     + dst_off
>>
>> "
>>       if (d0->flags & MEMIF_DESC_FLAG_NEXT)
>>            goto next_slot;
>> "
>>
>> But here 'dst_off' set back to '0', wont this cause next memif buffer
>> segment to write to beginning of mbuf overwriting previous data?
>>
>> Thanks,
>> Ferruh
> 
> Hi Ferruh,
> 
> Agree with you here, and sorry I didn’t notice it before. Perhaps moving
> 'det_off = 0' to the line above 'next_slot' would solve the overwriting?
> 

Yes, I think this solves the issue.


And I wonder why this was not caught by testing. @Jakub, are segmented 
memif buffers not a common use case?

I was able to reproduce the issue as follows (and confirmed that the 
suggested change fixes it):

server
./build/app/dpdk-testpmd --proc-type=primary --file-prefix=pmd1 
--vdev=net_memif0,role=server,bsize=32 -- -i --txpkts=512
 > set fwd txonly
 > start

client
./build/app/dpdk-testpmd --proc-type=primary --file-prefix=pmd2 
--vdev=net_memif1,bsize=32 -- -i
 > set fwd rxonly
 > set verbose 3
 > start

The 'client' will display the packet info incorrectly; it will be all '0'. 
It is also possible to capture packets in the client and confirm.


* [PATCH v2 0/2] add a fast path for memif Rx/Tx
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
                     ` (3 preceding siblings ...)
  2022-05-18  2:48   ` Ruifeng Wang
@ 2022-07-01 10:28   ` Joyce Kong
  2022-07-01 10:28     ` [PATCH v2 1/2] net/memif: add a Rx fast path Joyce Kong
  2022-07-01 10:28     ` [PATCH v2 " Joyce Kong
  4 siblings, 2 replies; 26+ messages in thread
From: Joyce Kong @ 2022-07-01 10:28 UTC (permalink / raw)
  Cc: dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Add
a fast memory copy path by removing this branch with mbuf
and memif buffer size defined at compile time. And for Tx
fast path, bulk free the mbufs which come from the same
mempool.

When mbuf == memif buffer size, both Rx/Tx would choose
the fast memcpy path. When mbuf < memif buffer size, the
Rx chooses previous memcpy path while Tx chooses fast
memcpy path. When mbuf > memif buffer size, the Rx chooses
fast memcpy path while Tx chooses previous memcpy path.

Test with 1p1q on Ampere Altra AArch64 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    16.95%    |     3.28%    |    13.29%
---------------------------------------------------------
   zc gain  |    19.43%    |     4.62%    |    18.14%
---------------------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    19.97%    |     2.35%    |    21.43%
---------------------------------------------------------
   zc gain  |    14.30%    |    -1.21%    |    11.98%
---------------------------------------------------------

v2:
 Rebase v1 and update commit message.

Joyce Kong (2):
  net/memif: add a Rx fast path
  net/memif: add a Tx fast path

 drivers/net/memif/rte_eth_memif.c | 257 ++++++++++++++++++++----------
 1 file changed, 175 insertions(+), 82 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 1/2] net/memif: add a Rx fast path
  2022-07-01 10:28   ` [PATCH v2 " Joyce Kong
@ 2022-07-01 10:28     ` Joyce Kong
  2022-07-01 16:51       ` Stephen Hemminger
  2022-08-22  3:47       ` [PATCH v3 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-07-01 10:28     ` [PATCH v2 " Joyce Kong
  1 sibling, 2 replies; 26+ messages in thread
From: Joyce Kong @ 2022-07-01 10:28 UTC (permalink / raw)
  To: Jakub Grajciar; +Cc: dev, nd, Joyce Kong, Ruifeng Wang, Morten Brørup

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Add
a fast memory copy path by removing this branch with mbuf
and memif buffer size defined at compile time. The removal
of the branch leads to considerable performance uplift.
The Rx fast path does not change how the memif buffer contents
are stored in the mbuf.

When the memif buffer size <= the mbuf size, Rx chooses the fast
memcpy path, otherwise it chooses the original path.

Test with 1p1q on Ampere Altra AArch64 server,
----------------------------------------------
|  buf size   | memif <= mbuf | memif > mbuf |
----------------------------------------------
| non-zc gain |     4.30%     |    -0.52%    |
----------------------------------------------
|   zc gain   |     2.46%     |     0.70%    |
----------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
----------------------------------------------
|   buf size  | memif <= mbuf | memif > mbuf |
----------------------------------------------
| non-zc gain |     2.13%     |    -1.40%    |
----------------------------------------------
|   zc gain   |     0.18%     |     0.48%    |
----------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 123 ++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 40 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index dd951b8296..24fc8b13fa 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -341,67 +341,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	if (cur_slot == last_slot)
 		goto refill;
 	n_slots = last_slot - cur_slot;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+
+next_slot1:
+			mbuf->port = mq->in_port;
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
-		dst_off = 0;
+			cp_len = d0->length;
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		src_len = d0->length;
-		src_off = 0;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			cur_slot++;
+			n_slots--;
 
-				/* store pointer to tail */
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
 				if (unlikely(mbuf == NULL))
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		cur_slot++;
-		n_slots--;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +738,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 2/2] net/memif: add a Tx fast path
  2022-07-01 10:28   ` [PATCH v2 " Joyce Kong
  2022-07-01 10:28     ` [PATCH v2 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-07-01 10:28     ` Joyce Kong
  1 sibling, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-07-01 10:28 UTC (permalink / raw)
  To: Jakub Grajciar; +Cc: dev, nd, Joyce Kong, Ruifeng Wang, Morten Brørup

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. If all
mbufs come from the same mempool and the memif buffer size >=
the mbuf size, add a fast Tx memory copy path without the
comparison branch and with mbuf bulk free; otherwise still run
the original Tx path.
The Tx fast path does not change how the mbuf contents are
stored in the memif buffer.

The removal of the branch and bulk free lead to considerable
performance uplift.
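
The gating check itself is cheap; roughly (an illustrative sketch of the
precondition only, reusing the local names from eth_memif_tx()):

        /* Fast Tx applies only when the whole burst shares one mempool
         * (so it can be returned with a single rte_pktmbuf_free_bulk())
         * and one mbuf segment always fits in one memif descriptor. */
        struct rte_mempool *mp = bufs[0]->pool;
        uint16_t i;

        for (i = 1; i < nb_pkts; i++)
                if (bufs[i]->pool != mp)
                        break;

        uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) -
                             RTE_PKTMBUF_HEADROOM;
        if (i == nb_pkts && pmd->cfg.pkt_buffer_size >= mbuf_size) {
                /* fast path: one plain copy per segment, then bulk free */
        } else {
                /* original path: per-segment size checks, per-packet free */
        }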

Test with 1p1q on Ampere Altra AArch64 server,
----------------------------------------------
|  buf size   | memif >= mbuf | memif < mbuf |
----------------------------------------------
| non-zc gain |    13.35%     |    -0.77%    |
----------------------------------------------
|  zc gain    |    17.15%     |    -0.47%    |
----------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
----------------------------------------------
|  buf size   | memif >= mbuf | memif < mbuf |
----------------------------------------------
| non-zc gain |    10.10%     |    -0.29%    |
----------------------------------------------
|   zc gain   |     8.87%     |    -0.99%    |
----------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 134 ++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 42 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 24fc8b13fa..bafcfd5a7c 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -659,62 +659,112 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
 	}
 
-	while (n_tx_pkts < nb_pkts && n_free) {
-		mbuf_head = *bufs++;
-		nb_segs = mbuf_head->nb_segs;
-		mbuf = mbuf_head;
+	uint8_t i;
+	struct rte_mbuf **buf_tmp = bufs;
+	mbuf_head = *buf_tmp++;
+	struct rte_mempool *mp = mbuf_head->pool;
+
+	for (i = 1; i < nb_pkts; i++) {
+		mbuf_head = *buf_tmp++;
+		if (mbuf_head->pool != mp)
+			break;
+	}
+
+	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
+	if (i == nb_pkts && pmd->cfg.pkt_buffer_size >= mbuf_size) {
+		buf_tmp = bufs;
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-		saved_slot = slot;
-		d0 = &ring->desc[slot & mask];
-		dst_off = 0;
-		dst_len = (type == MEMIF_RING_C2S) ?
-			pmd->run.pkt_buffer_size : d0->length;
+			saved_slot = slot;
 
-next_in_chain:
-		src_off = 0;
-		src_len = rte_pktmbuf_data_len(mbuf);
+next_in_chain1:
+			d0 = &ring->desc[slot & mask];
+			cp_len = rte_pktmbuf_data_len(mbuf);
 
-		while (src_len) {
-			if (dst_len == 0) {
+			rte_memcpy((uint8_t *)memif_get_buffer(proc_private, d0),
+				rte_pktmbuf_mtod(mbuf, void *), cp_len);
+
+			d0->length = cp_len;
+			mq->n_bytes += cp_len;
+			slot++;
+			n_free--;
+
+			if (--nb_segs > 0) {
 				if (n_free) {
-					slot++;
-					n_free--;
 					d0->flags |= MEMIF_DESC_FLAG_NEXT;
-					d0 = &ring->desc[slot & mask];
-					dst_off = 0;
-					dst_len = (type == MEMIF_RING_C2S) ?
-					    pmd->run.pkt_buffer_size : d0->length;
-					d0->flags = 0;
+					mbuf = mbuf->next;
+					goto next_in_chain1;
 				} else {
 					slot = saved_slot;
-					goto no_free_slots;
+					goto free_mbufs;
 				}
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
-							       d0) + dst_off,
-				rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
-				cp_len);
+			n_tx_pkts++;
+		}
+free_mbufs:
+		rte_pktmbuf_free_bulk(buf_tmp, n_tx_pkts);
+	} else {
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-			mq->n_bytes += cp_len;
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-			dst_len -= cp_len;
+			saved_slot = slot;
+			d0 = &ring->desc[slot & mask];
+			dst_off = 0;
+			dst_len = (type == MEMIF_RING_C2S) ?
+				pmd->run.pkt_buffer_size : d0->length;
 
-			d0->length = dst_off;
-		}
+next_in_chain2:
+			src_off = 0;
+			src_len = rte_pktmbuf_data_len(mbuf);
 
-		if (--nb_segs > 0) {
-			mbuf = mbuf->next;
-			goto next_in_chain;
-		}
+			while (src_len) {
+				if (dst_len == 0) {
+					if (n_free) {
+						slot++;
+						n_free--;
+						d0->flags |= MEMIF_DESC_FLAG_NEXT;
+						d0 = &ring->desc[slot & mask];
+						dst_off = 0;
+						dst_len = (type == MEMIF_RING_C2S) ?
+						    pmd->run.pkt_buffer_size : d0->length;
+						d0->flags = 0;
+					} else {
+						slot = saved_slot;
+						goto no_free_slots;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		n_tx_pkts++;
-		slot++;
-		n_free--;
-		rte_pktmbuf_free(mbuf_head);
+				rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
+								       d0) + dst_off,
+					rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
+					cp_len);
+
+				mq->n_bytes += cp_len;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+				dst_len -= cp_len;
+
+				d0->length = dst_off;
+			}
+
+			if (--nb_segs > 0) {
+				mbuf = mbuf->next;
+				goto next_in_chain2;
+			}
+
+			n_tx_pkts++;
+			slot++;
+			n_free--;
+			rte_pktmbuf_free(mbuf_head);
+		}
 	}
 
 no_free_slots:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/2] net/memif: add a Rx fast path
  2022-07-01 10:28     ` [PATCH v2 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-07-01 16:51       ` Stephen Hemminger
  2022-08-22  3:47       ` [PATCH v3 0/2] add a fast path for memif Rx/Tx Joyce Kong
  1 sibling, 0 replies; 26+ messages in thread
From: Stephen Hemminger @ 2022-07-01 16:51 UTC (permalink / raw)
  To: Joyce Kong; +Cc: Jakub Grajciar, dev, nd, Ruifeng Wang, Morten Brørup

On Fri,  1 Jul 2022 10:28:14 +0000
Joyce Kong <joyce.kong@arm.com> wrote:

>  	n_slots = last_slot - cur_slot;
> +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> +		while (n_slots && n_rx_pkts < nb_pkts) {
> +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> +			if (unlikely(mbuf_head == NULL))
> +				goto no_free_bufs;
> +			mbuf = mbuf_head;
> +
> +next_slot1:
> +			mbuf->port = mq->in_port;
> +			s0 = cur_slot & mask;
> +			d0 = &ring->desc[s0];
>  

You might get additional speedup by doing bulk allocation.
If you know you are going to get N packets, then rte_pktmbuf_alloc_bulk()
might speed it up.
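
Something along these lines, perhaps (just a sketch; assumes a compile-time
burst cap and that any unused tail is handed back to the pool):

        struct rte_mbuf *mbufs[MAX_PKT_BURST];  /* assumed burst cap */
        uint16_t n = RTE_MIN(nb_pkts, (uint16_t)MAX_PKT_BURST);

        /* one mempool round-trip for the whole burst; returns 0 on success */
        if (unlikely(rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, n) < 0))
                goto no_free_bufs;

        /* ... fill mbufs[0..n_rx_pkts-1] from the ring descriptors ... */

        /* hand back whatever was allocated but not used */
        if (n_rx_pkts < n)
                rte_pktmbuf_free_bulk(mbufs + n_rx_pkts, n - n_rx_pkts);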

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 0/2] add a fast path for memif Rx/Tx
  2022-07-01 10:28     ` [PATCH v2 1/2] net/memif: add a Rx fast path Joyce Kong
  2022-07-01 16:51       ` Stephen Hemminger
@ 2022-08-22  3:47       ` Joyce Kong
  2022-08-22  3:47         ` [PATCH v3 1/2] net/memif: add a Rx fast path Joyce Kong
  2022-08-22  3:47         ` [PATCH v3 2/2] net/memif: add a Tx " Joyce Kong
  1 sibling, 2 replies; 26+ messages in thread
From: Joyce Kong @ 2022-08-22  3:47 UTC (permalink / raw)
  To: jgrajcia, stephen, huzaifa.rahman; +Cc: dev, nd, mb, ruifeng.wang, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copy. Add a
fast memcpy path by removing this branch with mbuf and memif
buffer size defined at compile time. For the Rx fast path, bulk
allocate mbufs to get additional speedup. For the Tx fast path,
bulk free mbufs which come from the same mempool.

When mbuf == memif buffer size, both Rx/Tx would choose the
fast memcpy path. When mbuf < memif buffer size, the Rx
chooses previous memcpy path while Tx chooses fast memcpy
path. When mbuf > memif buffer size, the Rx chooses fast
memcpy path while Tx chooses previous memcpy path.

Test with 1p1q on N1SDP AArch64 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    47.16%    |    24.67%    |    12.47%
---------------------------------------------------------
   zc gain  |    20.96%    |     9.16%    |    10.66%
---------------------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    23.52%    |    14.20%    |     5.10%
---------------------------------------------------------
   zc gain  |    17.49%    |    10.62%    |    12.42%
---------------------------------------------------------

v3:
 Add bulk allocation to get additional speedup for memif Rx
 fast path. <Stephen Hemminger>

v2:
 Rebase v1 and update commit message.

Joyce Kong (2):
  net/memif: add a Rx fast path
  net/memif: add a Tx fast path

 drivers/net/memif/rte_eth_memif.c | 271 +++++++++++++++++++++---------
 1 file changed, 188 insertions(+), 83 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 1/2] net/memif: add a Rx fast path
  2022-08-22  3:47       ` [PATCH v3 0/2] add a fast path for memif Rx/Tx Joyce Kong
@ 2022-08-22  3:47         ` Joyce Kong
  2022-08-31 16:25           ` Stephen Hemminger
  2022-08-22  3:47         ` [PATCH v3 2/2] net/memif: add a Tx " Joyce Kong
  1 sibling, 1 reply; 26+ messages in thread
From: Joyce Kong @ 2022-08-22  3:47 UTC (permalink / raw)
  To: jgrajcia, stephen, huzaifa.rahman; +Cc: dev, nd, mb, ruifeng.wang, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Mbuf
and memif buffer size is defined at compile time. If memif
buf size <= mbuf size, add a fast Rx memory copy path by
removing this branch and adding mbuf bulk allocation.

The removal of the branch and bulk alloc lead to considerable
performance uplift.

Test with 1p1q on N1SDP AArch64 server,
--------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
--------------------------------------------
non-zc gain |     26.85%    |    -0.37%    |
--------------------------------------------
   zc gain  |      8.57%    |     3.04%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
--------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
--------------------------------------------
non-zc gain |     17.54%    |    -0.42%    |
--------------------------------------------
   zc gain  |     10.67%    |     0.26%    |
--------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 137 +++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 41 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index dd951b8296..2ea2a8e266 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -342,66 +342,122 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		goto refill;
 	n_slots = last_slot - cur_slot;
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
-		dst_off = 0;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		struct rte_mbuf *mbufs[nb_pkts];
+		ret = rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, nb_pkts);
+			if (unlikely(ret < 0))
+				goto no_free_bufs;
+
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = mbufs[n_rx_pkts];
+			mbuf = mbuf_head;
+
+next_slot1:
+			mbuf->port = mq->in_port;
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			cp_len = d0->length;
 
-		src_len = d0->length;
-		src_off = 0;
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
 
-				/* store pointer to tail */
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
-				if (unlikely(mbuf == NULL))
+				if (unlikely(mbuf == NULL)) {
+					rte_pktmbuf_free_bulk(mbufs + n_rx_pkts,
+							nb_pkts - n_rx_pkts);
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
+				}
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
+					rte_pktmbuf_free_bulk(mbufs + n_rx_pkts,
+							nb_pkts - n_rx_pkts);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+		if (n_rx_pkts < nb_pkts)
+			rte_pktmbuf_free_bulk(mbufs + n_rx_pkts, nb_pkts - n_rx_pkts);
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-		cur_slot++;
-		n_slots--;
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
+
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +750,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 2/2] net/memif: add a Tx fast path
  2022-08-22  3:47       ` [PATCH v3 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-08-22  3:47         ` [PATCH v3 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-08-22  3:47         ` Joyce Kong
  1 sibling, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-08-22  3:47 UTC (permalink / raw)
  To: jgrajcia, stephen, huzaifa.rahman; +Cc: dev, nd, mb, ruifeng.wang, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. If all
mbufs come from the same mempool, and memif buf size >= mbuf
size, add a fast Tx memory copy path without the comparing
branch and with mbuf bulk free, otherwise still run the
original Tx path.

The removal of the branch and bulk free lead to considerable
performance uplift.

Test with 1p1q on N1SDP AArch64 server,
--------------------------------------------
  buf size  | memif >= mbuf | memif < mbuf |
--------------------------------------------
non-zc gain |    10.82%     |     0.04%    |
--------------------------------------------
   zc gain  |     8.86%     |     3.18%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
--------------------------------------------
  buf size  | memif >= mbuf | memif < mbuf |
--------------------------------------------
non-zc gain |     7.32%     |    -0.85%    |
--------------------------------------------
   zc gain  |    12.75%     |    -0.16%    |
--------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 134 ++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 42 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 2ea2a8e266..b0b5abef5a 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -671,62 +671,112 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
 	}
 
-	while (n_tx_pkts < nb_pkts && n_free) {
-		mbuf_head = *bufs++;
-		nb_segs = mbuf_head->nb_segs;
-		mbuf = mbuf_head;
+	uint8_t i;
+	struct rte_mbuf **buf_tmp = bufs;
+	mbuf_head = *buf_tmp++;
+	struct rte_mempool *mp = mbuf_head->pool;
+
+	for (i = 1; i < nb_pkts; i++) {
+		mbuf_head = *buf_tmp++;
+		if (mbuf_head->pool != mp)
+			break;
+	}
+
+	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
+	if (i == nb_pkts && pmd->cfg.pkt_buffer_size >= mbuf_size) {
+		buf_tmp = bufs;
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-		saved_slot = slot;
-		d0 = &ring->desc[slot & mask];
-		dst_off = 0;
-		dst_len = (type == MEMIF_RING_C2S) ?
-			pmd->run.pkt_buffer_size : d0->length;
+			saved_slot = slot;
 
-next_in_chain:
-		src_off = 0;
-		src_len = rte_pktmbuf_data_len(mbuf);
+next_in_chain1:
+			d0 = &ring->desc[slot & mask];
+			cp_len = rte_pktmbuf_data_len(mbuf);
 
-		while (src_len) {
-			if (dst_len == 0) {
+			rte_memcpy((uint8_t *)memif_get_buffer(proc_private, d0),
+				rte_pktmbuf_mtod(mbuf, void *), cp_len);
+
+			d0->length = cp_len;
+			mq->n_bytes += cp_len;
+			slot++;
+			n_free--;
+
+			if (--nb_segs > 0) {
 				if (n_free) {
-					slot++;
-					n_free--;
 					d0->flags |= MEMIF_DESC_FLAG_NEXT;
-					d0 = &ring->desc[slot & mask];
-					dst_off = 0;
-					dst_len = (type == MEMIF_RING_C2S) ?
-					    pmd->run.pkt_buffer_size : d0->length;
-					d0->flags = 0;
+					mbuf = mbuf->next;
+					goto next_in_chain1;
 				} else {
 					slot = saved_slot;
-					goto no_free_slots;
+					goto free_mbufs;
 				}
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
-							       d0) + dst_off,
-				rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
-				cp_len);
+			n_tx_pkts++;
+		}
+free_mbufs:
+		rte_pktmbuf_free_bulk(buf_tmp, n_tx_pkts);
+	} else {
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-			mq->n_bytes += cp_len;
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-			dst_len -= cp_len;
+			saved_slot = slot;
+			d0 = &ring->desc[slot & mask];
+			dst_off = 0;
+			dst_len = (type == MEMIF_RING_C2S) ?
+				pmd->run.pkt_buffer_size : d0->length;
 
-			d0->length = dst_off;
-		}
+next_in_chain2:
+			src_off = 0;
+			src_len = rte_pktmbuf_data_len(mbuf);
 
-		if (--nb_segs > 0) {
-			mbuf = mbuf->next;
-			goto next_in_chain;
-		}
+			while (src_len) {
+				if (dst_len == 0) {
+					if (n_free) {
+						slot++;
+						n_free--;
+						d0->flags |= MEMIF_DESC_FLAG_NEXT;
+						d0 = &ring->desc[slot & mask];
+						dst_off = 0;
+						dst_len = (type == MEMIF_RING_C2S) ?
+						    pmd->run.pkt_buffer_size : d0->length;
+						d0->flags = 0;
+					} else {
+						slot = saved_slot;
+						goto no_free_slots;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		n_tx_pkts++;
-		slot++;
-		n_free--;
-		rte_pktmbuf_free(mbuf_head);
+				rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
+								       d0) + dst_off,
+					rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
+					cp_len);
+
+				mq->n_bytes += cp_len;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+				dst_len -= cp_len;
+
+				d0->length = dst_off;
+			}
+
+			if (--nb_segs > 0) {
+				mbuf = mbuf->next;
+				goto next_in_chain2;
+			}
+
+			n_tx_pkts++;
+			slot++;
+			n_free--;
+			rte_pktmbuf_free(mbuf_head);
+		}
 	}
 
 no_free_slots:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/2] net/memif: add a Rx fast path
  2022-08-22  3:47         ` [PATCH v3 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-08-31 16:25           ` Stephen Hemminger
  2022-09-07  6:06             ` Joyce Kong
  0 siblings, 1 reply; 26+ messages in thread
From: Stephen Hemminger @ 2022-08-31 16:25 UTC (permalink / raw)
  To: Joyce Kong; +Cc: jgrajcia, huzaifa.rahman, dev, nd, mb, ruifeng.wang

On Mon, 22 Aug 2022 03:47:30 +0000
Joyce Kong <joyce.kong@arm.com> wrote:

> +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> +		struct rte_mbuf *mbufs[nb_pkts];
> +		ret = rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, nb_pkts);
> +			if (unlikely(ret < 0))
> +				goto no_free_bufs;
> +

The indentation looks off here, is this because of the diff?
Also, my preference is to use a blank line after the declaration.

One more thing, the use of a variable-length array on the stack will cause
the function to incur additional overhead if stack-protector-strong is
enabled.
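
A fixed-size array bounded by a compile-time constant avoids that, e.g.
(sketch only; MAX_PKT_BURST here is just a placeholder name):

        struct rte_mbuf *mbufs[MAX_PKT_BURST];  /* no VLA, constant frame size */
        uint16_t done = 0;

        while (done < nb_pkts) {
                uint16_t pkts = RTE_MIN((uint16_t)(nb_pkts - done),
                                        (uint16_t)MAX_PKT_BURST);

                if (unlikely(rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, pkts) < 0))
                        break;
                /* ... receive into mbufs[], free any unused tail, stop if the
                 * ring runs out of filled slots ... */
                done += pkts;
        }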


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v3 1/2] net/memif: add a Rx fast path
  2022-08-31 16:25           ` Stephen Hemminger
@ 2022-09-07  6:06             ` Joyce Kong
  0 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-09-07  6:06 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: jgrajcia, huzaifa.rahman, dev, nd, mb, Ruifeng Wang

Hi Stephen,

> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Thursday, September 1, 2022 12:26 AM
> To: Joyce Kong <Joyce.Kong@arm.com>
> Cc: jgrajcia@cisco.com; huzaifa.rahman@emumba.com; dev@dpdk.org; nd
> <nd@arm.com>; mb@smartsharesystems.com; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Subject: Re: [PATCH v3 1/2] net/memif: add a Rx fast path
> 
> On Mon, 22 Aug 2022 03:47:30 +0000
> Joyce Kong <joyce.kong@arm.com> wrote:
> 
> > +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> > +		struct rte_mbuf *mbufs[nb_pkts];
> > +		ret = rte_pktmbuf_alloc_bulk(mq->mempool, mbufs,
> nb_pkts);
> > +			if (unlikely(ret < 0))
> > +				goto no_free_bufs;
> > +
> 
> The indentation looks off here, is this because of diff?
> Also, my preference is to use blank line after declaration.
Will modify the format in next version.

> 
> One more thing, the use of variable length array on stack will cause the
> function to get additional overhead if stack-protector strong is enabled.
Will fix the array length in next version.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 0/2] add a fast path for memif Rx/Tx
  2022-04-12  9:32 [RFC] net/memif: add a fast path for Rx Joyce Kong
  2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
@ 2022-09-15  6:58 ` Joyce Kong
  2022-09-15  6:58   ` [PATCH v4 1/2] net/memif: add a Rx fast path Joyce Kong
                     ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Joyce Kong @ 2022-09-15  6:58 UTC (permalink / raw)
  To: jgrajcia, stephen; +Cc: dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copy. Add a
fast memcpy path by removing this branch with mbuf and memif
buffer size defined at compile time. For the Rx fast path, bulk
allocate mbufs to get additional speedup. For the Tx fast path,
bulk free mbufs which come from the same mempool.

When mbuf == memif buffer size, both Rx/Tx would choose the
fast memcpy path. When mbuf < memif buffer size, the Rx
chooses previous memcpy path while Tx chooses fast memcpy
path. When mbuf > memif buffer size, the Rx chooses fast
memcpy path while Tx chooses previous memcpy path.

Test with 1p1q on N1SDP AArch64 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    47.16%    |    24.67%    |    12.47%
---------------------------------------------------------
   zc gain  |    20.96%    |     9.16%    |    10.66%
---------------------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
---------------------------------------------------------
  buf size  | memif = mbuf | memif < mbuf | memif > mbuf
---------------------------------------------------------
non-zc gain |    23.52%    |    14.20%    |     5.10%
---------------------------------------------------------
   zc gain  |    17.49%    |    10.62%    |    12.42%
---------------------------------------------------------

v4:
 1.Fix incorrect indentation.
 2.Fix the mbuf array length to avoid additional overhead if
   stack-protector strong is enabled. <Stephen Hemminger>

v3:
 Add bulk allocation to get additional speedup for memif Rx
 fast path. <Stephen Hemminger>

v2:
 Rebase v1 and update commit message.

Joyce Kong (2):
  net/memif: add a Rx fast path
  net/memif: add a Tx fast path

 drivers/net/memif/rte_eth_memif.c | 280 +++++++++++++++++++++---------
 drivers/net/memif/rte_eth_memif.h |   2 +
 2 files changed, 199 insertions(+), 83 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 1/2] net/memif: add a Rx fast path
  2022-09-15  6:58 ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Joyce Kong
@ 2022-09-15  6:58   ` Joyce Kong
  2022-09-15  6:58   ` [PATCH v4 2/2] net/memif: add a Tx " Joyce Kong
  2022-09-22  9:12   ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Ferruh Yigit
  2 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-09-15  6:58 UTC (permalink / raw)
  To: jgrajcia, stephen; +Cc: dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Mbuf
and memif buffer size is defined at compile time. If memif
buf size <= mbuf size, add a fast Rx memory copy path by
removing this branch and adding mbuf bulk allocation.

The removal of the branch and bulk alloc lead to considerable
performance uplift.

Test with 1p1q on N1SDP AArch64 server,
--------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
--------------------------------------------
non-zc gain |     26.85%    |    -0.37%    |
--------------------------------------------
   zc gain  |      8.57%    |     3.04%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
--------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
--------------------------------------------
non-zc gain |     17.54%    |    -0.42%    |
--------------------------------------------
   zc gain  |     10.67%    |     0.26%    |
--------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 146 +++++++++++++++++++++---------
 drivers/net/memif/rte_eth_memif.h |   2 +
 2 files changed, 107 insertions(+), 41 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index dd951b8296..762293f636 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -297,7 +297,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		rte_eth_devices[mq->in_port].process_private;
 	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
 	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0;
-	uint16_t n_rx_pkts = 0;
+	uint16_t pkts, rx_pkts, n_rx_pkts = 0;
 	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mq->mempool) -
 		RTE_PKTMBUF_HEADROOM;
 	uint16_t src_len, src_off, dst_len, dst_off, cp_len;
@@ -342,66 +342,131 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		goto refill;
 	n_slots = last_slot - cur_slot;
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		struct rte_mbuf *mbufs[MAX_PKT_BURST];
+next_bulk:
+		ret = rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, MAX_PKT_BURST);
+		if (unlikely(ret < 0))
 			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
-		dst_off = 0;
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+		rx_pkts = 0;
+		pkts = nb_pkts < MAX_PKT_BURST ? nb_pkts : MAX_PKT_BURST;
+		while (n_slots && rx_pkts < pkts) {
+			mbuf_head = mbufs[n_rx_pkts];
+			mbuf = mbuf_head;
 
-		src_len = d0->length;
-		src_off = 0;
+next_slot1:
+			mbuf->port = mq->in_port;
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			cp_len = d0->length;
+
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
 
-				/* store pointer to tail */
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
-				if (unlikely(mbuf == NULL))
+				if (unlikely(mbuf == NULL)) {
+					rte_pktmbuf_free_bulk(mbufs + rx_pkts,
+							MAX_PKT_BURST - rx_pkts);
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
+				}
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
+					rte_pktmbuf_free_bulk(mbufs + rx_pkts,
+							MAX_PKT_BURST - rx_pkts);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			rx_pkts++;
+			n_rx_pkts++;
+		}
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+		if (rx_pkts < MAX_PKT_BURST) {
+			rte_pktmbuf_free_bulk(mbufs + rx_pkts, MAX_PKT_BURST - rx_pkts);
+		} else {
+			nb_pkts -= rx_pkts;
+			if (nb_pkts)
+				goto next_bulk;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-		cur_slot++;
-		n_slots--;
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
+
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +759,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index 81e7dceae0..09928ecc86 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -25,6 +25,8 @@
 #define ETH_MEMIF_DISC_STRING_SIZE		96
 #define ETH_MEMIF_SECRET_SIZE			24
 
+#define MAX_PKT_BURST				32
+
 extern int memif_logtype;
 
 #define MIF_LOG(level, fmt, args...) \
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v4 2/2] net/memif: add a Tx fast path
  2022-09-15  6:58 ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-09-15  6:58   ` [PATCH v4 1/2] net/memif: add a Rx fast path Joyce Kong
@ 2022-09-15  6:58   ` Joyce Kong
  2022-09-22  9:12   ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Ferruh Yigit
  2 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2022-09-15  6:58 UTC (permalink / raw)
  To: jgrajcia, stephen; +Cc: dev, nd, Joyce Kong

For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. If all
mbufs come from the same mempool and the memif buffer size >=
the mbuf size, add a fast Tx memory copy path without the
comparison branch and with mbuf bulk free; otherwise still run
the original Tx path.
The removal of the branch and bulk free lead to considerable
performance uplift.

Test with 1p1q on Ampere Altra AArch64 server,
--------------------------------------------
  buf size  | memif >= mbuf | memif < mbuf |
--------------------------------------------
non-zc gain |    10.82%     |     0.04%    |
--------------------------------------------
   zc gain  |     8.86%     |     3.18%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon X86 server,
--------------------------------------------
  buf size  | memif >= mbuf | memif < mbuf |
--------------------------------------------
non-zc gain |     7.32%     |    -0.85%    |
--------------------------------------------
   zc gain  |    12.75%     |    -0.16%    |
--------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 134 ++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 42 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 762293f636..b5bed955ed 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -680,62 +680,112 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
 	}
 
-	while (n_tx_pkts < nb_pkts && n_free) {
-		mbuf_head = *bufs++;
-		nb_segs = mbuf_head->nb_segs;
-		mbuf = mbuf_head;
+	uint8_t i;
+	struct rte_mbuf **buf_tmp = bufs;
+	mbuf_head = *buf_tmp++;
+	struct rte_mempool *mp = mbuf_head->pool;
+
+	for (i = 1; i < nb_pkts; i++) {
+		mbuf_head = *buf_tmp++;
+		if (mbuf_head->pool != mp)
+			break;
+	}
+
+	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
+	if (i == nb_pkts && pmd->cfg.pkt_buffer_size >= mbuf_size) {
+		buf_tmp = bufs;
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-		saved_slot = slot;
-		d0 = &ring->desc[slot & mask];
-		dst_off = 0;
-		dst_len = (type == MEMIF_RING_C2S) ?
-			pmd->run.pkt_buffer_size : d0->length;
+			saved_slot = slot;
 
-next_in_chain:
-		src_off = 0;
-		src_len = rte_pktmbuf_data_len(mbuf);
+next_in_chain1:
+			d0 = &ring->desc[slot & mask];
+			cp_len = rte_pktmbuf_data_len(mbuf);
 
-		while (src_len) {
-			if (dst_len == 0) {
+			rte_memcpy((uint8_t *)memif_get_buffer(proc_private, d0),
+				rte_pktmbuf_mtod(mbuf, void *), cp_len);
+
+			d0->length = cp_len;
+			mq->n_bytes += cp_len;
+			slot++;
+			n_free--;
+
+			if (--nb_segs > 0) {
 				if (n_free) {
-					slot++;
-					n_free--;
 					d0->flags |= MEMIF_DESC_FLAG_NEXT;
-					d0 = &ring->desc[slot & mask];
-					dst_off = 0;
-					dst_len = (type == MEMIF_RING_C2S) ?
-					    pmd->run.pkt_buffer_size : d0->length;
-					d0->flags = 0;
+					mbuf = mbuf->next;
+					goto next_in_chain1;
 				} else {
 					slot = saved_slot;
-					goto no_free_slots;
+					goto free_mbufs;
 				}
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
-							       d0) + dst_off,
-				rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
-				cp_len);
+			n_tx_pkts++;
+		}
+free_mbufs:
+		rte_pktmbuf_free_bulk(buf_tmp, n_tx_pkts);
+	} else {
+		while (n_tx_pkts < nb_pkts && n_free) {
+			mbuf_head = *bufs++;
+			nb_segs = mbuf_head->nb_segs;
+			mbuf = mbuf_head;
 
-			mq->n_bytes += cp_len;
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-			dst_len -= cp_len;
+			saved_slot = slot;
+			d0 = &ring->desc[slot & mask];
+			dst_off = 0;
+			dst_len = (type == MEMIF_RING_C2S) ?
+				pmd->run.pkt_buffer_size : d0->length;
 
-			d0->length = dst_off;
-		}
+next_in_chain2:
+			src_off = 0;
+			src_len = rte_pktmbuf_data_len(mbuf);
 
-		if (--nb_segs > 0) {
-			mbuf = mbuf->next;
-			goto next_in_chain;
-		}
+			while (src_len) {
+				if (dst_len == 0) {
+					if (n_free) {
+						slot++;
+						n_free--;
+						d0->flags |= MEMIF_DESC_FLAG_NEXT;
+						d0 = &ring->desc[slot & mask];
+						dst_off = 0;
+						dst_len = (type == MEMIF_RING_C2S) ?
+						    pmd->run.pkt_buffer_size : d0->length;
+						d0->flags = 0;
+					} else {
+						slot = saved_slot;
+						goto no_free_slots;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		n_tx_pkts++;
-		slot++;
-		n_free--;
-		rte_pktmbuf_free(mbuf_head);
+				rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
+								       d0) + dst_off,
+					rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
+					cp_len);
+
+				mq->n_bytes += cp_len;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+				dst_len -= cp_len;
+
+				d0->length = dst_off;
+			}
+
+			if (--nb_segs > 0) {
+				mbuf = mbuf->next;
+				goto next_in_chain2;
+			}
+
+			n_tx_pkts++;
+			slot++;
+			n_free--;
+			rte_pktmbuf_free(mbuf_head);
+		}
 	}
 
 no_free_slots:
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v4 0/2] add a fast path for memif Rx/Tx
  2022-09-15  6:58 ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Joyce Kong
  2022-09-15  6:58   ` [PATCH v4 1/2] net/memif: add a Rx fast path Joyce Kong
  2022-09-15  6:58   ` [PATCH v4 2/2] net/memif: add a Tx " Joyce Kong
@ 2022-09-22  9:12   ` Ferruh Yigit
  2022-12-09 13:59     ` Ferruh Yigit
  2 siblings, 1 reply; 26+ messages in thread
From: Ferruh Yigit @ 2022-09-22  9:12 UTC (permalink / raw)
  To: Joyce Kong, jgrajcia, stephen; +Cc: dev, nd, Ruifeng Wang

On 9/15/2022 7:58 AM, Joyce Kong wrote:
> For memif non-zero-copy mode, there is a branch to compare
> the mbuf and memif buffer size during memory copy. Add a
> fast memcpy path by removing this branch with mbuf and memif
> buffer size defined at compile time. For Rx fast path, bulk
> allocating mbufs to get additional speedup. For Tx fast path,
> bulk free mbufs which come from the same mempool.
> 
> When mbuf == memif buffer size, both Rx/Tx would choose the
> fast memcpy path. When mbuf < memif buffer size, the Rx
> chooses previous memcpy path while Tx chooses fast memcpy
> path. When mbuf > memif buffer size, the Rx chooses fast
> memcpy path while Tx chooses previous memcpy path.
> 
> Test with 1p1q on N1SDP AArch64 server,
> ---------------------------------------------------------
>    buf size  | memif = mbuf | memif < mbuf | memif > mbuf
> ---------------------------------------------------------
> non-zc gain |    47.16%    |    24.67%    |    12.47%
> ---------------------------------------------------------
>     zc gain  |    20.96%    |     9.16%    |    10.66%
> ---------------------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86 server,
> ---------------------------------------------------------
>    buf size  | memif = mbuf | memif < mbuf | memif > mbuf
> ---------------------------------------------------------
> non-zc gain |    23.52%    |    14.20%    |     5.10%
> ---------------------------------------------------------
>     zc gain  |    17.49%    |    10.62%    |    12.42%
> ---------------------------------------------------------
> 
> v4:
>   1.Fix incorrect indentation.
>   2.Fix the mbuf array length to avoid additional overhead if
>     stack-protector strong is enabled. <Stephen Hemminger>
> 
> v3:
>   Add bulk allocation to get additional speedup for memif Rx
>   fast path. <Stephen Hemminger>
> 
> v2:
>   Rebase v1 and update commit message.
> 
> Joyce Kong (2):
>    net/memif: add a Rx fast path
>    net/memif: add a Tx fast path
> 

Hi Jakub,

Reminder of this set waiting for your review.

Thanks,
ferruh


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v4 0/2] add a fast path for memif Rx/Tx
  2022-09-22  9:12   ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Ferruh Yigit
@ 2022-12-09 13:59     ` Ferruh Yigit
  0 siblings, 0 replies; 26+ messages in thread
From: Ferruh Yigit @ 2022-12-09 13:59 UTC (permalink / raw)
  To: Joyce Kong, jgrajcia, stephen; +Cc: dev, nd, Ruifeng Wang

On 9/22/2022 10:12 AM, Ferruh Yigit wrote:
> On 9/15/2022 7:58 AM, Joyce Kong wrote:
>> For memif non-zero-copy mode, there is a branch to compare
>> the mbuf and memif buffer size during memory copy. Add a
>> fast memcpy path by removing this branch with mbuf and memif
>> buffer size defined at compile time. For Rx fast path, bulk
>> allocating mbufs to get additional speedup. For Tx fast path,
>> bulk free mbufs which come from the same mempool.
>>
>> When mbuf == memif buffer size, both Rx/Tx would choose the
>> fast memcpy path. When mbuf < memif buffer size, the Rx
>> chooses previous memcpy path while Tx chooses fast memcpy
>> path. When mbuf > memif buffer size, the Rx chooses fast
>> memcpy path while Tx chooses previous memcpy path.
>>
>> Test with 1p1q on N1SDP AArch64 server,
>> ---------------------------------------------------------
>>    buf size  | memif = mbuf | memif < mbuf | memif > mbuf
>> ---------------------------------------------------------
>> non-zc gain |    47.16%    |    24.67%    |    12.47%
>> ---------------------------------------------------------
>>     zc gain  |    20.96%    |     9.16%    |    10.66%
>> ---------------------------------------------------------
>>
>> Test with 1p1q on Cascade Lake Xeon X86 server,
>> ---------------------------------------------------------
>>    buf size  | memif = mbuf | memif < mbuf | memif > mbuf
>> ---------------------------------------------------------
>> non-zc gain |    23.52%    |    14.20%    |     5.10%
>> ---------------------------------------------------------
>>     zc gain  |    17.49%    |    10.62%    |    12.42%
>> ---------------------------------------------------------
>>
>> v4:
>>   1.Fix incorrect indentation.
>>   2.Fix the mbuf array length to avoid additional overhead if
>>     stack-protector strong is enabled. <Stephen Hemminger>
>>
>> v3:
>>   Add bulk allocation to get additional speedup for memif Rx
>>   fast path. <Stephen Hemminger>
>>
>> v2:
>>   Rebase v1 and update commit message.
>>
>> Joyce Kong (2):
>>    net/memif: add a Rx fast path
>>    net/memif: add a Tx fast path
>>
> 
> Hi Jakub,
> 
> Reminder of this set waiting for your review.
> 

No objection received on the patch, and I can reproduce the performance
improvement.
Taking into account that we are at the beginning of the release cycle
and will have time to address any possible issue later, I will proceed
with the set.

Series applied to dpdk-next-net/main, thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2022-12-09 13:59 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-12  9:32 [RFC] net/memif: add a fast path for Rx Joyce Kong
2022-05-17 10:51 ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Joyce Kong
2022-05-17 10:51   ` [PATCH v1 1/2] net/memif: add a Rx fast path Joyce Kong
2022-05-18 16:53     ` Ferruh Yigit
2022-05-19  7:00       ` Joyce Kong
2022-05-19  8:44         ` Joyce Kong
2022-05-18 17:06     ` Ferruh Yigit
2022-05-19 15:09       ` Joyce Kong
2022-05-19 16:38         ` Ferruh Yigit
2022-05-17 10:51   ` [PATCH v1 2/2] net/memif: add a Tx " Joyce Kong
2022-05-17 13:59   ` [PATCH v1 0/2] add a fast path for memif Rx/Tx Morten Brørup
2022-05-18  2:48   ` Ruifeng Wang
2022-07-01 10:28   ` [PATCH v2 " Joyce Kong
2022-07-01 10:28     ` [PATCH v2 1/2] net/memif: add a Rx fast path Joyce Kong
2022-07-01 16:51       ` Stephen Hemminger
2022-08-22  3:47       ` [PATCH v3 0/2] add a fast path for memif Rx/Tx Joyce Kong
2022-08-22  3:47         ` [PATCH v3 1/2] net/memif: add a Rx fast path Joyce Kong
2022-08-31 16:25           ` Stephen Hemminger
2022-09-07  6:06             ` Joyce Kong
2022-08-22  3:47         ` [PATCH v3 2/2] net/memif: add a Tx " Joyce Kong
2022-07-01 10:28     ` [PATCH v2 " Joyce Kong
2022-09-15  6:58 ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Joyce Kong
2022-09-15  6:58   ` [PATCH v4 1/2] net/memif: add a Rx fast path Joyce Kong
2022-09-15  6:58   ` [PATCH v4 2/2] net/memif: add a Tx " Joyce Kong
2022-09-22  9:12   ` [PATCH v4 0/2] add a fast path for memif Rx/Tx Ferruh Yigit
2022-12-09 13:59     ` Ferruh Yigit
