All of lore.kernel.org
 help / color / mirror / Atom feed
From: <pbhagavatula@marvell.com>
To: <jerinj@marvell.com>, Pavan Nikhilesh <pbhagavatula@marvell.com>,
	"Shijith Thotton" <sthotton@marvell.com>,
	Nithin Dabilpuram <ndabilpuram@marvell.com>,
	Kiran Kumar K <kirankumark@marvell.com>,
	Sunil Kumar Kori <skori@marvell.com>,
	Satha Rao <skoteshwar@marvell.com>
Cc: <dev@dpdk.org>
Subject: [PATCH v3 3/3] net/cnxk: improve Rx performance
Date: Thu, 10 Feb 2022 15:49:40 +0530	[thread overview]
Message-ID: <20220210101940.1669-3-pbhagavatula@marvell.com> (raw)
In-Reply-To: <20220210101940.1669-1-pbhagavatula@marvell.com>

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve vWQE and CQ Rx performance by tuning perfetches to 64B
cacheline size.
Also, prefetch the vWQE array offsets at cacheline boundaries.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h | 25 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx.h       |  8 ++++----
 drivers/net/cnxk/cn9k_rx.h        | 20 ++++++++++----------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ada230ea1d..cfe729cef9 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,11 +118,17 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint8_t loff = 0;
 	uint64_t sa_base;
 	uint64_t **wqe;
+	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
 	wqe = vec->u64s;
 
+	rte_prefetch_non_temporal(&vec->ptrs[0]);
+#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
+	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
+		rte_prefetch_non_temporal(&vec->ptrs[i]);
+
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
 	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
@@ -191,15 +197,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t u64[2];
 	} gw;
 	uint64_t tstamp_ptr;
-	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
-		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		"sub %[mbuf], %H[wdata], #0x80				\n"
-		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
+		"caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+		: [wdata] "+r"(gw.get_work)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -208,14 +212,12 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
-	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	ws->gw_rdata = gw.u64[0];
-	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
-		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
-		    (gw.u64[0] & 0xffffffff);
-
-	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+	if (gw.u64[1]) {
+		gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+			    (gw.u64[0] & (0x3FFull << 36)) << 4 |
+			    (gw.u64[0] & 0xffffffff);
 		if ((flags & CPT_RX_WQE_F) &&
 		    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 		     RTE_EVENT_TYPE_CRYPTODEV)) {
@@ -223,7 +225,10 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 			   RTE_EVENT_TYPE_ETHDEV) {
 			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			uint64_t mbuf;
 
+			mbuf = gw.u64[1] - sizeof(struct rte_mbuf);
+			rte_prefetch0((void *)mbuf);
 			if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 				struct rte_mbuf *m;
 				uintptr_t sa_base;
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 8b00fcc660..564e50f0af 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -610,10 +610,10 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}
 
 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
 
 		/* Get NIX_RX_SG_S for size and buffer pointer */
 		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 1178f95317..d36f292c95 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -388,16 +388,16 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags =
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
 
-	mbuf->pkt_len = len;
-	mbuf->data_len = len;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
-
 	mbuf->ol_flags = ol_flags;
+	*(uint64_t *)(&mbuf->rearm_data) = val;
+	mbuf->pkt_len = len;
 
-	if (flag & NIX_RX_MULTI_SEG_F)
+	if (flag & NIX_RX_MULTI_SEG_F) {
 		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
-	else
+	} else {
+		mbuf->data_len = len;
 		mbuf->next = NULL;
+	}
 }
 
 static inline uint16_t
@@ -769,10 +769,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
-		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
-
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
@@ -797,6 +793,10 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			mbuf3->next = NULL;
 		}
 
+		/* Store the mbufs to rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
+		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
-- 
2.17.1


  parent reply	other threads:[~2022-02-10 10:20 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
2022-01-19  7:13 ` [PATCH v2 2/4] event/cnxk: store and reuse workslot status pbhagavatula
2022-01-19  7:13 ` [PATCH v2 3/4] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-01-19  7:13 ` [PATCH v2 4/4] net/cnxk: improve Rx performance pbhagavatula
2022-02-07 14:03 ` [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue Jerin Jacob
2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
2022-02-10 10:19   ` Jerin Jacob
2022-02-10 13:15   ` [PATCH v4] " pbhagavatula
2022-02-11 10:27     ` Jerin Jacob
2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
2022-02-10 10:19   ` [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-02-10 10:19   ` pbhagavatula [this message]
2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
2022-02-10 13:20     ` [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-02-10 13:20     ` [PATCH v4 3/3] event/cnxk: improve Rx performance pbhagavatula
2022-02-14  9:29     ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status Jerin Jacob

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220210101940.1669-3-pbhagavatula@marvell.com \
    --to=pbhagavatula@marvell.com \
    --cc=dev@dpdk.org \
    --cc=jerinj@marvell.com \
    --cc=kirankumark@marvell.com \
    --cc=ndabilpuram@marvell.com \
    --cc=skori@marvell.com \
    --cc=skoteshwar@marvell.com \
    --cc=sthotton@marvell.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.