From: Chuck Lever <cel@kernel.org>
To: linux-nfs@vger.kernel.org, linux-rdma@vger.kernel.org
Subject: [PATCH v2 08/12] svcrdma: Post Send WR chain
Date: Sun, 04 Feb 2024 18:17:22 -0500
Message-ID: <170708864233.28128.8304695511753070556.stgit@bazille.1015granger.net>
In-Reply-To: <170708844422.28128.2979813721958631192.stgit@bazille.1015granger.net>

From: Chuck Lever <chuck.lever@oracle.com>

Eventually I'd like the server to post the reply's Send WR along
with any Write WRs using only a single call to ib_post_send(), in
order to reduce the NIC's doorbell rate.

To do this, add an anchor for a WR chain to svc_rdma_send_ctxt, and
refactor svc_rdma_send() to post this WR chain to the Send Queue. For
the moment, the posted chain will continue to contain a single Send
WR.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h            |    6 ++-
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c |    2 +
 net/sunrpc/xprtrdma/svc_rdma_sendto.c      |   49 +++++++++++++++++++---------
 3 files changed, 38 insertions(+), 19 deletions(-)
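
The mechanism the commit message relies on: ib_send_wr structures are
linked through their next fields, and ib_post_send() takes the head of
such a list, so a chain of Write WRs ending in a Send WR can be handed
to the provider with a single doorbell. A minimal sketch of the
pattern, using a hypothetical helper name (not part of this patch):

#include <rdma/ib_verbs.h>

/* Hypothetical sketch: post a list of Write WRs and a final Send WR
 * with one ib_post_send() call. @write_wrs may be NULL.
 */
static int example_post_chain(struct ib_qp *qp,
			      struct ib_send_wr *write_wrs,
			      struct ib_send_wr *send_wr)
{
	const struct ib_send_wr *bad_wr;
	struct ib_send_wr *tail = write_wrs;

	send_wr->next = NULL;
	if (!tail)
		return ib_post_send(qp, send_wr, NULL);

	/* Append the Send WR so the chain ends with it. */
	while (tail->next)
		tail = tail->next;
	tail->next = send_wr;

	/* One doorbell for the whole chain. On failure, @bad_wr
	 * points to the first WR that was not posted.
	 */
	return ib_post_send(qp, write_wrs, &bad_wr);
}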

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index e7595ae62fe2..ee05087d6499 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -210,6 +210,8 @@ struct svc_rdma_send_ctxt {
 
 	struct svcxprt_rdma	*sc_rdma;
 	struct ib_send_wr	sc_send_wr;
+	struct ib_send_wr	*sc_wr_chain;
+	int			sc_sqecount;
 	struct ib_cqe		sc_cqe;
 	struct xdr_buf		sc_hdrbuf;
 	struct xdr_stream	sc_stream;
@@ -258,8 +260,8 @@ extern struct svc_rdma_send_ctxt *
 		svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
 extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
 				   struct svc_rdma_send_ctxt *ctxt);
-extern int svc_rdma_send(struct svcxprt_rdma *rdma,
-			 struct svc_rdma_send_ctxt *ctxt);
+extern int svc_rdma_post_send(struct svcxprt_rdma *rdma,
+			      struct svc_rdma_send_ctxt *ctxt);
 extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 				  struct svc_rdma_send_ctxt *sctxt,
 				  const struct svc_rdma_pcl *write_pcl,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index c9be6778643b..e5a78b761012 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -90,7 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
 	 */
 	get_page(virt_to_page(rqst->rq_buffer));
 	sctxt->sc_send_wr.opcode = IB_WR_SEND;
-	return svc_rdma_send(rdma, sctxt);
+	return svc_rdma_post_send(rdma, sctxt);
 }
 
 /* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 0ee9185f5f3f..0f02fb09d5b0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -208,6 +208,9 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
 	ctxt->sc_send_wr.num_sge = 0;
 	ctxt->sc_cur_sge_no = 0;
 	ctxt->sc_page_count = 0;
+	ctxt->sc_wr_chain = &ctxt->sc_send_wr;
+	ctxt->sc_sqecount = 1;
+
 	return ctxt;
 
 out_empty:
@@ -293,7 +296,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 	struct svc_rdma_send_ctxt *ctxt =
 		container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
 
-	svc_rdma_wake_send_waiters(rdma, 1);
+	svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount);
 
 	if (unlikely(wc->status != IB_WC_SUCCESS))
 		goto flushed;
@@ -312,36 +315,44 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 }
 
 /**
- * svc_rdma_send - Post a single Send WR
- * @rdma: transport on which to post the WR
- * @ctxt: send ctxt with a Send WR ready to post
+ * svc_rdma_post_send - Post a WR chain to the Send Queue
+ * @rdma: transport context
+ * @ctxt: WR chain to post
  *
  * Copy fields in @ctxt to stack variables in order to guarantee
  * that these values remain available after the ib_post_send() call.
  * In some error flow cases, svc_rdma_wc_send() releases @ctxt.
  *
+ * Note there is potential for starvation when the Send Queue is
+ * full because there is no order to when waiting threads are
+ * awoken. The transport is typically provisioned with a deep
+ * enough Send Queue that SQ exhaustion should be a rare event.
+ *
  * Return values:
  *   %0: @ctxt's WR chain was posted successfully
  *   %-ENOTCONN: The connection was lost
  */
-int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
+int svc_rdma_post_send(struct svcxprt_rdma *rdma,
+		       struct svc_rdma_send_ctxt *ctxt)
 {
-	struct ib_send_wr *wr = &ctxt->sc_send_wr;
+	struct ib_send_wr *first_wr = ctxt->sc_wr_chain;
+	struct ib_send_wr *send_wr = &ctxt->sc_send_wr;
+	const struct ib_send_wr *bad_wr = first_wr;
 	struct rpc_rdma_cid cid = ctxt->sc_cid;
-	int ret;
+	int ret, sqecount = ctxt->sc_sqecount;
 
 	might_sleep();
 
 	/* Sync the transport header buffer */
 	ib_dma_sync_single_for_device(rdma->sc_pd->device,
-				      wr->sg_list[0].addr,
-				      wr->sg_list[0].length,
+				      send_wr->sg_list[0].addr,
+				      send_wr->sg_list[0].length,
 				      DMA_TO_DEVICE);
 
 	/* If the SQ is full, wait until an SQ entry is available */
 	while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) {
-		if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
-			svc_rdma_wake_send_waiters(rdma, 1);
+		if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
+			svc_rdma_wake_send_waiters(rdma, sqecount);
 
 			/* When the transport is torn down, assume
 			 * ib_drain_sq() will trigger enough Send
@@ -358,12 +369,18 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
 		}
 
 		trace_svcrdma_post_send(ctxt);
-		ret = ib_post_send(rdma->sc_qp, wr, NULL);
+		ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
 		if (ret) {
 			trace_svcrdma_sq_post_err(rdma, &cid, ret);
 			svc_xprt_deferred_close(&rdma->sc_xprt);
-			svc_rdma_wake_send_waiters(rdma, 1);
-			break;
+
+			/* If even one WR was posted, there will be a
+			 * Send completion that bumps sc_sq_avail.
+			 */
+			if (bad_wr == first_wr) {
+				svc_rdma_wake_send_waiters(rdma, sqecount);
+				break;
+			}
 		}
 		return 0;
 	}
@@ -884,7 +901,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
 		sctxt->sc_send_wr.opcode = IB_WR_SEND;
 	}
 
-	return svc_rdma_send(rdma, sctxt);
+	return svc_rdma_post_send(rdma, sctxt);
 }
 
 /**
@@ -948,7 +965,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
 	sctxt->sc_send_wr.num_sge = 1;
 	sctxt->sc_send_wr.opcode = IB_WR_SEND;
 	sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
-	if (svc_rdma_send(rdma, sctxt))
+	if (svc_rdma_post_send(rdma, sctxt))
 		goto put_ctxt;
 	return;
 
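With sc_wr_chain and sc_sqecount in place, later patches in this
series only need to link additional WRs ahead of the Send WR before
calling svc_rdma_post_send(); the accounting works out because
sc_sqecount is what gets subtracted from sc_sq_avail before posting
and what svc_rdma_wc_send() credits back on completion. Roughly, as
an illustration only (write_wr here is a hypothetical, fully built
Write WR, not code from this series):

	/* Illustrative sketch: make a Write WR part of the Reply's
	 * Send WR chain so both post with one doorbell.
	 */
	write_wr->next = sctxt->sc_wr_chain;
	sctxt->sc_wr_chain = write_wr;
	sctxt->sc_sqecount += 1;	/* chain consumes one more SQE */

	ret = svc_rdma_post_send(rdma, sctxt);

Assuming the chained Write WRs are unsignaled, the single Send
completion retires all of the chain's SQ entries at once, which is
why svc_rdma_wc_send() above now wakes ctxt->sc_sqecount waiters
instead of exactly one.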

Thread overview: 13+ messages
2024-02-04 23:16 [PATCH v2 00/12] NFSD RDMA transport improvements Chuck Lever
2024-02-04 23:16 ` [PATCH v2 01/12] svcrdma: Reserve an extra WQE for ib_drain_rq() Chuck Lever
2024-02-04 23:16 ` [PATCH v2 02/12] svcrdma: Report CQ depths in debugging output Chuck Lever
2024-02-04 23:16 ` [PATCH v2 03/12] svcrdma: Update max_send_sges after QP is created Chuck Lever
2024-02-04 23:16 ` [PATCH v2 04/12] svcrdma: Increase the per-transport rw_ctx count Chuck Lever
2024-02-04 23:17 ` [PATCH v2 05/12] svcrdma: Fix SQ wake-ups Chuck Lever
2024-02-04 23:17 ` [PATCH v2 06/12] svcrdma: Prevent a UAF in svc_rdma_send() Chuck Lever
2024-02-04 23:17 ` [PATCH v2 07/12] svcrdma: Fix retry loop in svc_rdma_send() Chuck Lever
2024-02-04 23:17 ` [PATCH v2 08/12] svcrdma: Post Send WR chain Chuck Lever [this message]
2024-02-04 23:17 ` [PATCH v2 09/12] svcrdma: Move write_info for Reply chunks into struct svc_rdma_send_ctxt Chuck Lever
2024-02-04 23:17 ` [PATCH v2 10/12] svcrdma: Post the Reply chunk and Send WR together Chuck Lever
2024-02-04 23:17 ` [PATCH v2 11/12] svcrdma: Post WRs for Write chunks in svc_rdma_sendto() Chuck Lever
2024-02-04 23:17 ` [PATCH v2 12/12] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chuck Lever
