From: Chuck Lever <chuck.lever@oracle.com>
To: anna.schumaker@netapp.com
Cc: linux-rdma@vger.kernel.org, linux-nfs@vger.kernel.org
Subject: [PATCH 8/9] xprtrdma: RPC completion should wait for Send completion
Date: Fri, 20 Oct 2017 10:48:36 -0400	[thread overview]
Message-ID: <20171020144836.14869.58073.stgit@manet.1015granger.net> (raw)
In-Reply-To: <20171020143635.14869.15714.stgit@manet.1015granger.net>

When an RPC Call includes a file data payload, that payload can come
from pages in the page cache, or a user buffer (for direct I/O).

If the payload can fit inline, xprtrdma includes it in the Send
using a scatter-gather technique. xprtrdma mustn't allow the RPC
consumer to re-use the memory where that payload resides before the
Send completes. Otherwise, the new contents of that memory would be
exposed by an HCA retransmit of the Send operation.
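
As a rough sketch (names here are hypothetical; only the ib_* calls
and types are the real verbs API), the scatter-gather technique
hangs the payload off the same Send work request as the transport
header:

/* Illustrative only, not the xprtrdma code: one Send WR carrying
 * the RPC-over-RDMA header and an inline payload as separate SGEs.
 */
struct ib_sge sges[2];
struct ib_send_wr send_wr;
struct ib_send_wr *bad_wr;

sges[0].addr   = hdr_dma_addr;     /* DMA address of the header */
sges[0].length = hdr_len;
sges[0].lkey   = lkey;

sges[1].addr   = payload_dma_addr; /* DMA-mapped payload page */
sges[1].length = payload_len;
sges[1].lkey   = lkey;

memset(&send_wr, 0, sizeof(send_wr));
send_wr.opcode  = IB_WR_SEND;
send_wr.sg_list = sges;
send_wr.num_sge = 2;

/* Until this Send completes, the HCA may re-read both SGEs (for
 * example, on retransmit), so neither buffer may be reused yet.
 */
ib_post_send(qp, &send_wr, &bad_wr);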

So, block RPC completion on Send completion, but only in the case
where a separate file data payload is part of the Send. This
prevents the reuse of that memory while it is still part of a Send
operation, without imposing an undue cost on other cases.

Waiting is avoided in the common case because typically the Send
will have completed long before the RPC Reply arrives.
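
The handshake itself is the stock kernel bit-wait pattern; roughly
(flag and variable names hypothetical, the APIs are the stock
kernel ones):

/* Waker, on Send completion: clear the flag, then wake. The
 * barrier pairs with the barrier implied on the waiter side so
 * the cleared bit is visible before the wake-up. */
if (test_and_clear_bit(F_TX_RESOURCES, &req_flags)) {
	smp_mb__after_atomic();
	wake_up_bit(&req_flags, F_TX_RESOURCES);
}

/* Waiter, before completing the RPC: sleep until the flag clears. */
wait_on_bit(&req_flags, F_TX_RESOURCES, TASK_UNINTERRUPTIBLE);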

These days, an RPC timeout will trigger a disconnect, which tears
down the QP. The disconnect flushes all waiting Sends. This bounds
the amount of time the reply handler has to wait for a Send
completion.
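
Flushed Sends still generate completions, so the waiter is always
woken eventually. A hypothetical completion handler (illustrative
names; ib_wc and ib_cqe are the real types) treats success and
flush the same way:

static void example_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
	struct example_sendctx *sc =
		container_of(wc->wr_cqe, struct example_sendctx, sc_cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_debug("Send flushed: status %d\n", wc->status);

	/* unmap SGEs, clear the flag, wake the waiter */
	example_unmap_and_wake(sc);
}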

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |   26 +++++++++++++++++++++++++-
 net/sunrpc/xprtrdma/transport.c |    5 +++--
 net/sunrpc/xprtrdma/verbs.c     |    3 ++-
 net/sunrpc/xprtrdma/xprt_rdma.h |    4 ++++
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 853dede..4fdeaac 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -534,6 +534,11 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 	for (count = sc->sc_unmap_count; count; ++sge, --count)
 		ib_dma_unmap_page(ia->ri_device,
 				  sge->addr, sge->length, DMA_TO_DEVICE);
+
+	if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) {
+		smp_mb__after_atomic();
+		wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
+	}
 }
 
 /* Prepare an SGE for the RPC-over-RDMA transport header.
@@ -667,6 +672,8 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 
 out:
 	sc->sc_wr.num_sge += sge_no;
+	if (sc->sc_unmap_count)
+		__set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
 	return true;
 
 out_regbuf:
@@ -704,6 +711,8 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 		return -ENOBUFS;
 	req->rl_sendctx->sc_wr.num_sge = 0;
 	req->rl_sendctx->sc_unmap_count = 0;
+	req->rl_sendctx->sc_req = req;
+	__clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
 
 	if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
 		return -EIO;
@@ -1305,6 +1314,20 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	if (!list_empty(&req->rl_registered))
 		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
 						    &req->rl_registered);
+
+	/* Ensure that any DMA mapped pages associated with
+	 * the Send of the RPC Call have been unmapped before
+	 * allowing the RPC to complete. This protects argument
+	 * memory not controlled by the RPC client from being
+	 * re-used before we're done with it.
+	 */
+	if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
+		r_xprt->rx_stats.reply_waits_for_send++;
+		out_of_line_wait_on_bit(&req->rl_flags,
+					RPCRDMA_REQ_F_TX_RESOURCES,
+					bit_wait,
+					TASK_UNINTERRUPTIBLE);
+	}
 }
 
 /* Reply handling runs in the poll worker thread. Anything that
@@ -1384,7 +1407,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
 		__func__, rep, req, be32_to_cpu(rep->rr_xid));
 
-	if (list_empty(&req->rl_registered))
+	if (list_empty(&req->rl_registered) &&
+	    !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags))
 		rpcrdma_complete_rqst(rep);
 	else
 		queue_work(rpcrdma_receive_wq, &rep->rr_work);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 0c73439..dcc2df0 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -792,12 +792,13 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 		   r_xprt->rx_stats.failed_marshal_count,
 		   r_xprt->rx_stats.bad_reply_count,
 		   r_xprt->rx_stats.nomsg_call_count);
-	seq_printf(seq, "%lu %lu %lu %lu %lu\n",
+	seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
 		   r_xprt->rx_stats.mrs_recovered,
 		   r_xprt->rx_stats.mrs_orphaned,
 		   r_xprt->rx_stats.mrs_allocated,
 		   r_xprt->rx_stats.local_inv_needed,
-		   r_xprt->rx_stats.empty_sendctx_q);
+		   r_xprt->rx_stats.empty_sendctx_q,
+		   r_xprt->rx_stats.reply_waits_for_send);
 }
 
 static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index bab63ad..9a824fe 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1526,7 +1526,8 @@ struct rpcrdma_regbuf *
 	dprintk("RPC:       %s: posting %d s/g entries\n",
 		__func__, send_wr->num_sge);
 
-	if (!ep->rep_send_count) {
+	if (!ep->rep_send_count ||
+	    test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
 		send_wr->send_flags |= IB_SEND_SIGNALED;
 		ep->rep_send_count = ep->rep_send_batch;
 	} else {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c260475..bccd5d8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -236,11 +236,13 @@ struct rpcrdma_rep {
 
 /* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
  */
+struct rpcrdma_req;
 struct rpcrdma_xprt;
 struct rpcrdma_sendctx {
 	struct ib_send_wr	sc_wr;
 	struct ib_cqe		sc_cqe;
 	struct rpcrdma_xprt	*sc_xprt;
+	struct rpcrdma_req	*sc_req;
 	unsigned int		sc_unmap_count;
 	struct ib_sge		sc_sges[];
 };
@@ -387,6 +389,7 @@ struct rpcrdma_req {
 enum {
 	RPCRDMA_REQ_F_BACKCHANNEL = 0,
 	RPCRDMA_REQ_F_PENDING,
+	RPCRDMA_REQ_F_TX_RESOURCES,
 };
 
 static inline void
@@ -492,6 +495,7 @@ struct rpcrdma_stats {
 	/* accessed when receiving a reply */
 	unsigned long long	total_rdma_reply;
 	unsigned long long	fixup_copy_count;
+	unsigned long		reply_waits_for_send;
 	unsigned long		local_inv_needed;
 	unsigned long		nomsg_call_count;
 	unsigned long		bcall_count;


Thread overview: 20+ messages

2017-10-20 14:47 [PATCH 0/9] More NFS/RDMA client patches for v4.15 Chuck Lever
2017-10-20 14:47 ` [PATCH 1/9] xprtrdma: Clean up SGE accounting in rpcrdma_prepare_msg_sges() Chuck Lever
2017-10-20 14:47 ` [PATCH 2/9] xprtrdma: Fix error handling in rpcrdma_prepare_msg_sges() Chuck Lever
2017-10-20 14:47 ` [PATCH 3/9] xprtrdma: Change return value of rpcrdma_prepare_send_sges() Chuck Lever
2017-10-20 14:48 ` [PATCH 4/9] xprtrdma: "Unoptimize" rpcrdma_prepare_hdr_sge() Chuck Lever
2017-10-20 14:48 ` [PATCH 5/9] xprtrdma: Add data structure to manage RDMA Send arguments Chuck Lever
2017-10-20 14:48 ` [PATCH 6/9] xprtrdma: Add a field of bit flags to struct rpcrdma_req Chuck Lever
2017-10-20 14:48 ` [PATCH 7/9] xprtrdma: Refactor rpcrdma_deferred_completion Chuck Lever
2017-10-20 14:48 ` [PATCH 8/9] xprtrdma: RPC completion should wait for Send completion Chuck Lever [this message]
2017-10-20 14:48 ` [PATCH 9/9] xprtrdma: Remove atomic send completion counting Chuck Lever
