linux-rdma.vger.kernel.org archive mirror
* [PATCH v3 0/2] Fix write pad
@ 2021-10-05 14:17 Chuck Lever
  2021-10-05 14:17 ` [PATCH v3 1/2] xprtrdma: Provide a buffer to pad Write chunks of unaligned length Chuck Lever
  2021-10-05 14:18 ` [PATCH v3 2/2] xprtrdma: Remove rpcrdma_ep::re_implicit_roundup Chuck Lever
  0 siblings, 2 replies; 4+ messages in thread
From: Chuck Lever @ 2021-10-05 14:17 UTC (permalink / raw)
  To: anna.schumaker, trondmy; +Cc: linux-nfs, linux-rdma

Hi Anna-

Can you include this series in your next pull request to Trond?

Trond, 

This series addresses your concern that xprtrdma in some cases uses
the rq_rcv_buf.tail vector to receive an XDR pad. It has been tested
with a legacy Solaris server, in addition to the usual tests I run.

We haven't yet confirmed whether this series also addresses the
GPU-Direct issue, but since that is an out-of-tree user that depends
on unsupported kernel facilities, I regard such confirmation as "nice
to have" rather than mandatory. Based on our understanding of the
issue, we are confident this change will address it.
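
For readers following along, here is a simplified view of the receive
buffer involved. The real definition of struct xdr_buf lives in
include/linux/sunrpc/xdr.h; this trimmed copy is illustrative only
(struct kvec comes from <linux/uio.h>):

	/* Before this series, the 1-3 XDR pad bytes that follow an
	 * unaligned Write chunk payload could be directed into
	 * tail[0] as an extra RDMA segment.
	 */
	struct xdr_buf_view {			/* illustrative subset */
		struct kvec	head[1];	/* RPC header, leading data */
		struct page	**pages;	/* bulk payload (Write chunk) */
		unsigned int	page_len;	/* payload length in pages */
		struct kvec	tail[1];	/* trailing XDR items, and
						 * formerly the Write pad */
	};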

Changes since v2:
- Address more review comments from Tom Talpey

---

Chuck Lever (2):
      xprtrdma: Provide a buffer to pad Write chunks of unaligned length
      xprtrdma: Remove rpcrdma_ep::re_implicit_roundup


 include/trace/events/rpcrdma.h  | 13 +++++++++---
 net/sunrpc/xprtrdma/frwr_ops.c  | 35 +++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/rpc_rdma.c  | 23 +++++++++++++---------
 net/sunrpc/xprtrdma/verbs.c     |  3 +--
 net/sunrpc/xprtrdma/xprt_rdma.h |  6 +++++-
 5 files changed, 65 insertions(+), 15 deletions(-)

--
Chuck Lever



* [PATCH v3 1/2] xprtrdma: Provide a buffer to pad Write chunks of unaligned length
  2021-10-05 14:17 [PATCH v3 0/2] Fix write pad Chuck Lever
@ 2021-10-05 14:17 ` Chuck Lever
  2021-10-05 14:18 ` [PATCH v3 2/2] xprtrdma: Remove rpcrdma_ep::re_implicit_roundup Chuck Lever
  1 sibling, 0 replies; 4+ messages in thread
From: Chuck Lever @ 2021-10-05 14:17 UTC (permalink / raw)
  To: anna.schumaker, trondmy; +Cc: linux-nfs, linux-rdma

This is a buffer that is left persistently registered while a
connection is up. Connection tear-down automatically DMA-unmaps,
invalidates, and deregisters the MR. A persistently registered buffer
is cheaper to provide, and it can never be coalesced into the RDMA
segment that carries the data payload.

An RPC that provisions a Write chunk with a non-aligned length now
uses this MR rather than the tail buffer of the RPC's rq_rcv_buf.
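
For context, the pad in question comes from XDR's 4-byte alignment
rule. Below is a minimal sketch of the arithmetic; the helper names
here are placeholders, while the kernel's real equivalents are
xdr_align_size() and xdr_pad_size() in include/linux/sunrpc/xdr.h:

	/* XDR encodes variable-length data in multiples of 4 bytes.
	 * A Write chunk whose payload length is not a multiple of 4
	 * therefore needs 1-3 trailing pad bytes, which this patch
	 * supplies from the persistently registered re_write_pad
	 * buffer instead of rq_rcv_buf.tail.
	 */
	#define SKETCH_XDR_UNIT	4	/* sizeof(__be32) */

	static size_t sketch_align_size(size_t len)
	{
		const size_t mask = SKETCH_XDR_UNIT - 1;

		return (len + mask) & ~mask;
	}

	static size_t sketch_pad_size(size_t len)
	{
		/* e.g. len = 13 -> aligned 16 -> pad 3 */
		return sketch_align_size(len) - len;
	}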

Reviewed-by: Tom Talpey <tom@talpey.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/trace/events/rpcrdma.h  |   13 ++++++++++---
 net/sunrpc/xprtrdma/frwr_ops.c  |   35 +++++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/rpc_rdma.c  |   23 ++++++++++++++---------
 net/sunrpc/xprtrdma/verbs.c     |    1 +
 net/sunrpc/xprtrdma/xprt_rdma.h |    5 +++++
 5 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index de4195499592..afb2e394797c 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -375,10 +375,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class,
 
 	TP_fast_assign(
 		const struct rpcrdma_req *req = mr->mr_req;
-		const struct rpc_task *task = req->rl_slot.rq_task;
 
-		__entry->task_id = task->tk_pid;
-		__entry->client_id = task->tk_client->cl_clid;
+		if (req) {
+			const struct rpc_task *task = req->rl_slot.rq_task;
+
+			__entry->task_id = task->tk_pid;
+			__entry->client_id = task->tk_client->cl_clid;
+		} else {
+			__entry->task_id = 0;
+			__entry->client_id = -1;
+		}
 		__entry->mr_id  = mr->mr_ibmr->res.id;
 		__entry->nents  = mr->mr_nents;
 		__entry->handle = mr->mr_handle;
@@ -639,6 +645,7 @@ TRACE_EVENT(xprtrdma_nomrs_err,
 DEFINE_RDCH_EVENT(read);
 DEFINE_WRCH_EVENT(write);
 DEFINE_WRCH_EVENT(reply);
+DEFINE_WRCH_EVENT(wp);
 
 TRACE_DEFINE_ENUM(rpcrdma_noch);
 TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index f700b34a5bfd..3eccf365fcb8 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -666,3 +666,38 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 */
 	rpcrdma_force_disconnect(ep);
 }
+
+/**
+ * frwr_wp_create - Create an MR for padding Write chunks
+ * @r_xprt: transport resources to use
+ *
+ * Return 0 on success, negative errno on failure.
+ */
+int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	struct rpcrdma_mr_seg seg;
+	struct rpcrdma_mr *mr;
+
+	mr = rpcrdma_mr_get(r_xprt);
+	if (!mr)
+		return -EAGAIN;
+	mr->mr_req = NULL;
+	ep->re_write_pad_mr = mr;
+
+	seg.mr_len = XDR_UNIT;
+	seg.mr_page = virt_to_page(ep->re_write_pad);
+	seg.mr_offset = offset_in_page(ep->re_write_pad);
+	if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr)))
+		return -EIO;
+	trace_xprtrdma_mr_fastreg(mr);
+
+	mr->mr_cqe.done = frwr_wc_fastreg;
+	mr->mr_regwr.wr.next = NULL;
+	mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
+	mr->mr_regwr.wr.num_sge = 0;
+	mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
+	mr->mr_regwr.wr.send_flags = 0;
+
+	return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
+}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c335c1361564..8035a983c8ce 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -255,15 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
 		page_base = 0;
 	}
 
-	if (type == rpcrdma_readch)
-		goto out;
-
-	/* When encoding a Write chunk, some servers need to see an
-	 * extra segment for non-XDR-aligned Write chunks. The upper
-	 * layer provides space in the tail iovec that may be used
-	 * for this purpose.
-	 */
-	if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
+	if (type == rpcrdma_readch || type == rpcrdma_writech)
 		goto out;
 
 	if (xdrbuf->tail[0].iov_len)
@@ -405,6 +397,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
 				     enum rpcrdma_chunktype wtype)
 {
 	struct xdr_stream *xdr = &req->rl_stream;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
 	struct rpcrdma_mr_seg *seg;
 	struct rpcrdma_mr *mr;
 	int nsegs, nchunks;
@@ -443,6 +436,18 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
 		nsegs -= mr->mr_nents;
 	} while (nsegs);
 
+	if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
+		if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
+			return -EMSGSIZE;
+
+		trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
+					nsegs);
+		r_xprt->rx_stats.write_chunk_count++;
+		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+		nchunks++;
+		nsegs -= mr->mr_nents;
+	}
+
 	/* Update count of segments in this Write chunk */
 	*segcount = cpu_to_be32(nchunks);
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index aaec3c9be8db..c3784b7b6855 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -551,6 +551,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
 		goto out;
 	}
 	rpcrdma_mrs_create(r_xprt);
+	frwr_wp_create(r_xprt);
 
 out:
 	trace_xprtrdma_connect(r_xprt, rc);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d91f54eae00b..b6d8b3e6356c 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,12 +68,14 @@
 /*
  * RDMA Endpoint -- connection endpoint details
  */
+struct rpcrdma_mr;
 struct rpcrdma_ep {
 	struct kref		re_kref;
 	struct rdma_cm_id 	*re_id;
 	struct ib_pd		*re_pd;
 	unsigned int		re_max_rdma_segs;
 	unsigned int		re_max_fr_depth;
+	struct rpcrdma_mr	*re_write_pad_mr;
 	bool			re_implicit_roundup;
 	enum ib_mr_type		re_mrtype;
 	struct completion	re_done;
@@ -97,6 +99,8 @@ struct rpcrdma_ep {
 	unsigned int		re_inline_recv;	/* negotiated */
 
 	atomic_t		re_completion_ids;
+
+	char			re_write_pad[XDR_UNIT];
 };
 
 /* Pre-allocate extra Work Requests for handling reverse-direction
@@ -535,6 +539,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+int frwr_wp_create(struct rpcrdma_xprt *r_xprt);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c




* [PATCH v3 2/2] xprtrdma: Remove rpcrdma_ep::re_implicit_roundup
  2021-10-05 14:17 [PATCH v3 0/2] Fix write pad Chuck Lever
  2021-10-05 14:17 ` [PATCH v3 1/2] xprtrdma: Provide a buffer to pad Write chunks of unaligned length Chuck Lever
@ 2021-10-05 14:18 ` Chuck Lever
  1 sibling, 0 replies; 4+ messages in thread
From: Chuck Lever @ 2021-10-05 14:18 UTC (permalink / raw)
  To: anna.schumaker, trondmy; +Cc: linux-nfs, linux-rdma

Clean up: this field is no longer used.

xprt_rdma_pad_optimize is also no longer used, but is left in place
because it is part of the kernel/userspace API.
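
For reference, "left in place" means roughly the shape below; the
exact table lives in net/sunrpc/xprtrdma/transport.c and may differ
in detail from this abridged sketch:

	/* The tunable stays visible as /proc/sys/sunrpc/rdma_pad_optimize
	 * so that existing scripts keep working, even though nothing in
	 * the transport reads the value any longer.
	 */
	unsigned int xprt_rdma_pad_optimize;

	static struct ctl_table xr_tunables_table[] = {
		{
			.procname	= "rdma_pad_optimize",
			.data		= &xprt_rdma_pad_optimize,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ },
	};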

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/verbs.c     |    2 --
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 -
 2 files changed, 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c3784b7b6855..3d3673ba9e1e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -205,14 +205,12 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
 	unsigned int rsize, wsize;
 
 	/* Default settings for RPC-over-RDMA Version One */
-	ep->re_implicit_roundup = xprt_rdma_pad_optimize;
 	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 
 	if (pmsg &&
 	    pmsg->cp_magic == rpcrdma_cmp_magic &&
 	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
-		ep->re_implicit_roundup = true;
 		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
 		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
 	}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b6d8b3e6356c..c79f92eeda76 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -76,7 +76,6 @@ struct rpcrdma_ep {
 	unsigned int		re_max_rdma_segs;
 	unsigned int		re_max_fr_depth;
 	struct rpcrdma_mr	*re_write_pad_mr;
-	bool			re_implicit_roundup;
 	enum ib_mr_type		re_mrtype;
 	struct completion	re_done;
 	unsigned int		re_send_count;




