Linux-NFS Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v1 0/9] NFS/RDMA client patches for v5.6
@ 2020-01-03 16:56 Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 1/9] xprtrdma: Eliminate ri_max_send_sges Chuck Lever
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

Hi-

These patches clean up the RPC/RDMA client code in preparation for
rewriting the transport set-up code. These are the only NFS/RDMA-
related patches I plan to submit for v5.6.

---

Chuck Lever (9):
      xprtrdma: Eliminate ri_max_send_sges
      xprtrdma: Make sendctx queue lifetime the same as connection lifetime
      xprtrdma: Refactor initialization of ep->rep_max_requests
      xprtrdma: Eliminate per-transport "max pages"
      xprtrdma: Refactor frwr_is_supported
      xprtrdma: Allocate and map transport header buffers at connect time
      xprtrdma: Destroy rpcrdma_rep when Receive is flushed
      xprtrdma: Destroy reps from previous connection instance
      xprtrdma: DMA map rr_rdma_buf as each rpcrdma_rep is created


 include/trace/events/rpcrdma.h    |   12 +-
 net/sunrpc/xprtrdma/backchannel.c |    4 +
 net/sunrpc/xprtrdma/frwr_ops.c    |  104 ++++++++----------
 net/sunrpc/xprtrdma/rpc_rdma.c    |   20 +--
 net/sunrpc/xprtrdma/transport.c   |   17 +--
 net/sunrpc/xprtrdma/verbs.c       |  213 +++++++++++++++++++++----------------
 net/sunrpc/xprtrdma/xprt_rdma.h   |   14 +-
 7 files changed, 201 insertions(+), 183 deletions(-)

--
Chuck Lever


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 1/9] xprtrdma: Eliminate ri_max_send_sges
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 2/9] xprtrdma: Make sendctx queue lifetime the same as connection lifetime Chuck Lever
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

Clean-up. The max_send_sge value also happens to be stored in
ep->rep_attr. Let's keep just a single copy.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   10 ++++++++++
 net/sunrpc/xprtrdma/rpc_rdma.c  |    2 +-
 net/sunrpc/xprtrdma/verbs.c     |   17 +++--------------
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 -
 4 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 50025408dc2a..24b8b75d0d49 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -183,6 +183,16 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
 {
 	struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
 	int max_qp_wr, depth, delta;
+	unsigned int max_sge;
+
+	max_sge = min_t(unsigned int, attrs->max_send_sge,
+			RPCRDMA_MAX_SEND_SGES);
+	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+		pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
+		return -ENOMEM;
+	}
+	ep->rep_attr.cap.max_send_sge = max_sge;
+	ep->rep_attr.cap.max_recv_sge = 1;
 
 	ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
 	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index aec3beb93b25..af917228d245 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -145,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
 			remaining -= min_t(unsigned int,
 					   PAGE_SIZE - offset, remaining);
 			offset = 0;
-			if (++count > r_xprt->rx_ia.ri_max_send_sges)
+			if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
 				return false;
 		}
 	}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4d7d7555569c..0ad81d3c735e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -469,21 +469,12 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
 	struct ib_cq *sendcq, *recvcq;
-	unsigned int max_sge;
 	int rc;
 
 	ep->rep_max_requests = xprt_rdma_slot_table_entries;
 	ep->rep_inline_send = xprt_rdma_max_inline_write;
 	ep->rep_inline_recv = xprt_rdma_max_inline_read;
 
-	max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge,
-			RPCRDMA_MAX_SEND_SGES);
-	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
-		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
-		return -ENOMEM;
-	}
-	ia->ri_max_send_sges = max_sge;
-
 	rc = frwr_open(ia, ep);
 	if (rc)
 		return rc;
@@ -491,8 +482,6 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
 	ep->rep_attr.qp_context = ep;
 	ep->rep_attr.srq = NULL;
-	ep->rep_attr.cap.max_send_sge = max_sge;
-	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	ep->rep_attr.qp_type = IB_QPT_RC;
@@ -795,11 +784,11 @@ static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
 	kfree(buf->rb_sc_ctxs);
 }
 
-static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
+static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
 {
 	struct rpcrdma_sendctx *sc;
 
-	sc = kzalloc(struct_size(sc, sc_sges, ia->ri_max_send_sges),
+	sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
 		     GFP_KERNEL);
 	if (!sc)
 		return NULL;
@@ -827,7 +816,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
 
 	buf->rb_sc_last = i - 1;
 	for (i = 0; i <= buf->rb_sc_last; i++) {
-		sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
+		sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
 		if (!sc)
 			return -ENOMEM;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d796d68609ed..7655a99fd559 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -73,7 +73,6 @@ struct rpcrdma_ia {
 	int			ri_async_rc;
 	unsigned int		ri_max_segs;
 	unsigned int		ri_max_frwr_depth;
-	unsigned int		ri_max_send_sges;
 	bool			ri_implicit_roundup;
 	enum ib_mr_type		ri_mrtype;
 	unsigned long		ri_flags;



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 2/9] xprtrdma: Make sendctx queue lifetime the same as connection lifetime
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 1/9] xprtrdma: Eliminate ri_max_send_sges Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 3/9] xprtrdma: Refactor initialization of ep->rep_max_requests Chuck Lever
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

The size of the sendctx queue depends on the value stored in
ia->ri_max_send_sges. This value is determined by querying the
underlying device.

Eventually, rpcrdma_ia_open() and rpcrdma_ep_create() will be called
in the connect worker rather than at transport set-up time. The
underlying device will not have been chosen device set-up time.

The sendctx queue will thus have to be created after the underlying
device has been chosen via address and route resolution; in other
words, in the connect worker.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/trace/events/rpcrdma.h |   12 ++++++++----
 net/sunrpc/xprtrdma/verbs.c    |   22 +++++++++++++++-------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 28dd498e683e..6d9075507bd6 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -698,6 +698,7 @@ TRACE_EVENT(xprtrdma_post_send,
 
 	TP_STRUCT__entry(
 		__field(const void *, req)
+		__field(const void *, sc)
 		__field(unsigned int, task_id)
 		__field(unsigned int, client_id)
 		__field(int, num_sge)
@@ -712,14 +713,15 @@ TRACE_EVENT(xprtrdma_post_send,
 		__entry->client_id = rqst->rq_task->tk_client ?
 				     rqst->rq_task->tk_client->cl_clid : -1;
 		__entry->req = req;
+		__entry->sc = req->rl_sendctx;
 		__entry->num_sge = req->rl_wr.num_sge;
 		__entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED;
 		__entry->status = status;
 	),
 
-	TP_printk("task:%u@%u req=%p (%d SGE%s) %sstatus=%d",
+	TP_printk("task:%u@%u req=%p sc=%p (%d SGE%s) %sstatus=%d",
 		__entry->task_id, __entry->client_id,
-		__entry->req, __entry->num_sge,
+		__entry->req, __entry->sc, __entry->num_sge,
 		(__entry->num_sge == 1 ? "" : "s"),
 		(__entry->signaled ? "signaled " : ""),
 		__entry->status
@@ -818,6 +820,7 @@ TRACE_EVENT(xprtrdma_wc_send,
 
 	TP_STRUCT__entry(
 		__field(const void *, req)
+		__field(const void *, sc)
 		__field(unsigned int, unmap_count)
 		__field(unsigned int, status)
 		__field(unsigned int, vendor_err)
@@ -825,13 +828,14 @@ TRACE_EVENT(xprtrdma_wc_send,
 
 	TP_fast_assign(
 		__entry->req = sc->sc_req;
+		__entry->sc = sc;
 		__entry->unmap_count = sc->sc_unmap_count;
 		__entry->status = wc->status;
 		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
 	),
 
-	TP_printk("req=%p, unmapped %u pages: %s (%u/0x%x)",
-		__entry->req, __entry->unmap_count,
+	TP_printk("req=%p sc=%p unmapped=%u: %s (%u/0x%x)",
+		__entry->req, __entry->sc, __entry->unmap_count,
 		rdma_show_wc_status(__entry->status),
 		__entry->status, __entry->vendor_err
 	)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 0ad81d3c735e..3cc6d19f7f3a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -74,6 +74,8 @@
 /*
  * internal functions
  */
+static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
 				       struct rpcrdma_sendctx *sc);
 static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
@@ -427,6 +429,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
 		rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
 	}
 	rpcrdma_mrs_destroy(r_xprt);
+	rpcrdma_sendctxs_destroy(r_xprt);
 	ib_dealloc_pd(ia->ri_pd);
 	ia->ri_pd = NULL;
 
@@ -704,6 +707,10 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	rpcrdma_reset_cwnd(r_xprt);
 	rpcrdma_post_recvs(r_xprt, true);
 
+	rc = rpcrdma_sendctxs_create(r_xprt);
+	if (rc)
+		goto out;
+
 	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
 	if (rc)
 		goto out;
@@ -756,6 +763,7 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	rpcrdma_xprt_drain(r_xprt);
 	rpcrdma_reqs_reset(r_xprt);
 	rpcrdma_mrs_destroy(r_xprt);
+	rpcrdma_sendctxs_destroy(r_xprt);
 }
 
 /* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -775,13 +783,17 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
  * queue activity, and rpcrdma_xprt_drain has flushed all remaining
  * Send requests.
  */
-static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt)
 {
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	unsigned long i;
 
+	if (!buf->rb_sc_ctxs)
+		return;
 	for (i = 0; i <= buf->rb_sc_last; i++)
 		kfree(buf->rb_sc_ctxs[i]);
 	kfree(buf->rb_sc_ctxs);
+	buf->rb_sc_ctxs = NULL;
 }
 
 static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
@@ -809,7 +821,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
 	 * Sends are posted.
 	 */
 	i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
-	dprintk("RPC:       %s: allocating %lu send_ctxs\n", __func__, i);
 	buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
 	if (!buf->rb_sc_ctxs)
 		return -ENOMEM;
@@ -823,6 +834,8 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
 		buf->rb_sc_ctxs[i] = sc;
 	}
 
+	buf->rb_sc_head = 0;
+	buf->rb_sc_tail = 0;
 	return 0;
 }
 
@@ -1165,10 +1178,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 
 	init_llist_head(&buf->rb_free_reps);
 
-	rc = rpcrdma_sendctxs_create(r_xprt);
-	if (rc)
-		goto out;
-
 	return 0;
 out:
 	rpcrdma_buffer_destroy(buf);
@@ -1244,7 +1253,6 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-	rpcrdma_sendctxs_destroy(buf);
 	rpcrdma_reps_destroy(buf);
 
 	while (!list_empty(&buf->rb_send_bufs)) {



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 3/9] xprtrdma: Refactor initialization of ep->rep_max_requests
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 1/9] xprtrdma: Eliminate ri_max_send_sges Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 2/9] xprtrdma: Make sendctx queue lifetime the same as connection lifetime Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 4/9] xprtrdma: Eliminate per-transport "max pages" Chuck Lever
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

Clean up: there is no need to keep two copies of the same value.
Also, in subsequent patches, rpcrdma_ep_create() will be called in
the connect worker rather than at set-up time.

Minor fix: Initialize the transport's sendctx to the value based on
the capabilities of the underlying device, not the maximum setting.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |    6 +++---
 net/sunrpc/xprtrdma/transport.c |    3 ++-
 net/sunrpc/xprtrdma/verbs.c     |    8 ++++----
 net/sunrpc/xprtrdma/xprt_rdma.h |    5 ++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index af917228d245..520323ddc930 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -909,7 +909,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 		goto out_err;
 	*p++ = rqst->rq_xid;
 	*p++ = rpcrdma_version;
-	*p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
+	*p++ = r_xprt->rx_buf.rb_max_requests;
 
 	/* When the ULP employs a GSS flavor that guarantees integrity
 	 * or privacy, direct data placement of individual data items
@@ -1480,8 +1480,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 
 	if (credits == 0)
 		credits = 1;	/* don't deadlock */
-	else if (credits > buf->rb_max_requests)
-		credits = buf->rb_max_requests;
+	else if (credits > r_xprt->rx_ep.rep_max_requests)
+		credits = r_xprt->rx_ep.rep_max_requests;
 	if (buf->rb_credits != credits)
 		rpcrdma_update_cwnd(r_xprt, credits);
 	rpcrdma_post_recvs(r_xprt, false);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 7395eb2cfdeb..f868a75057ad 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -316,7 +316,8 @@ xprt_setup_rdma(struct xprt_create *args)
 	if (args->addrlen > sizeof(xprt->addr))
 		return ERR_PTR(-EBADF);
 
-	xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0);
+	xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
+			  xprt_rdma_slot_table_entries);
 	if (!xprt)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3cc6d19f7f3a..4f9595b72888 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -474,13 +474,14 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	struct ib_cq *sendcq, *recvcq;
 	int rc;
 
-	ep->rep_max_requests = xprt_rdma_slot_table_entries;
+	ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
 	ep->rep_inline_send = xprt_rdma_max_inline_write;
 	ep->rep_inline_recv = xprt_rdma_max_inline_read;
 
 	rc = frwr_open(ia, ep);
 	if (rc)
 		return rc;
+	r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
 
 	ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
 	ep->rep_attr.qp_context = ep;
@@ -820,7 +821,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
 	 * the ->send_request call to fail temporarily before too many
 	 * Sends are posted.
 	 */
-	i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+	i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
 	buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
 	if (!buf->rb_sc_ctxs)
 		return -ENOMEM;
@@ -1154,7 +1155,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	int i, rc;
 
-	buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
 	buf->rb_bc_srv_max_requests = 0;
 	spin_lock_init(&buf->rb_lock);
 	INIT_LIST_HEAD(&buf->rb_mrs);
@@ -1166,7 +1166,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	INIT_LIST_HEAD(&buf->rb_all_reps);
 
 	rc = -ENOMEM;
-	for (i = 0; i < buf->rb_max_requests; i++) {
+	for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) {
 		struct rpcrdma_req *req;
 
 		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 7655a99fd559..0fde694144f5 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -98,7 +98,7 @@ struct rpcrdma_ep {
 	wait_queue_head_t 	rep_connect_wait;
 	struct rpcrdma_connect_private	rep_cm_private;
 	struct rdma_conn_param	rep_remote_cma;
-	unsigned int		rep_max_requests;	/* set by /proc */
+	unsigned int		rep_max_requests;	/* depends on device */
 	unsigned int		rep_inline_send;	/* negotiated */
 	unsigned int		rep_inline_recv;	/* negotiated */
 	int			rep_receive_count;
@@ -372,7 +372,7 @@ struct rpcrdma_buffer {
 
 	struct llist_head	rb_free_reps;
 
-	u32			rb_max_requests;
+	__be32			rb_max_requests;
 	u32			rb_credits;	/* most recent credit grant */
 
 	u32			rb_bc_srv_max_requests;
@@ -582,7 +582,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
-extern unsigned int xprt_rdma_slot_table_entries;
 extern unsigned int xprt_rdma_max_inline_read;
 extern unsigned int xprt_rdma_max_inline_write;
 void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 4/9] xprtrdma: Eliminate per-transport "max pages"
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
                   ` (2 preceding siblings ...)
  2020-01-03 16:56 ` [PATCH v1 3/9] xprtrdma: Refactor initialization of ep->rep_max_requests Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 5/9] xprtrdma: Refactor frwr_is_supported Chuck Lever
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

To support device hotplug and migrating a connection between devices
of different capabilities, we have to guarantee that all in-kernel
devices can support the same max NFS payload size (1 megabyte).

This means that possibly one or two in-tree devices are no longer
supported for NFS/RDMA because they cannot support 1MB rsize/wsize.
The only one I confirmed was cxgb3, but it has already been removed
from the kernel.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   40 +++++++++++++++------------------------
 net/sunrpc/xprtrdma/rpc_rdma.c  |    2 +-
 net/sunrpc/xprtrdma/transport.c |   14 ++++----------
 net/sunrpc/xprtrdma/verbs.c     |    4 ++--
 net/sunrpc/xprtrdma/xprt_rdma.h |    3 +--
 5 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 24b8b75d0d49..f31ff54bb266 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -171,7 +171,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
  *	ep->rep_attr.cap.max_send_wr
  *	ep->rep_attr.cap.max_recv_wr
  *	ep->rep_max_requests
- *	ia->ri_max_segs
+ *	ia->ri_max_rdma_segs
  *
  * And these FRWR-related fields:
  *	ia->ri_max_frwr_depth
@@ -202,14 +202,12 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
 	 * capability, but perform optimally when the MRs are not larger
 	 * than a page.
 	 */
-	if (attrs->max_sge_rd > 1)
+	if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
 		ia->ri_max_frwr_depth = attrs->max_sge_rd;
 	else
 		ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
 	if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
 		ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
-	dprintk("RPC:       %s: max FR page list depth = %u\n",
-		__func__, ia->ri_max_frwr_depth);
 
 	/* Add room for frwr register and invalidate WRs.
 	 * 1. FRWR reg WR for head
@@ -253,30 +251,22 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
 
-	ia->ri_max_segs =
+	ia->ri_max_rdma_segs =
 		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
 	/* Reply chunks require segments for head and tail buffers */
-	ia->ri_max_segs += 2;
-	if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
-		ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
-	return 0;
-}
-
-/**
- * frwr_maxpages - Compute size of largest payload
- * @r_xprt: transport
- *
- * Returns maximum size of an RPC message, in pages.
- *
- * FRWR mode conveys a list of pages per chunk segment. The
- * maximum length of that list is the FRWR page list depth.
- */
-size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	ia->ri_max_rdma_segs += 2;
+	if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+		ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+
+	/* Ensure the underlying device is capable of conveying the
+	 * largest r/wsize NFS will ask for. This guarantees that
+	 * failing over from one RDMA device to another will not
+	 * break NFS I/O.
+	 */
+	if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+		return -ENOMEM;
 
-	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-		     (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
+	return 0;
 }
 
 /**
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 520323ddc930..c6dcea06c754 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -111,7 +111,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
  */
 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
 {
-	unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs;
+	unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
 	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 
 	ep->rep_max_inline_send =
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index f868a75057ad..3cfeba68ee9a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -359,19 +359,13 @@ xprt_setup_rdma(struct xprt_create *args)
 	if (rc)
 		goto out3;
 
-	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
-			  xprt_rdma_connect_worker);
-
-	xprt->max_payload = frwr_maxpages(new_xprt);
-	if (xprt->max_payload == 0)
-		goto out4;
-	xprt->max_payload <<= PAGE_SHIFT;
-	dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
-		__func__, xprt->max_payload);
-
 	if (!try_module_get(THIS_MODULE))
 		goto out4;
 
+	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
+			  xprt_rdma_connect_worker);
+	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+
 	dprintk("RPC:       %s: %s:%s\n", __func__,
 		xprt->address_strings[RPC_DISPLAY_ADDR],
 		xprt->address_strings[RPC_DISPLAY_PORT]);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4f9595b72888..601a1bd56260 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -935,7 +935,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	unsigned int count;
 
-	for (count = 0; count < ia->ri_max_segs; count++) {
+	for (count = 0; count < ia->ri_max_rdma_segs; count++) {
 		struct rpcrdma_mr *mr;
 		int rc;
 
@@ -1017,7 +1017,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
 
 	/* Compute maximum header buffer size in bytes */
 	maxhdrsize = rpcrdma_fixed_maxsz + 3 +
-		     r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz;
+		     r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
 	maxhdrsize *= sizeof(__be32);
 	rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
 				  DMA_TO_DEVICE, flags);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0fde694144f5..aac4cf959c3a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -71,7 +71,7 @@ struct rpcrdma_ia {
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
 	int			ri_async_rc;
-	unsigned int		ri_max_segs;
+	unsigned int		ri_max_rdma_segs;
 	unsigned int		ri_max_frwr_depth;
 	bool			ri_implicit_roundup;
 	enum ib_mr_type		ri_mrtype;
@@ -539,7 +539,6 @@ void frwr_reset(struct rpcrdma_req *req);
 int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
 int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
 void frwr_release_mr(struct rpcrdma_mr *mr);
-size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt);
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 				struct rpcrdma_mr_seg *seg,
 				int nsegs, bool writing, __be32 xid,



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 5/9] xprtrdma: Refactor frwr_is_supported
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
                   ` (3 preceding siblings ...)
  2020-01-03 16:56 ` [PATCH v1 4/9] xprtrdma: Eliminate per-transport "max pages" Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 6/9] xprtrdma: Allocate and map transport header buffers at connect time Chuck Lever
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

Refactor: Perform the "is supported" check in rpcrdma_ep_create()
instead of in rpcrdma_ia_open(). frwr_open() is where most of the
logic to query device attributes is already located.

The current code displays a redundant error message when the device
does not support FRWR. As an additional clean-up, this patch removes
the extra message.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   54 ++++++++++++++++-----------------------
 net/sunrpc/xprtrdma/verbs.c     |   14 +---------
 net/sunrpc/xprtrdma/xprt_rdma.h |    4 +--
 3 files changed, 25 insertions(+), 47 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index f31ff54bb266..b22a9d7f9c8d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -50,28 +50,6 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-/**
- * frwr_is_supported - Check if device supports FRWR
- * @device: interface adapter to check
- *
- * Returns true if device supports FRWR, otherwise false
- */
-bool frwr_is_supported(struct ib_device *device)
-{
-	struct ib_device_attr *attrs = &device->attrs;
-
-	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
-		goto out_not_supported;
-	if (attrs->max_fast_reg_page_list_len == 0)
-		goto out_not_supported;
-	return true;
-
-out_not_supported:
-	pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
-		device->name);
-	return false;
-}
-
 /**
  * frwr_release_mr - Destroy one MR
  * @mr: MR allocated by frwr_init_mr
@@ -163,13 +141,12 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 }
 
 /**
- * frwr_open - Prepare an endpoint for use with FRWR
- * @ia: interface adapter this endpoint will use
- * @ep: endpoint to prepare
+ * frwr_query_device - Prepare a transport for use with FRWR
+ * @r_xprt: controlling transport instance
+ * @device: RDMA device to query
  *
  * On success, sets:
- *	ep->rep_attr.cap.max_send_wr
- *	ep->rep_attr.cap.max_recv_wr
+ *	ep->rep_attr
  *	ep->rep_max_requests
  *	ia->ri_max_rdma_segs
  *
@@ -177,14 +154,27 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
  *	ia->ri_max_frwr_depth
  *	ia->ri_mrtype
  *
- * On failure, a negative errno is returned.
+ * Return values:
+ *   On success, returns zero.
+ *   %-EINVAL - the device does not support FRWR memory registration
+ *   %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
  */
-int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
+int frwr_query_device(struct rpcrdma_xprt *r_xprt,
+		      const struct ib_device *device)
 {
-	struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
+	const struct ib_device_attr *attrs = &device->attrs;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 	int max_qp_wr, depth, delta;
 	unsigned int max_sge;
 
+	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+	    attrs->max_fast_reg_page_list_len == 0) {
+		pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n",
+		       device->name);
+		return -EINVAL;
+	}
+
 	max_sge = min_t(unsigned int, attrs->max_send_sge,
 			RPCRDMA_MAX_SEND_SGES);
 	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
@@ -231,7 +221,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
 		} while (delta > 0);
 	}
 
-	max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
+	max_qp_wr = attrs->max_qp_wr;
 	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
 	max_qp_wr -= 1;
 	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
@@ -242,7 +232,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
 	if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
 		ep->rep_max_requests = max_qp_wr / depth;
 		if (!ep->rep_max_requests)
-			return -EINVAL;
+			return -ENOMEM;
 		ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
 	}
 	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 601a1bd56260..92a52934b622 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -367,18 +367,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
 		goto out_err;
 	}
 
-	switch (xprt_rdma_memreg_strategy) {
-	case RPCRDMA_FRWR:
-		if (frwr_is_supported(ia->ri_id->device))
-			break;
-		/*FALLTHROUGH*/
-	default:
-		pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
-		       ia->ri_id->device->name, xprt_rdma_memreg_strategy);
-		rc = -EINVAL;
-		goto out_err;
-	}
-
 	return 0;
 
 out_err:
@@ -478,7 +466,7 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	ep->rep_inline_send = xprt_rdma_max_inline_write;
 	ep->rep_inline_recv = xprt_rdma_max_inline_read;
 
-	rc = frwr_open(ia, ep);
+	rc = frwr_query_device(r_xprt, ia->ri_id->device);
 	if (rc)
 		return rc;
 	r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index aac4cf959c3a..0aed1e98f2bf 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -534,9 +534,9 @@ rpcrdma_data_dir(bool writing)
 
 /* Memory registration calls xprtrdma/frwr_ops.c
  */
-bool frwr_is_supported(struct ib_device *device);
 void frwr_reset(struct rpcrdma_req *req);
-int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
+int frwr_query_device(struct rpcrdma_xprt *r_xprt,
+		      const struct ib_device *device);
 int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
 void frwr_release_mr(struct rpcrdma_mr *mr);
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 6/9] xprtrdma: Allocate and map transport header buffers at connect time
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
                   ` (4 preceding siblings ...)
  2020-01-03 16:56 ` [PATCH v1 5/9] xprtrdma: Refactor frwr_is_supported Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:56 ` [PATCH v1 7/9] xprtrdma: Destroy rpcrdma_rep when Receive is flushed Chuck Lever
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

Currently the underlying RDMA device is chosen at transport set-up
time. But it will soon be at connect time instead.

The maximum size of a transport header is based on device
capabilities. Thus transport header buffers have to be allocated
_after_ the underlying device has been chosen (via address and route
resolution); ie, in the connect worker.

Thus, move the allocation of transport header buffers to the connect
worker, after the point at which the underlying RDMA device has been
chosen.

This also means the RDMA device is available to do a DMA mapping of
these buffers at connect time, instead of in the hot I/O path. Make
that optimization as well.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/backchannel.c |    4 +
 net/sunrpc/xprtrdma/rpc_rdma.c    |   10 +--
 net/sunrpc/xprtrdma/verbs.c       |  106 +++++++++++++++++++++++++++----------
 net/sunrpc/xprtrdma/xprt_rdma.h   |    1 
 4 files changed, 84 insertions(+), 37 deletions(-)

diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 9d02eae353c6..1a0ae0c61353 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -194,6 +194,10 @@ static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt)
 	req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
 	if (!req)
 		return NULL;
+	if (rpcrdma_req_setup(r_xprt, req)) {
+		rpcrdma_req_destroy(req);
+		return NULL;
+	}
 
 	xprt->bc_alloc_count++;
 	rqst = &req->rl_slot;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c6dcea06c754..28020ec104d4 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -580,22 +580,19 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
 
 /* Prepare an SGE for the RPC-over-RDMA transport header.
  */
-static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
 				    struct rpcrdma_req *req, u32 len)
 {
 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
 	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
 	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
 
-	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
-		return false;
 	sge->addr = rdmab_addr(rb);
 	sge->length = len;
 	sge->lkey = rdmab_lkey(rb);
 
 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
 				      DMA_TO_DEVICE);
-	return true;
 }
 
 /* The head iovec is straightforward, as it is usually already
@@ -836,10 +833,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 	req->rl_wr.num_sge = 0;
 	req->rl_wr.opcode = IB_WR_SEND;
 
-	ret = -EIO;
-	if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
-		goto out_unmap;
+	rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);
 
+	ret = -EIO;
 	switch (rtype) {
 	case rpcrdma_noch_pullup:
 		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 92a52934b622..7d06c6cd3d26 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -78,6 +78,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
 				       struct rpcrdma_sendctx *sc);
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
@@ -380,6 +381,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
  *
  * Divest transport H/W resources associated with this adapter,
  * but allow it to be restored later.
+ *
+ * Caller must hold the transport send lock.
  */
 void
 rpcrdma_ia_remove(struct rpcrdma_ia *ia)
@@ -387,8 +390,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
 	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
 						   rx_ia);
 	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct rpcrdma_req *req;
 
 	/* This is similar to rpcrdma_ep_destroy, but:
 	 * - Don't cancel the connect worker.
@@ -411,11 +412,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
 	 * mappings and MRs are gone.
 	 */
 	rpcrdma_reps_unmap(r_xprt);
-	list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
-		rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf);
-		rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
-		rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
-	}
+	rpcrdma_reqs_reset(r_xprt);
 	rpcrdma_mrs_destroy(r_xprt);
 	rpcrdma_sendctxs_destroy(r_xprt);
 	ib_dealloc_pd(ia->ri_pd);
@@ -714,6 +711,11 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		goto out;
 	}
 
+	rc = rpcrdma_reqs_setup(r_xprt);
+	if (rc) {
+		rpcrdma_ep_disconnect(ep, ia);
+		goto out;
+	}
 	rpcrdma_mrs_create(r_xprt);
 
 out:
@@ -995,32 +997,19 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
 				       gfp_t flags)
 {
 	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
-	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_req *req;
-	size_t maxhdrsize;
 
 	req = kzalloc(sizeof(*req), flags);
 	if (req == NULL)
 		goto out1;
 
-	/* Compute maximum header buffer size in bytes */
-	maxhdrsize = rpcrdma_fixed_maxsz + 3 +
-		     r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
-	maxhdrsize *= sizeof(__be32);
-	rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
-				  DMA_TO_DEVICE, flags);
-	if (!rb)
-		goto out2;
-	req->rl_rdmabuf = rb;
-	xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
-
 	req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags);
 	if (!req->rl_sendbuf)
-		goto out3;
+		goto out2;
 
 	req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags);
 	if (!req->rl_recvbuf)
-		goto out4;
+		goto out3;
 
 	INIT_LIST_HEAD(&req->rl_free_mrs);
 	INIT_LIST_HEAD(&req->rl_registered);
@@ -1029,10 +1018,8 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
 	spin_unlock(&buffer->rb_lock);
 	return req;
 
-out4:
-	kfree(req->rl_sendbuf);
 out3:
-	kfree(req->rl_rdmabuf);
+	kfree(req->rl_sendbuf);
 out2:
 	kfree(req);
 out1:
@@ -1040,23 +1027,82 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
 }
 
 /**
- * rpcrdma_reqs_reset - Reset all reqs owned by a transport
+ * rpcrdma_req_setup - Per-connection instance setup of an rpcrdma_req object
  * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req object to set up
  *
- * ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * Returns zero on success, and a negative errno on failure.
+ */
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct rpcrdma_regbuf *rb;
+	size_t maxhdrsize;
+
+	/* Compute maximum header buffer size in bytes */
+	maxhdrsize = rpcrdma_fixed_maxsz + 3 +
+		     r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+	maxhdrsize *= sizeof(__be32);
+	rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
+				  DMA_TO_DEVICE, GFP_KERNEL);
+	if (!rb)
+		goto out;
+
+	if (!__rpcrdma_regbuf_dma_map(r_xprt, rb))
+		goto out_free;
+
+	req->rl_rdmabuf = rb;
+	xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
+	return 0;
+
+out_free:
+	rpcrdma_regbuf_free(rb);
+out:
+	return -ENOMEM;
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
  * and thus can be walked without holding rb_lock. Eg. the
  * caller is holding the transport send lock to exclude
  * device removal or disconnection.
  */
-static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_req *req;
+	int rc;
 
 	list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
-		/* Credits are valid only for one connection */
-		req->rl_slot.rq_cong = 0;
+		rc = rpcrdma_req_setup(r_xprt, req);
+		if (rc)
+			return rc;
 	}
+	return 0;
+}
+
+static void rpcrdma_req_reset(struct rpcrdma_req *req)
+{
+	/* Credits are valid for only one connection */
+	req->rl_slot.rq_cong = 0;
+
+	rpcrdma_regbuf_free(req->rl_rdmabuf);
+	req->rl_rdmabuf = NULL;
+
+	rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
+	rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * and thus can be walked without holding rb_lock. Eg. the
+ * caller is holding the transport send lock to exclude
+ * device removal or disconnection.
+ */
+static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_req *req;
+
+	list_for_each_entry(req, &buf->rb_allreqs, rl_all)
+		rpcrdma_req_reset(req);
 }
 
 static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0aed1e98f2bf..37d5080c250b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -478,6 +478,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
  */
 struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
 				       gfp_t flags);
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void rpcrdma_req_destroy(struct rpcrdma_req *req);
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 7/9] xprtrdma: Destroy rpcrdma_rep when Receive is flushed
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
                   ` (5 preceding siblings ...)
  2020-01-03 16:56 ` [PATCH v1 6/9] xprtrdma: Allocate and map transport header buffers at connect time Chuck Lever
@ 2020-01-03 16:56 ` Chuck Lever
  2020-01-03 16:57 ` [PATCH v1 8/9] xprtrdma: Destroy reps from previous connection instance Chuck Lever
  2020-01-03 16:57 ` [PATCH v1 9/9] xprtrdma: DMA map rr_rdma_buf as each rpcrdma_rep is created Chuck Lever
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:56 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

This reduces the hardware and memory footprint of an unconnected
transport.

At some point in the future, transport reconnect will allow
resolving the destination IP address through a different device. The
current change enables reps for the new connection to be allocated
on whichever NUMA node the new device affines to after a reconnect.

Note that this does not destroy _all_ the transport's reps... there
will be a few that are still part of a running RPC completion.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/verbs.c |    9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 7d06c6cd3d26..269df615a024 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -80,6 +80,7 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
 				       struct rpcrdma_sendctx *sc);
 static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
@@ -177,7 +178,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	return;
 
 out_flushed:
-	rpcrdma_recv_buffer_put(rep);
+	rpcrdma_rep_destroy(rep);
 }
 
 static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
@@ -1105,6 +1106,9 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
 		rpcrdma_req_reset(req);
 }
 
+/* No locking needed here. This function is called only by the
+ * Receive completion handler.
+ */
 static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 					      bool temp)
 {
@@ -1137,6 +1141,9 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 	return NULL;
 }
 
+/* No locking needed here. This function is invoked only by the
+ * Receive completion handler, or during transport shutdown.
+ */
 static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
 {
 	list_del(&rep->rr_all);



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 8/9] xprtrdma: Destroy reps from previous connection instance
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
                   ` (6 preceding siblings ...)
  2020-01-03 16:56 ` [PATCH v1 7/9] xprtrdma: Destroy rpcrdma_rep when Receive is flushed Chuck Lever
@ 2020-01-03 16:57 ` Chuck Lever
  2020-01-03 16:57 ` [PATCH v1 9/9] xprtrdma: DMA map rr_rdma_buf as each rpcrdma_rep is created Chuck Lever
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:57 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

To safely get rid of all rpcrdma_reps from a particular connection
instance, xprtrdma has to wait until each of those reps is finished
being used. A rep may be backing the rq_rcv_buf of an RPC that has
just completed, for example.

Since it is safe to invoke rpcrdma_rep_destroy() only in the Receive
completion handler, simply mark reps remaining in the rb_all_reps
list after the transport is drained. These will then be deleted as
rpcrdma_post_recvs pulls them off the rep free list.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/verbs.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 269df615a024..3169887f8547 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1173,8 +1173,10 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_rep *rep;
 
-	list_for_each_entry(rep, &buf->rb_all_reps, rr_all)
+	list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
 		rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
+		rep->rr_temp = true;
+	}
 }
 
 static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v1 9/9] xprtrdma: DMA map rr_rdma_buf as each rpcrdma_rep is created
  2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
                   ` (7 preceding siblings ...)
  2020-01-03 16:57 ` [PATCH v1 8/9] xprtrdma: Destroy reps from previous connection instance Chuck Lever
@ 2020-01-03 16:57 ` Chuck Lever
  8 siblings, 0 replies; 10+ messages in thread
From: Chuck Lever @ 2020-01-03 16:57 UTC (permalink / raw)
  To: linux-nfs, linux-rdma

Clean up: This simplifies the logic in rpcrdma_post_recvs.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/verbs.c |   31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3169887f8547..c3e92b5607b5 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1109,8 +1109,9 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
 /* No locking needed here. This function is called only by the
  * Receive completion handler.
  */
-static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
-					      bool temp)
+static noinline
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
+				       bool temp)
 {
 	struct rpcrdma_rep *rep;
 
@@ -1123,6 +1124,9 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 	if (!rep->rr_rdmabuf)
 		goto out_free;
 
+	if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
+		goto out_free_regbuf;
+
 	xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
 		     rdmab_length(rep->rr_rdmabuf));
 	rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1135,6 +1139,8 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 	list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
 	return rep;
 
+out_free_regbuf:
+	rpcrdma_regbuf_free(rep->rr_rdmabuf);
 out_free:
 	kfree(rep);
 out:
@@ -1536,7 +1542,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-	struct ib_recv_wr *i, *wr, *bad_wr;
+	struct ib_recv_wr *wr, *bad_wr;
 	struct rpcrdma_rep *rep;
 	int needed, count, rc;
 
@@ -1563,23 +1569,15 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 		if (!rep)
 			break;
 
+		trace_xprtrdma_post_recv(rep);
 		rep->rr_recv_wr.next = wr;
 		wr = &rep->rr_recv_wr;
 		--needed;
+		++count;
 	}
 	if (!wr)
 		goto out;
 
-	for (i = wr; i; i = i->next) {
-		rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
-
-		if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
-			goto release_wrs;
-
-		trace_xprtrdma_post_recv(rep);
-		++count;
-	}
-
 	rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
 			  (const struct ib_recv_wr **)&bad_wr);
 out:
@@ -1596,11 +1594,4 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 	}
 	ep->rep_receive_count += count;
 	return;
-
-release_wrs:
-	for (i = wr; i;) {
-		rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
-		i = i->next;
-		rpcrdma_recv_buffer_put(rep);
-	}
 }



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, back to index

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-03 16:56 [PATCH v1 0/9] NFS/RDMA client patches for v5.6 Chuck Lever
2020-01-03 16:56 ` [PATCH v1 1/9] xprtrdma: Eliminate ri_max_send_sges Chuck Lever
2020-01-03 16:56 ` [PATCH v1 2/9] xprtrdma: Make sendctx queue lifetime the same as connection lifetime Chuck Lever
2020-01-03 16:56 ` [PATCH v1 3/9] xprtrdma: Refactor initialization of ep->rep_max_requests Chuck Lever
2020-01-03 16:56 ` [PATCH v1 4/9] xprtrdma: Eliminate per-transport "max pages" Chuck Lever
2020-01-03 16:56 ` [PATCH v1 5/9] xprtrdma: Refactor frwr_is_supported Chuck Lever
2020-01-03 16:56 ` [PATCH v1 6/9] xprtrdma: Allocate and map transport header buffers at connect time Chuck Lever
2020-01-03 16:56 ` [PATCH v1 7/9] xprtrdma: Destroy rpcrdma_rep when Receive is flushed Chuck Lever
2020-01-03 16:57 ` [PATCH v1 8/9] xprtrdma: Destroy reps from previous connection instance Chuck Lever
2020-01-03 16:57 ` [PATCH v1 9/9] xprtrdma: DMA map rr_rdma_buf as each rpcrdma_rep is created Chuck Lever

Linux-NFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-nfs/0 linux-nfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-nfs linux-nfs/ https://lore.kernel.org/linux-nfs \
		linux-nfs@vger.kernel.org
	public-inbox-index linux-nfs

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-nfs


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git