* [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-03-13 21:21 Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 01/16] xprtrdma: Display IPv6 addresses and port numbers correctly Chuck Lever
                   ` (16 more replies)
  0 siblings, 17 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:21 UTC (permalink / raw)
  To: linux-nfs

This is a series of client-side patches for NFS/RDMA. In preparation
for increasing the transport credit limit and maximum rsize/wsize,
I've refactored the memory registration logic into separate files,
one per registration mode, invoked via a method API.
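
Roughly, the resulting method vector looks like this (a condensed
sketch that folds several patches into one view; see patches 3
through 10 for the exact definitions, and note that later patches
also add reset, destroy, and open methods):

	struct rpcrdma_memreg_ops {
		int		(*ro_map)(struct rpcrdma_xprt *,
					  struct rpcrdma_mr_seg *, int, bool);
		void		(*ro_unmap)(struct rpcrdma_xprt *,
					    struct rpcrdma_req *, unsigned int);
		size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
		int		(*ro_init)(struct rpcrdma_xprt *);
		const char	*ro_displayname;
	};

Call sites then dispatch through the per-mode vector, for example:

	n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, writing);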

Two optimizations appear in this series:

The old code pre-allocated 64 MRs for every RPC and attached all 64
to each RPC before posting it. The new code attaches just enough MRs
to handle each RPC: when no data payload chunk is needed, no MRs are
attached at all. For modern HCAs, only one MR is needed for NFS
read or write data payloads.

The final patch in the series splits the rb_lock in two in order to
reduce lock contention.

The series is also available in the nfs-rdma-for-4.1 topic branch at

 git://linux-nfs.org/projects/cel/cel-2.6.git

---

Chuck Lever (16):
      xprtrdma: Display IPv6 addresses and port numbers correctly
      xprtrdma: Perform a full marshal on retransmit
      xprtrdma: Add vector of ops for each memory registration strategy
      xprtrdma: Add a "max_payload" op for each memreg mode
      xprtrdma: Add a "register_external" op for each memreg mode
      xprtrdma: Add a "deregister_external" op for each memreg mode
      xprtrdma: Add "init MRs" memreg op
      xprtrdma: Add "reset MRs" memreg op
      xprtrdma: Add "destroy MRs" memreg op
      xprtrdma: Add "open" memreg op
      xprtrdma: Handle non-SEND completions via a callout
      xprtrdma: Acquire FMRs in rpcrdma_fmr_register_external()
      xprtrdma: Acquire MRs in rpcrdma_register_external()
      xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy
      xprtrdma: Make rpcrdma_{un}map_one() into inline functions
      xprtrdma: Split rb_lock


 include/linux/sunrpc/xprtrdma.h    |    3 
 net/sunrpc/xprtrdma/Makefile       |    3 
 net/sunrpc/xprtrdma/fmr_ops.c      |  270 +++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |  485 ++++++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |  110 +++++
 net/sunrpc/xprtrdma/rpc_rdma.c     |   82 ++-
 net/sunrpc/xprtrdma/transport.c    |   65 ++-
 net/sunrpc/xprtrdma/verbs.c        |  856 +++---------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |  108 ++++-
 9 files changed, 1096 insertions(+), 886 deletions(-)
 create mode 100644 net/sunrpc/xprtrdma/fmr_ops.c
 create mode 100644 net/sunrpc/xprtrdma/frwr_ops.c
 create mode 100644 net/sunrpc/xprtrdma/physical_ops.c

--
Chuck Lever

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [PATCH v1 01/16] xprtrdma: Display IPv6 addresses and port numbers correctly
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
@ 2015-03-13 21:21 ` Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 02/16] xprtrdma: Perform a full marshal on retransmit Chuck Lever
                   ` (15 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:21 UTC (permalink / raw)
  To: linux-nfs

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/transport.c |   47 ++++++++++++++++++++++++++++++++-------
 net/sunrpc/xprtrdma/verbs.c     |   21 +++++++----------
 2 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2e192ba..26a62e7 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = {
 static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
 
 static void
+xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	char buf[20];
+
+	snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+}
+
+static void
+xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	char buf[40];
+
+	snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma6";
+}
+
+static void
 xprt_rdma_format_addresses(struct rpc_xprt *xprt)
 {
 	struct sockaddr *sap = (struct sockaddr *)
 					&rpcx_to_rdmad(xprt).addr;
-	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	char buf[64];
+	char buf[128];
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		xprt_rdma_format_addresses4(xprt, sap);
+		break;
+	case AF_INET6:
+		xprt_rdma_format_addresses6(xprt, sap);
+		break;
+	default:
+		pr_err("rpcrdma: Unrecognized address family\n");
+		return;
+	}
 
 	(void)rpc_ntop(sap, buf, sizeof(buf));
 	xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
 	snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
 	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
 
-	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
-
-	snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
-	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
-
 	snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
 	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
 
-	/* netid */
-	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
 }
 
 static void
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 124676c..1aa55b7 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/prefetch.h>
+#include <linux/sunrpc/addr.h>
 #include <asm/bitops.h>
 
 #include "xprt_rdma.h"
@@ -424,7 +425,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	struct rpcrdma_ep *ep = &xprt->rx_ep;
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
 #endif
 	struct ib_qp_attr *attr = &ia->ri_qp_attr;
 	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +481,8 @@ connected:
 		wake_up_all(&ep->rep_connect_wait);
 		/*FALLTHROUGH*/
 	default:
-		dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
-			__func__, &addr->sin_addr.s_addr,
-			ntohs(addr->sin_port), ep,
+		dprintk("RPC:       %s: %pIS:%u (ep 0x%p): %s\n",
+			__func__, sap, rpc_get_port(sap), ep,
 			CONNECTION_MSG(event->event));
 		break;
 	}
@@ -491,19 +491,16 @@ connected:
 	if (connstate == 1) {
 		int ird = attr->max_dest_rd_atomic;
 		int tird = ep->rep_remote_cma.responder_resources;
-		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
-			"on %s, memreg %d slots %d ird %d%s\n",
-			&addr->sin_addr.s_addr,
-			ntohs(addr->sin_port),
+
+		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg %d slots %d ird %d%s\n",
+			sap, rpc_get_port(sap),
 			ia->ri_id->device->name,
 			ia->ri_memreg_strategy,
 			xprt->rx_buf.rb_max_requests,
 			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
 	} else if (connstate < 0) {
-		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
-			&addr->sin_addr.s_addr,
-			ntohs(addr->sin_port),
-			connstate);
+		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
+			sap, rpc_get_port(sap), connstate);
 	}
 #endif
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 02/16] xprtrdma: Perform a full marshal on retransmit
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 01/16] xprtrdma: Display IPv6 addresses and port numbers correctly Chuck Lever
@ 2015-03-13 21:21 ` Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 03/16] xprtrdma: Add vector of ops for each memory registration strategy Chuck Lever
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:21 UTC (permalink / raw)
  To: linux-nfs

Commit 6ab59945f292 ("xprtrdma: Update rkeys after transport
reconnect") added logic in the ->send_request path to update the
chunk list when an RPC/RDMA request is retransmitted.

Note that rpc_xdr_encode() resets and re-encodes the entire RPC
send buffer for each retransmit of an RPC. The RPC send buffer
is not preserved from the previous transmission of an RPC.

Revert 6ab59945f292, and instead, just force each request to be
fully marshaled every time through ->send_request. This should
preserve the fix from 6ab59945f292, while also performing pullup
during retransmits.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |   71 ++++++++++++++++++---------------------
 net/sunrpc/xprtrdma/transport.c |    5 +--
 net/sunrpc/xprtrdma/xprt_rdma.h |   10 -----
 3 files changed, 34 insertions(+), 52 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 91ffde8..41456d9 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,6 +53,14 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
 	"pure inline",	/* no chunks */
@@ -284,28 +292,6 @@ out:
 }
 
 /*
- * Marshal chunks. This routine returns the header length
- * consumed by marshaling.
- *
- * Returns positive RPC/RDMA header size, or negative errno.
- */
-
-ssize_t
-rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
-{
-	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-	struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
-
-	if (req->rl_rtype != rpcrdma_noch)
-		result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
-					       headerp, req->rl_rtype);
-	else if (req->rl_wtype != rpcrdma_noch)
-		result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
-					       headerp, req->rl_wtype);
-	return result;
-}
-
-/*
  * Copy write data inline.
  * This function is used for "small" requests. Data which is passed
  * to RPC via iovecs (or page list) is copied directly into the
@@ -397,6 +383,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	char *base;
 	size_t rpclen, padlen;
 	ssize_t hdrlen;
+	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
 
 	/*
@@ -433,13 +420,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * into pages; otherwise use reply chunks.
 	 */
 	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-		req->rl_wtype = rpcrdma_noch;
+		wtype = rpcrdma_noch;
 	else if (rqst->rq_rcv_buf.page_len == 0)
-		req->rl_wtype = rpcrdma_replych;
+		wtype = rpcrdma_replych;
 	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-		req->rl_wtype = rpcrdma_writech;
+		wtype = rpcrdma_writech;
 	else
-		req->rl_wtype = rpcrdma_replych;
+		wtype = rpcrdma_replych;
 
 	/*
 	 * Chunks needed for arguments?
@@ -456,16 +443,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * TBD check NFSv4 setacl
 	 */
 	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-		req->rl_rtype = rpcrdma_noch;
+		rtype = rpcrdma_noch;
 	else if (rqst->rq_snd_buf.page_len == 0)
-		req->rl_rtype = rpcrdma_areadch;
+		rtype = rpcrdma_areadch;
 	else
-		req->rl_rtype = rpcrdma_readch;
+		rtype = rpcrdma_readch;
 
 	/* The following simplification is not true forever */
-	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
-		req->rl_wtype = rpcrdma_noch;
-	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
+	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+		wtype = rpcrdma_noch;
+	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
 		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
 			__func__);
 		return -EIO;
@@ -479,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * When padding is in use and applies to the transfer, insert
 	 * it and change the message type.
 	 */
-	if (req->rl_rtype == rpcrdma_noch) {
+	if (rtype == rpcrdma_noch) {
 
 		padlen = rpcrdma_inline_pullup(rqst,
 						RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +481,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
 			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
 			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-			if (req->rl_wtype != rpcrdma_noch) {
+			if (wtype != rpcrdma_noch) {
 				dprintk("RPC:       %s: invalid chunk list\n",
 					__func__);
 				return -EIO;
@@ -515,18 +502,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 			 * on receive. Therefore, we request a reply chunk
 			 * for non-writes wherever feasible and efficient.
 			 */
-			if (req->rl_wtype == rpcrdma_noch)
-				req->rl_wtype = rpcrdma_replych;
+			if (wtype == rpcrdma_noch)
+				wtype = rpcrdma_replych;
 		}
 	}
 
-	hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
+	if (rtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+					       headerp, rtype);
+		wtype = rtype;	/* simplify dprintk */
+
+	} else if (wtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+					       headerp, wtype);
+	}
 	if (hdrlen < 0)
 		return hdrlen;
 
 	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 	/*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 26a62e7..271d306 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -608,10 +608,7 @@ xprt_rdma_send_request(struct rpc_task *task)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	int rc = 0;
 
-	if (req->rl_niovs == 0)
-		rc = rpcrdma_marshal_req(rqst);
-	else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
-		rc = rpcrdma_marshal_chunks(rqst, 0);
+	rc = rpcrdma_marshal_req(rqst);
 	if (rc < 0)
 		goto failed_marshal;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0a16fb6..c8afd83 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -143,14 +143,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 	return (struct rpcrdma_msg *)rb->rg_base;
 }
 
-enum rpcrdma_chunktype {
-	rpcrdma_noch = 0,
-	rpcrdma_readch,
-	rpcrdma_areadch,
-	rpcrdma_writech,
-	rpcrdma_replych
-};
-
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -258,7 +250,6 @@ struct rpcrdma_req {
 	unsigned int	rl_niovs;	/* 0, 2 or 4 */
 	unsigned int	rl_nchunks;	/* non-zero if chunks */
 	unsigned int	rl_connect_cookie;	/* retry detection */
-	enum rpcrdma_chunktype	rl_rtype, rl_wtype;
 	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct ib_sge	rl_send_iov[4];	/* for active requests */
@@ -418,7 +409,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
-ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
 size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 03/16] xprtrdma: Add vector of ops for each memory registration strategy
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 01/16] xprtrdma: Display IPv6 addresses and port numbers correctly Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 02/16] xprtrdma: Perform a full marshal on retransmit Chuck Lever
@ 2015-03-13 21:21 ` Chuck Lever
  2015-03-13 21:21 ` [PATCH v1 04/16] xprtrdma: Add a "max_payload" op for each memreg mode Chuck Lever
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:21 UTC (permalink / raw)
  To: linux-nfs

Instead of employing switch() statements, let's use the typical
Linux kernel idiom for handling behavioral variation: virtual
functions.

Define a vector of operations for each supported memory registration
mode, and add a source file for each mode.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/Makefile       |    3 ++-
 net/sunrpc/xprtrdma/fmr_ops.c      |   22 ++++++++++++++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   22 ++++++++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |   24 ++++++++++++++++++++++++
 net/sunrpc/xprtrdma/verbs.c        |   11 +++++++----
 net/sunrpc/xprtrdma/xprt_rdma.h    |   12 ++++++++++++
 6 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 net/sunrpc/xprtrdma/fmr_ops.c
 create mode 100644 net/sunrpc/xprtrdma/frwr_ops.c
 create mode 100644 net/sunrpc/xprtrdma/physical_ops.c

diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index da5136f..579f72b 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,6 +1,7 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
 
-xprtrdma-y := transport.o rpc_rdma.o verbs.o
+xprtrdma-y := transport.o rpc_rdma.o verbs.o \
+	fmr_ops.o frwr_ops.o physical_ops.o
 
 obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
 
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 0000000..ffb7d93
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Memory Regions (FMR).
+ * Referred to sometimes as MTHCAFMR mode.
+ *
+ * FMR uses synchronous memory registration and deregistration.
+ * FMR registration is known to be fast, but FMR deregistration
+ * can take tens of usecs to complete.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
+	.ro_displayname			= "fmr",
+};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 0000000..79173f9
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Registration Work
+ * Requests (FRWR). Also referred to sometimes as FRMR mode.
+ *
+ * FRWR features ordered asynchronous registration and deregistration
+ * of arbitrarily sized memory regions. This is the fastest and safest
+ * but most complex memory registration mode.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+	.ro_displayname			= "frwr",
+};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 0000000..b0922ac
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* No-op chunk preparation. All client memory is pre-registered.
+ * Sometimes referred to as ALLPHYSICAL mode.
+ *
+ * Physical registration is simple because all client memory is
+ * pre-registered and never deregistered. This mode is good for
+ * adapter bring up, but is considered not safe: the server is
+ * trusted not to abuse its access to client memory not involved
+ * in RDMA I/O.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+	.ro_displayname			= "physical",
+};
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 1aa55b7..e4d9d9c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -492,10 +492,10 @@ connected:
 		int ird = attr->max_dest_rd_atomic;
 		int tird = ep->rep_remote_cma.responder_resources;
 
-		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg %d slots %d ird %d%s\n",
+		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
 			sap, rpc_get_port(sap),
 			ia->ri_id->device->name,
-			ia->ri_memreg_strategy,
+			ia->ri_ops->ro_displayname,
 			xprt->rx_buf.rb_max_requests,
 			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
 	} else if (connstate < 0) {
@@ -649,13 +649,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	 */
 	switch (memreg) {
 	case RPCRDMA_FRMR:
+		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
 		break;
 	case RPCRDMA_ALLPHYSICAL:
+		ia->ri_ops = &rpcrdma_physical_memreg_ops;
 		mem_priv = IB_ACCESS_LOCAL_WRITE |
 				IB_ACCESS_REMOTE_WRITE |
 				IB_ACCESS_REMOTE_READ;
 		goto register_setup;
 	case RPCRDMA_MTHCAFMR:
+		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
 		if (ia->ri_have_dma_lkey)
 			break;
 		mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -675,8 +678,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		rc = -ENOMEM;
 		goto out3;
 	}
-	dprintk("RPC:       %s: memory registration strategy is %d\n",
-		__func__, memreg);
+	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
+		__func__, ia->ri_ops->ro_displayname);
 
 	/* Else will do memory reg/dereg for each chunk */
 	ia->ri_memreg_strategy = memreg;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c8afd83..ef3cf4a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -60,6 +60,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+	const struct rpcrdma_memreg_ops	*ri_ops;
 	rwlock_t		ri_qplock;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
@@ -331,6 +332,17 @@ struct rpcrdma_stats {
 };
 
 /*
+ * Per-registration mode operations
+ */
+struct rpcrdma_memreg_ops {
+	const char	*ro_displayname;
+};
+
+extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
+
+/*
  * RPCRDMA transport -- encapsulates the structures above for
  * integration with RPC.
  *


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 04/16] xprtrdma: Add a "max_payload" op for each memreg mode
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (2 preceding siblings ...)
  2015-03-13 21:21 ` [PATCH v1 03/16] xprtrdma: Add vector of ops for each memory registration strategy Chuck Lever
@ 2015-03-13 21:21 ` Chuck Lever
  2015-03-13 21:22 ` [PATCH v1 05/16] xprtrdma: Add a "register_external" " Chuck Lever
                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:21 UTC (permalink / raw)
  To: linux-nfs

The max_payload computation is generalized to ensure that the
payload maximum is the lesser of RPCRDMA_MAX_DATA_SEGS and the
number of data pages that can be conveyed by the chunk segments
that fit in an inline buffer.
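
As a worked example (assuming a 1024-byte inline threshold, a
28-byte minimal RPC/RDMA header, and 16-byte chunk segments; exact
values depend on the tree):

	(1024 - 28) / 16 = 62 segments fit inline,
	rounded down to a power of two = 32 segments

so physical mode (one page per segment) advertises
min(RPCRDMA_MAX_DATA_SEGS, 32) pages, while FMR mode (up to 64
pages per segment) advertises min(RPCRDMA_MAX_DATA_SEGS, 32 * 64)
pages.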

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   13 ++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   13 ++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |   10 +++++++
 net/sunrpc/xprtrdma/transport.c    |    5 +++-
 net/sunrpc/xprtrdma/verbs.c        |   49 +++++++++++-------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    5 +++-
 6 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index ffb7d93..eec2660 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -17,6 +17,19 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+/* Maximum scatter/gather per FMR */
+#define RPCRDMA_MAX_FMR_SGES	(64)
+
+/* FMR mode conveys up to 64 pages of payload per chunk segment.
+ */
+static size_t
+fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+		     rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
+	.ro_maxpages			= fmr_op_maxpages,
 	.ro_displayname			= "fmr",
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 79173f9..73a5ac8 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -17,6 +17,19 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+/* FRWR mode conveys a list of pages per chunk segment. The
+ * maximum length of that list is the FRWR page list depth.
+ */
+static size_t
+frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+	.ro_maxpages			= frwr_op_maxpages,
 	.ro_displayname			= "frwr",
 };
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index b0922ac..28ade19 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -19,6 +19,16 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+/* PHYSICAL memory registration conveys one page per chunk segment.
+ */
+static size_t
+physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+		     rpcrdma_max_segments(r_xprt));
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+	.ro_maxpages			= physical_op_maxpages,
 	.ro_displayname			= "physical",
 };
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 271d306..9a9da40 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -406,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
 			  xprt_rdma_connect_worker);
 
 	xprt_rdma_format_addresses(xprt);
-	xprt->max_payload = rpcrdma_max_payload(new_xprt);
+	xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
+	if (xprt->max_payload == 0)
+		goto out4;
+	xprt->max_payload <<= PAGE_SHIFT;
 	dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
 		__func__, xprt->max_payload);
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e4d9d9c..837d4ea 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -2215,43 +2215,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 	return rc;
 }
 
-/* Physical mapping means one Read/Write list entry per-page.
- * All list entries must fit within an inline buffer
- *
- * NB: The server must return a Write list for NFS READ,
- *     which has the same constraint. Factor in the inline
- *     rsize as well.
+/* How many chunk list items fit within our inline buffers?
  */
-static size_t
-rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+unsigned int
+rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-	unsigned int inline_size, pages;
-
-	inline_size = min_t(unsigned int,
-			    cdata->inline_wsize, cdata->inline_rsize);
-	inline_size -= RPCRDMA_HDRLEN_MIN;
-	pages = inline_size / sizeof(struct rpcrdma_segment);
-	return pages << PAGE_SHIFT;
-}
+	int bytes, segments;
 
-static size_t
-rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
-}
-
-size_t
-rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-	size_t result;
-
-	switch (r_xprt->rx_ia.ri_memreg_strategy) {
-	case RPCRDMA_ALLPHYSICAL:
-		result = rpcrdma_physical_max_payload(r_xprt);
-		break;
-	default:
-		result = rpcrdma_mr_max_payload(r_xprt);
+	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
+	bytes -= RPCRDMA_HDRLEN_MIN;
+	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
+		pr_warn("RPC:       %s: inline threshold too small\n",
+			__func__);
+		return 0;
 	}
-	return result;
+
+	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
+	dprintk("RPC:       %s: max chunk list size = %d segments\n",
+		__func__, segments);
+	return segments;
 }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ef3cf4a..59e627e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -334,7 +334,9 @@ struct rpcrdma_stats {
 /*
  * Per-registration mode operations
  */
+struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
+	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	const char	*ro_displayname;
 };
 
@@ -411,6 +413,8 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 			 struct rpcrdma_regbuf *);
 
+unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
  */
@@ -422,7 +426,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
 int rpcrdma_marshal_req(struct rpc_rqst *);
-size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 05/16] xprtrdma: Add a "register_external" op for each memreg mode
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (3 preceding siblings ...)
  2015-03-13 21:21 ` [PATCH v1 04/16] xprtrdma: Add a "max_payload" op for each memreg mode Chuck Lever
@ 2015-03-13 21:22 ` Chuck Lever
  2015-03-13 21:22 ` [PATCH v1 06/16] xprtrdma: Add a "deregister_external" " Chuck Lever
                   ` (11 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:22 UTC (permalink / raw)
  To: linux-nfs

There is very little common processing among the different external
memory registration functions. Have rpcrdma_create_chunks() call
the registration method directly. This removes a stack frame and a
switch statement from the external registration path.
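
The chunk-building loop then dispatches through the method vector
(excerpted from the rpc_rdma.c hunk below):

	map = r_xprt->rx_ia.ri_ops->ro_map;
	do {
		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
		if (n <= 0)
			goto out;
		...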

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   51 +++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   88 ++++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |   17 ++++
 net/sunrpc/xprtrdma/rpc_rdma.c     |    5 +
 net/sunrpc/xprtrdma/verbs.c        |  172 +-----------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    6 +
 6 files changed, 166 insertions(+), 173 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index eec2660..45fb646 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -29,7 +29,58 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
 }
 
+/* Use the ib_map_phys_fmr() verb to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+	   int nsegs, bool writing)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_mw *mw = seg1->rl_mw;
+	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+	int len, pageoff, i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (nsegs > RPCRDMA_MAX_FMR_SGES)
+		nsegs = RPCRDMA_MAX_FMR_SGES;
+	for (i = 0; i < nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		physaddrs[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+
+	rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+	if (rc)
+		goto out_maperr;
+
+	seg1->mr_rkey = mw->r.fmr->rkey;
+	seg1->mr_base = seg1->mr_dma + pageoff;
+	seg1->mr_nsegs = i;
+	seg1->mr_len = len;
+	return i;
+
+out_maperr:
+	dprintk("RPC:       %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+		__func__, len, (unsigned long long)seg1->mr_dma,
+		pageoff, i, rc);
+	while (i--)
+		rpcrdma_unmap_one(ia, --seg);
+	return rc;
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
+	.ro_map				= fmr_op_map,
 	.ro_maxpages			= fmr_op_maxpages,
 	.ro_displayname			= "fmr",
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 73a5ac8..2b5ccb0 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -29,7 +29,95 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
 }
 
+/* Post a FAST_REG Work Request to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+	    int nsegs, bool writing)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_mw *mw = seg1->rl_mw;
+	struct rpcrdma_frmr *frmr = &mw->r.frmr;
+	struct ib_mr *mr = frmr->fr_mr;
+	struct ib_send_wr fastreg_wr, *bad_wr;
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+	int seg_len;
+	u64 pa;
+	int page_no;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (nsegs > ia->ri_max_frmr_depth)
+		nsegs = ia->ri_max_frmr_depth;
+	for (page_no = i = 0; i < nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		pa = seg->mr_dma;
+		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+			frmr->fr_pgl->page_list[page_no++] = pa;
+			pa += PAGE_SIZE;
+		}
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
+		__func__, mw, i);
+
+	frmr->fr_state = FRMR_IS_VALID;
+
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.wr_id = (unsigned long)(void *)mw;
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.page_list_len = page_no;
+	fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+	if (fastreg_wr.wr.fast_reg.length < len) {
+		rc = -EIO;
+		goto out_err;
+	}
+	fastreg_wr.wr.fast_reg.access_flags = writing ?
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ;
+	key = (u8)(mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(mr, ++key);
+	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+	if (rc)
+		goto out_senderr;
+
+	seg1->mr_rkey = mr->rkey;
+	seg1->mr_base = seg1->mr_dma + pageoff;
+	seg1->mr_nsegs = i;
+	seg1->mr_len = len;
+	return i;
+
+out_senderr:
+	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+	ib_update_fast_reg_key(mr, --key);
+
+out_err:
+	frmr->fr_state = FRMR_IS_INVALID;
+	while (i--)
+		rpcrdma_unmap_one(ia, --seg);
+	return rc;
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+	.ro_map				= frwr_op_map,
 	.ro_maxpages			= frwr_op_maxpages,
 	.ro_displayname			= "frwr",
 };
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 28ade19..5a284ee 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -28,7 +28,24 @@ physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt));
 }
 
+/* The client's physical memory is already exposed for
+ * remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+		int nsegs, bool writing)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+	rpcrdma_map_one(ia, seg, writing);
+	seg->mr_rkey = ia->ri_bind_mem->rkey;
+	seg->mr_base = seg->mr_dma;
+	seg->mr_nsegs = 1;
+	return 1;
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+	.ro_map				= physical_op_map,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_displayname			= "physical",
 };
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 41456d9..6ab1d03 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -187,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	struct rpcrdma_write_array *warray = NULL;
 	struct rpcrdma_write_chunk *cur_wchunk = NULL;
 	__be32 *iptr = headerp->rm_body.rm_chunks;
+	int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
 
 	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
 		/* a read chunk - server will RDMA Read our memory */
@@ -209,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	if (nsegs < 0)
 		return nsegs;
 
+	map = r_xprt->rx_ia.ri_ops->ro_map;
 	do {
-		n = rpcrdma_register_external(seg, nsegs,
-						cur_wchunk != NULL, r_xprt);
+		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
 		if (n <= 0)
 			goto out;
 		if (cur_rchunk) {	/* read */
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 837d4ea..851ed97 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1857,8 +1857,8 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
  * Wrappers for chunk registration, shared by read/write chunk code.
  */
 
-static void
-rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
+void
+rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, bool writing)
 {
 	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 	seg->mr_dmalen = seg->mr_len;
@@ -1878,7 +1878,7 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
 	}
 }
 
-static void
+void
 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
 {
 	if (seg->mr_page)
@@ -1890,93 +1890,6 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
 }
 
 static int
-rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
-			int *nsegs, int writing, struct rpcrdma_ia *ia,
-			struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	struct rpcrdma_mw *mw = seg1->rl_mw;
-	struct rpcrdma_frmr *frmr = &mw->r.frmr;
-	struct ib_mr *mr = frmr->fr_mr;
-	struct ib_send_wr fastreg_wr, *bad_wr;
-	u8 key;
-	int len, pageoff;
-	int i, rc;
-	int seg_len;
-	u64 pa;
-	int page_no;
-
-	pageoff = offset_in_page(seg1->mr_offset);
-	seg1->mr_offset -= pageoff;	/* start of page */
-	seg1->mr_len += pageoff;
-	len = -pageoff;
-	if (*nsegs > ia->ri_max_frmr_depth)
-		*nsegs = ia->ri_max_frmr_depth;
-	for (page_no = i = 0; i < *nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
-		pa = seg->mr_dma;
-		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-			frmr->fr_pgl->page_list[page_no++] = pa;
-			pa += PAGE_SIZE;
-		}
-		len += seg->mr_len;
-		++seg;
-		++i;
-		/* Check for holes */
-		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-			break;
-	}
-	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-		__func__, mw, i);
-
-	frmr->fr_state = FRMR_IS_VALID;
-
-	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
-	fastreg_wr.wr_id = (unsigned long)(void *)mw;
-	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
-	fastreg_wr.wr.fast_reg.page_list_len = page_no;
-	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-	fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-	if (fastreg_wr.wr.fast_reg.length < len) {
-		rc = -EIO;
-		goto out_err;
-	}
-
-	/* Bump the key */
-	key = (u8)(mr->rkey & 0x000000FF);
-	ib_update_fast_reg_key(mr, ++key);
-
-	fastreg_wr.wr.fast_reg.access_flags = (writing ?
-				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
-				IB_ACCESS_REMOTE_READ);
-	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
-	DECR_CQCOUNT(&r_xprt->rx_ep);
-
-	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
-	if (rc) {
-		dprintk("RPC:       %s: failed ib_post_send for register,"
-			" status %i\n", __func__, rc);
-		ib_update_fast_reg_key(mr, --key);
-		goto out_err;
-	} else {
-		seg1->mr_rkey = mr->rkey;
-		seg1->mr_base = seg1->mr_dma + pageoff;
-		seg1->mr_nsegs = i;
-		seg1->mr_len = len;
-	}
-	*nsegs = i;
-	return 0;
-out_err:
-	frmr->fr_state = FRMR_IS_INVALID;
-	while (i--)
-		rpcrdma_unmap_one(ia, --seg);
-	return rc;
-}
-
-static int
 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
 			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
 {
@@ -2007,49 +1920,6 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
 }
 
 static int
-rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
-			int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
-	int len, pageoff, i, rc;
-
-	pageoff = offset_in_page(seg1->mr_offset);
-	seg1->mr_offset -= pageoff;	/* start of page */
-	seg1->mr_len += pageoff;
-	len = -pageoff;
-	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-		*nsegs = RPCRDMA_MAX_DATA_SEGS;
-	for (i = 0; i < *nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
-		physaddrs[i] = seg->mr_dma;
-		len += seg->mr_len;
-		++seg;
-		++i;
-		/* Check for holes */
-		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-			break;
-	}
-	rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
-	if (rc) {
-		dprintk("RPC:       %s: failed ib_map_phys_fmr "
-			"%u@0x%llx+%i (%d)... status %i\n", __func__,
-			len, (unsigned long long)seg1->mr_dma,
-			pageoff, i, rc);
-		while (i--)
-			rpcrdma_unmap_one(ia, --seg);
-	} else {
-		seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
-		seg1->mr_base = seg1->mr_dma + pageoff;
-		seg1->mr_nsegs = i;
-		seg1->mr_len = len;
-	}
-	*nsegs = i;
-	return rc;
-}
-
-static int
 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
 			struct rpcrdma_ia *ia)
 {
@@ -2070,42 +1940,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
 }
 
 int
-rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
-			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	int rc = 0;
-
-	switch (ia->ri_memreg_strategy) {
-
-	case RPCRDMA_ALLPHYSICAL:
-		rpcrdma_map_one(ia, seg, writing);
-		seg->mr_rkey = ia->ri_bind_mem->rkey;
-		seg->mr_base = seg->mr_dma;
-		seg->mr_nsegs = 1;
-		nsegs = 1;
-		break;
-
-	/* Registration using frmr registration */
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
-		break;
-
-	/* Registration using fmr memory registration */
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
-		break;
-
-	default:
-		return -EIO;
-	}
-	if (rc)
-		return rc;
-
-	return nsegs;
-}
-
-int
 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 		struct rpcrdma_xprt *r_xprt)
 {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 59e627e..7bf077b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -336,6 +336,8 @@ struct rpcrdma_stats {
  */
 struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
+	int		(*ro_map)(struct rpcrdma_xprt *,
+				  struct rpcrdma_mr_seg *, int, bool);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	const char	*ro_displayname;
 };
@@ -403,8 +405,6 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
-int rpcrdma_register_external(struct rpcrdma_mr_seg *,
-				int, int, struct rpcrdma_xprt *);
 int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
 				struct rpcrdma_xprt *);
 
@@ -414,6 +414,8 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 			 struct rpcrdma_regbuf *);
 
 unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+void rpcrdma_map_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *, bool);
+void rpcrdma_unmap_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *);
 
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 06/16] xprtrdma: Add a "deregister_external" op for each memreg mode
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (4 preceding siblings ...)
  2015-03-13 21:22 ` [PATCH v1 05/16] xprtrdma: Add a "register_external" " Chuck Lever
@ 2015-03-13 21:22 ` Chuck Lever
  2015-03-17 14:37   ` Anna Schumaker
  2015-03-13 21:22 ` [PATCH v1 07/16] xprtrdma: Add "init MRs" memreg op Chuck Lever
                   ` (10 subsequent siblings)
  16 siblings, 1 reply; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:22 UTC (permalink / raw)
  To: linux-nfs

There is very little common processing among the different external
memory deregistration functions.

In addition, instead of calling the deregistration function for each
segment, have one call release all segments for a request. This makes
the API a little asymmetrical, but a hair faster.
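
The caller side then shrinks to one call per request, for example in
xprt_rdma_free() (excerpted from the transport.c hunk below):

	r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, req->rl_nchunks);
	req->rl_nchunks = 0;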

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   37 ++++++++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   46 ++++++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |   13 ++++++
 net/sunrpc/xprtrdma/rpc_rdma.c     |    7 +--
 net/sunrpc/xprtrdma/transport.c    |    8 +---
 net/sunrpc/xprtrdma/verbs.c        |   81 ------------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    5 +-
 7 files changed, 103 insertions(+), 94 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 45fb646..9b983b4 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -20,6 +20,32 @@
 /* Maximum scatter/gather per FMR */
 #define RPCRDMA_MAX_FMR_SGES	(64)
 
+/* Use the ib_unmap_fmr() verb to prevent further remote
+ * access via RDMA READ or RDMA WRITE.
+ */
+static int
+__fmr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int rc, nsegs = seg->mr_nsegs;
+	LIST_HEAD(l);
+
+	list_add(&seg1->rl_mw->r.fmr->list, &l);
+	rc = ib_unmap_fmr(&l);
+	read_lock(&ia->ri_qplock);
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+	read_unlock(&ia->ri_qplock);
+	if (rc)
+		goto out_err;
+	return nsegs;
+
+out_err:
+	dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
+	return nsegs;
+}
+
 /* FMR mode conveys up to 64 pages of payload per chunk segment.
  */
 static size_t
@@ -79,8 +105,19 @@ out_maperr:
 	return rc;
 }
 
+static void
+fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+	     unsigned int count)
+{
+	unsigned int i;
+
+	for (i = 0; count--;)
+		i += __fmr_unmap(r_xprt, &req->rl_segments[i]);
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
+	.ro_unmap			= fmr_op_unmap,
 	.ro_maxpages			= fmr_op_maxpages,
 	.ro_displayname			= "fmr",
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 2b5ccb0..05b5761 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -17,6 +17,41 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+/* Post a LOCAL_INV Work Request to prevent further remote access
+ * via RDMA READ or RDMA WRITE.
+ */
+static int
+__frwr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc, nsegs = seg->mr_nsegs;
+
+	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+
+	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	read_lock(&ia->ri_qplock);
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	read_unlock(&ia->ri_qplock);
+	if (rc)
+		goto out_err;
+	return nsegs;
+
+out_err:
+	/* Force rpcrdma_buffer_get() to retry */
+	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+	return nsegs;
+}
+
 /* FRWR mode conveys a list of pages per chunk segment. The
  * maximum length of that list is the FRWR page list depth.
  */
@@ -116,8 +151,19 @@ out_err:
 	return rc;
 }
 
+static void
+frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+	      unsigned int count)
+{
+	unsigned int i;
+
+	for (i = 0; count--;)
+		i += __frwr_unmap(r_xprt, &req->rl_segments[i]);
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
+	.ro_unmap			= frwr_op_unmap,
 	.ro_maxpages			= frwr_op_maxpages,
 	.ro_displayname			= "frwr",
 };
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 5a284ee..f2c15be 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -44,8 +44,21 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	return 1;
 }
 
+/* Unmap a memory region, but leave it registered.
+ */
+static void
+physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		  unsigned int count)
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		rpcrdma_unmap_one(&r_xprt->rx_ia, &req->rl_segments[i]);
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
+	.ro_unmap			= physical_op_unmap,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_displayname			= "physical",
 };
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 6ab1d03..7b51d9d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -284,11 +284,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
-		for (pos = 0; nchunks--;)
-			pos += rpcrdma_deregister_external(
-					&req->rl_segments[pos], r_xprt);
-	}
+	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR)
+		r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, nchunks);
 	return n;
 }
 
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9a9da40..c484671 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -571,7 +571,6 @@ xprt_rdma_free(void *buffer)
 	struct rpcrdma_req *req;
 	struct rpcrdma_xprt *r_xprt;
 	struct rpcrdma_regbuf *rb;
-	int i;
 
 	if (buffer == NULL)
 		return;
@@ -582,11 +581,8 @@ xprt_rdma_free(void *buffer)
 
 	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-	for (i = 0; req->rl_nchunks;) {
-		--req->rl_nchunks;
-		i += rpcrdma_deregister_external(
-			&req->rl_segments[i], r_xprt);
-	}
+	r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, req->rl_nchunks);
+	req->rl_nchunks = 0;
 
 	rpcrdma_buffer_put(req);
 }
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 851ed97..a1621fd 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1509,7 +1509,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
 	}
 }
 
-/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+/* rpcrdma_unmap_one() was already done during deregistration.
  * Redo only the ib_post_send().
  */
 static void
@@ -1889,85 +1889,6 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
 				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
 }
 
-static int
-rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
-			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	struct ib_send_wr invalidate_wr, *bad_wr;
-	int rc;
-
-	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
-
-	memset(&invalidate_wr, 0, sizeof invalidate_wr);
-	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
-	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
-	DECR_CQCOUNT(&r_xprt->rx_ep);
-
-	read_lock(&ia->ri_qplock);
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
-	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-	read_unlock(&ia->ri_qplock);
-	if (rc) {
-		/* Force rpcrdma_buffer_get() to retry */
-		seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
-		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
-			" status %i\n", __func__, rc);
-	}
-	return rc;
-}
-
-static int
-rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
-			struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	LIST_HEAD(l);
-	int rc;
-
-	list_add(&seg1->rl_mw->r.fmr->list, &l);
-	rc = ib_unmap_fmr(&l);
-	read_lock(&ia->ri_qplock);
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
-	read_unlock(&ia->ri_qplock);
-	if (rc)
-		dprintk("RPC:       %s: failed ib_unmap_fmr,"
-			" status %i\n", __func__, rc);
-	return rc;
-}
-
-int
-rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
-		struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	int nsegs = seg->mr_nsegs, rc;
-
-	switch (ia->ri_memreg_strategy) {
-
-	case RPCRDMA_ALLPHYSICAL:
-		read_lock(&ia->ri_qplock);
-		rpcrdma_unmap_one(ia, seg);
-		read_unlock(&ia->ri_qplock);
-		break;
-
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
-		break;
-
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_deregister_fmr_external(seg, ia);
-		break;
-
-	default:
-		break;
-	}
-	return nsegs;
-}
-
 /*
  * Prepost any receive buffer, then post send.
  *
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 7bf077b..3aabbb2 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -338,6 +338,8 @@ struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
 	int		(*ro_map)(struct rpcrdma_xprt *,
 				  struct rpcrdma_mr_seg *, int, bool);
+	void		(*ro_unmap)(struct rpcrdma_xprt *,
+				    struct rpcrdma_req *, unsigned int);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	const char	*ro_displayname;
 };
@@ -405,9 +407,6 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
-int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
-				struct rpcrdma_xprt *);
-
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 					    size_t, gfp_t);
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 07/16] xprtrdma: Add "init MRs" memreg op
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (5 preceding siblings ...)
  2015-03-13 21:22 ` [PATCH v1 06/16] xprtrdma: Add a "deregister_external" " Chuck Lever
@ 2015-03-13 21:22 ` Chuck Lever
  2015-03-13 21:22 ` [PATCH v1 08/16] xprtrdma: Add "reset " Chuck Lever
                   ` (9 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:22 UTC (permalink / raw)
  To: linux-nfs

This method is used when setting up a new transport instance to
create a pool of Memory Region objects that will be used to register
memory during operation.

Memory Regions are not needed for "physical" registration, so the
"init MRs" method is essentially a no-op for that mode.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   42 +++++++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   66 +++++++++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |    7 ++
 net/sunrpc/xprtrdma/verbs.c        |  104 +-----------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    1 
 5 files changed, 119 insertions(+), 101 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 9b983b4..1501db0 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -55,6 +55,47 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
 }
 
+static int
+fmr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+	struct ib_fmr_attr fmr_attr = {
+		.max_pages	= RPCRDMA_MAX_FMR_SGES,
+		.max_maps	= 1,
+		.page_shift	= PAGE_SHIFT
+	};
+	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+	struct rpcrdma_mw *r;
+	int i, rc;
+
+	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_all);
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
+
+	while (i--) {
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (!r)
+			return -ENOMEM;
+
+		r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+		if (IS_ERR(r->r.fmr))
+			goto out_fmr_err;
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+	return 0;
+
+out_fmr_err:
+	rc = PTR_ERR(r->r.fmr);
+	dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
+	kfree(r);
+	return rc;
+}
+
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -119,5 +160,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap			= fmr_op_unmap,
 	.ro_maxpages			= fmr_op_maxpages,
+	.ro_init			= fmr_op_init,
 	.ro_displayname			= "fmr",
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 05b5761..975372c 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -17,6 +17,35 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+static int
+__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
+	    unsigned int depth)
+{
+	struct rpcrdma_frmr *f = &r->r.frmr;
+	int rc;
+
+	f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+	if (IS_ERR(f->fr_mr))
+		goto out_mr_err;
+	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
+	if (IS_ERR(f->fr_pgl))
+		goto out_list_err;
+	return 0;
+
+out_mr_err:
+	rc = PTR_ERR(f->fr_mr);
+	dprintk("RPC:       %s: ib_alloc_fast_reg_mr status %i\n",
+		__func__, rc);
+	return rc;
+
+out_list_err:
+	rc = PTR_ERR(f->fr_pgl);
+	dprintk("RPC:       %s: ib_alloc_fast_reg_page_list status %i\n",
+		__func__, rc);
+	ib_dereg_mr(f->fr_mr);
+	return rc;
+}
+
 /* Post a LOCAL_INV Work Request to prevent further remote access
  * via RDMA READ or RDMA WRITE.
  */
@@ -64,6 +93,42 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
 }
 
+static int
+frwr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+	int i;
+
+	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_all);
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
+
+	while (i--) {
+		struct rpcrdma_mw *r;
+		int rc;
+
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (!r)
+			return -ENOMEM;
+
+		rc = __frwr_init(r, pd, device, depth);
+		if (rc) {
+			kfree(r);
+			return rc;
+		}
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+
+	return 0;
+}
+
 /* Post a FAST_REG Work Request to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -165,5 +230,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap			= frwr_op_unmap,
 	.ro_maxpages			= frwr_op_maxpages,
+	.ro_init			= frwr_op_init,
 	.ro_displayname			= "frwr",
 };
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index f2c15be..ae2b0bc 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -28,6 +28,12 @@ physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt));
 }
 
+static int
+physical_op_init(struct rpcrdma_xprt *r_xprt)
+{
+	return 0;
+}
+
 /* The client's physical memory is already exposed for
  * remote access via RDMA READ or RDMA WRITE.
  */
@@ -60,5 +66,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
 	.ro_unmap			= physical_op_unmap,
 	.ro_maxpages			= physical_op_maxpages,
+	.ro_init			= physical_op_init,
 	.ro_displayname			= "physical",
 };
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a1621fd..d7810d6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1123,91 +1123,6 @@ out:
 	return ERR_PTR(rc);
 }
 
-static int
-rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
-	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
-	struct ib_fmr_attr fmr_attr = {
-		.max_pages	= RPCRDMA_MAX_DATA_SEGS,
-		.max_maps	= 1,
-		.page_shift	= PAGE_SHIFT
-	};
-	struct rpcrdma_mw *r;
-	int i, rc;
-
-	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
-
-	while (i--) {
-		r = kzalloc(sizeof(*r), GFP_KERNEL);
-		if (r == NULL)
-			return -ENOMEM;
-
-		r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
-		if (IS_ERR(r->r.fmr)) {
-			rc = PTR_ERR(r->r.fmr);
-			dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
-				__func__, rc);
-			goto out_free;
-		}
-
-		list_add(&r->mw_list, &buf->rb_mws);
-		list_add(&r->mw_all, &buf->rb_all);
-	}
-	return 0;
-
-out_free:
-	kfree(r);
-	return rc;
-}
-
-static int
-rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_frmr *f;
-	struct rpcrdma_mw *r;
-	int i, rc;
-
-	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
-
-	while (i--) {
-		r = kzalloc(sizeof(*r), GFP_KERNEL);
-		if (r == NULL)
-			return -ENOMEM;
-		f = &r->r.frmr;
-
-		f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-						ia->ri_max_frmr_depth);
-		if (IS_ERR(f->fr_mr)) {
-			rc = PTR_ERR(f->fr_mr);
-			dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
-				"failed %i\n", __func__, rc);
-			goto out_free;
-		}
-
-		f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
-							ia->ri_max_frmr_depth);
-		if (IS_ERR(f->fr_pgl)) {
-			rc = PTR_ERR(f->fr_pgl);
-			dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
-				"failed %i\n", __func__, rc);
-
-			ib_dereg_mr(f->fr_mr);
-			goto out_free;
-		}
-
-		list_add(&r->mw_list, &buf->rb_mws);
-		list_add(&r->mw_all, &buf->rb_all);
-	}
-
-	return 0;
-
-out_free:
-	kfree(r);
-	return rc;
-}
-
 int
 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 {
@@ -1244,22 +1159,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
 	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
 
-	INIT_LIST_HEAD(&buf->rb_mws);
-	INIT_LIST_HEAD(&buf->rb_all);
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_init_frmrs(ia, buf);
-		if (rc)
-			goto out;
-		break;
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_init_fmrs(ia, buf);
-		if (rc)
-			goto out;
-		break;
-	default:
-		break;
-	}
+	rc = ia->ri_ops->ro_init(r_xprt);
+	if (rc)
+		goto out;
 
 	for (i = 0; i < buf->rb_max_requests; i++) {
 		struct rpcrdma_req *req;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 3aabbb2..4fe3c38 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -341,6 +341,7 @@ struct rpcrdma_memreg_ops {
 	void		(*ro_unmap)(struct rpcrdma_xprt *,
 				    struct rpcrdma_req *, unsigned int);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
+	int		(*ro_init)(struct rpcrdma_xprt *);
 	const char	*ro_displayname;
 };
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 08/16] xprtrdma: Add "reset MRs" memreg op
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (6 preceding siblings ...)
  2015-03-13 21:22 ` [PATCH v1 07/16] xprtrdma: Add "init MRs" memreg op Chuck Lever
@ 2015-03-13 21:22 ` Chuck Lever
  2015-03-13 21:22 ` [PATCH v1 09/16] xprtrdma: Add "destroy " Chuck Lever
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:22 UTC (permalink / raw)
  To: linux-nfs

This method is invoked when a transport instance is about to be
reconnected. Each Memory Region object is reset to its initial
state.
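
For illustration, the reconnect path after this patch looks roughly
like the following (a simplified sketch of the verbs.c hunk below):

	/* rpcrdma_ep_connect() retry path: reset MRs through the
	 * ops vector rather than a per-mode switch.
	 */
	rpcrdma_ep_disconnect(ep, ia);
	rpcrdma_flush_cqs(ep);
	ia->ri_ops->ro_reset(xprt);	/* unmap FMRs, or re-init flushed FRMRs */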

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   23 ++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   46 ++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |    6 ++
 net/sunrpc/xprtrdma/verbs.c        |  103 +-----------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    1 
 5 files changed, 78 insertions(+), 101 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1501db0..1ccb3de 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -156,10 +156,33 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		i += __fmr_unmap(r_xprt, &req->rl_segments[i]);
 }
 
+/* After a disconnect, unmap all FMRs.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ */
+static void
+fmr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_mw *r;
+	LIST_HEAD(list);
+	int rc;
+
+	list_for_each_entry(r, &buf->rb_all, mw_all)
+		list_add(&r->r.fmr->list, &list);
+
+	rc = ib_unmap_fmr(&list);
+	if (rc)
+		dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
+			__func__, rc);
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap			= fmr_op_unmap,
 	.ro_maxpages			= fmr_op_maxpages,
 	.ro_init			= fmr_op_init,
+	.ro_reset			= fmr_op_reset,
 	.ro_displayname			= "fmr",
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 975372c..b4ce0e5 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -81,6 +81,18 @@ out_err:
 	return nsegs;
 }
 
+static void
+__frwr_release(struct rpcrdma_mw *r)
+{
+	int rc;
+
+	rc = ib_dereg_mr(r->r.frmr.fr_mr);
+	if (rc)
+		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
+			__func__, rc);
+	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+}
+
 /* FRWR mode conveys a list of pages per chunk segment. The
  * maximum length of that list is the FRWR page list depth.
  */
@@ -226,10 +238,44 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		i += __frwr_unmap(r_xprt, &req->rl_segments[i]);
 }
 
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each.  FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+frwr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+	struct rpcrdma_mw *r;
+	int rc;
+
+	list_for_each_entry(r, &buf->rb_all, mw_all) {
+		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+			continue;
+
+		__frwr_release(r);
+		rc = __frwr_init(r, pd, device, depth);
+		if (rc)
+			continue;
+
+		r->r.frmr.fr_state = FRMR_IS_INVALID;
+	}
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap			= frwr_op_unmap,
 	.ro_maxpages			= frwr_op_maxpages,
 	.ro_init			= frwr_op_init,
+	.ro_reset			= frwr_op_reset,
 	.ro_displayname			= "frwr",
 };
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index ae2b0bc..0afc691 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -62,10 +62,16 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		rpcrdma_unmap_one(&r_xprt->rx_ia, &req->rl_segments[i]);
 }
 
+static void
+physical_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
 	.ro_unmap			= physical_op_unmap,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_init			= physical_op_init,
+	.ro_reset			= physical_op_reset,
 	.ro_displayname			= "physical",
 };
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index d7810d6..e17d91a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -63,9 +63,6 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
-static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
-
 /*
  * internal functions
  */
@@ -944,21 +941,9 @@ retry:
 		rpcrdma_ep_disconnect(ep, ia);
 		rpcrdma_flush_cqs(ep);
 
-		switch (ia->ri_memreg_strategy) {
-		case RPCRDMA_FRMR:
-			rpcrdma_reset_frmrs(ia);
-			break;
-		case RPCRDMA_MTHCAFMR:
-			rpcrdma_reset_fmrs(ia);
-			break;
-		case RPCRDMA_ALLPHYSICAL:
-			break;
-		default:
-			rc = -EIO;
-			goto out;
-		}
-
 		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+		ia->ri_ops->ro_reset(xprt);
+
 		id = rpcrdma_create_id(xprt, ia,
 				(struct sockaddr *)&xprt->rx_data.addr);
 		if (IS_ERR(id)) {
@@ -1288,90 +1273,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	kfree(buf->rb_pool);
 }
 
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_xprt *r_xprt =
-				container_of(ia, struct rpcrdma_xprt, rx_ia);
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct list_head *pos;
-	struct rpcrdma_mw *r;
-	LIST_HEAD(l);
-	int rc;
-
-	list_for_each(pos, &buf->rb_all) {
-		r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
-		INIT_LIST_HEAD(&l);
-		list_add(&r->r.fmr->list, &l);
-		rc = ib_unmap_fmr(&l);
-		if (rc)
-			dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
-				__func__, rc);
-	}
-}
-
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each.  FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_xprt *r_xprt =
-				container_of(ia, struct rpcrdma_xprt, rx_ia);
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct list_head *pos;
-	struct rpcrdma_mw *r;
-	int rc;
-
-	list_for_each(pos, &buf->rb_all) {
-		r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
-		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
-			continue;
-
-		rc = ib_dereg_mr(r->r.frmr.fr_mr);
-		if (rc)
-			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
-				__func__, rc);
-		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
-		r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-					ia->ri_max_frmr_depth);
-		if (IS_ERR(r->r.frmr.fr_mr)) {
-			rc = PTR_ERR(r->r.frmr.fr_mr);
-			dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
-				" failed %i\n", __func__, rc);
-			continue;
-		}
-		r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
-					ia->ri_id->device,
-					ia->ri_max_frmr_depth);
-		if (IS_ERR(r->r.frmr.fr_pgl)) {
-			rc = PTR_ERR(r->r.frmr.fr_pgl);
-			dprintk("RPC:       %s: "
-				"ib_alloc_fast_reg_page_list "
-				"failed %i\n", __func__, rc);
-
-			ib_dereg_mr(r->r.frmr.fr_mr);
-			continue;
-		}
-		r->r.frmr.fr_state = FRMR_IS_INVALID;
-	}
-}
-
 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
  * some req segments uninitialized.
  */
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 4fe3c38..cdf6763 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -342,6 +342,7 @@ struct rpcrdma_memreg_ops {
 				    struct rpcrdma_req *, unsigned int);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	int		(*ro_init)(struct rpcrdma_xprt *);
+	void		(*ro_reset)(struct rpcrdma_xprt *);
 	const char	*ro_displayname;
 };
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 09/16] xprtrdma: Add "destroy MRs" memreg op
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (7 preceding siblings ...)
  2015-03-13 21:22 ` [PATCH v1 08/16] xprtrdma: Add "reset " Chuck Lever
@ 2015-03-13 21:22 ` Chuck Lever
  2015-03-13 21:22 ` [PATCH v1 10/16] xprtrdma: Add "open" " Chuck Lever
                   ` (7 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:22 UTC (permalink / raw)
  To: linux-nfs

Memory Region objects associated with a transport instance are
destroyed before the instance itself is shut down and destroyed.
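
Both new destroy methods follow the same teardown pattern; a simplified
sketch of it (drawn from the hunks below) is:

	while (!list_empty(&buf->rb_all)) {
		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&r->mw_all);
		list_del(&r->mw_list);
		/* mode-specific release: ib_dealloc_fmr(), or
		 * ib_dereg_mr() plus ib_free_fast_reg_page_list() */
		kfree(r);
	}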

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   21 +++++++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   17 ++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |    6 ++++
 net/sunrpc/xprtrdma/verbs.c        |   52 +-----------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    1 +
 5 files changed, 46 insertions(+), 51 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1ccb3de..3115e4b 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -178,11 +178,32 @@ fmr_op_reset(struct rpcrdma_xprt *r_xprt)
 			__func__, rc);
 }
 
+static void
+fmr_op_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int rc;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		list_del(&r->mw_list);
+
+		rc = ib_dealloc_fmr(r->r.fmr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+				__func__, rc);
+
+		kfree(r);
+	}
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap			= fmr_op_unmap,
 	.ro_maxpages			= fmr_op_maxpages,
 	.ro_init			= fmr_op_init,
 	.ro_reset			= fmr_op_reset,
+	.ro_destroy			= fmr_op_destroy,
 	.ro_displayname			= "fmr",
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index b4ce0e5..fc3a228 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -271,11 +271,28 @@ frwr_op_reset(struct rpcrdma_xprt *r_xprt)
 	}
 }
 
+static void
+frwr_op_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		list_del(&r->mw_list);
+
+		__frwr_release(r);
+
+		kfree(r);
+	}
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap			= frwr_op_unmap,
 	.ro_maxpages			= frwr_op_maxpages,
 	.ro_init			= frwr_op_init,
 	.ro_reset			= frwr_op_reset,
+	.ro_destroy			= frwr_op_destroy,
 	.ro_displayname			= "frwr",
 };
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 0afc691..f8da8c4 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -67,11 +67,17 @@ physical_op_reset(struct rpcrdma_xprt *r_xprt)
 {
 }
 
+static void
+physical_op_destroy(struct rpcrdma_buffer *buf)
+{
+}
+
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
 	.ro_unmap			= physical_op_unmap,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_init			= physical_op_init,
 	.ro_reset			= physical_op_reset,
+	.ro_destroy			= physical_op_destroy,
 	.ro_displayname			= "physical",
 };
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e17d91a..dcbc736 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1198,47 +1198,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 	kfree(req);
 }
 
-static void
-rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-	int rc;
-
-	while (!list_empty(&buf->rb_all)) {
-		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&r->mw_all);
-		list_del(&r->mw_list);
-
-		rc = ib_dealloc_fmr(r->r.fmr);
-		if (rc)
-			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
-				__func__, rc);
-
-		kfree(r);
-	}
-}
-
-static void
-rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-	int rc;
-
-	while (!list_empty(&buf->rb_all)) {
-		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&r->mw_all);
-		list_del(&r->mw_list);
-
-		rc = ib_dereg_mr(r->r.frmr.fr_mr);
-		if (rc)
-			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
-				__func__, rc);
-		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
-		kfree(r);
-	}
-}
-
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
@@ -1259,16 +1218,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 			rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
 	}
 
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		rpcrdma_destroy_frmrs(buf);
-		break;
-	case RPCRDMA_MTHCAFMR:
-		rpcrdma_destroy_fmrs(buf);
-		break;
-	default:
-		break;
-	}
+	ia->ri_ops->ro_destroy(buf);
 
 	kfree(buf->rb_pool);
 }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cdf6763..a0e3c3e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -343,6 +343,7 @@ struct rpcrdma_memreg_ops {
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	int		(*ro_init)(struct rpcrdma_xprt *);
 	void		(*ro_reset)(struct rpcrdma_xprt *);
+	void		(*ro_destroy)(struct rpcrdma_buffer *);
 	const char	*ro_displayname;
 };
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 10/16] xprtrdma: Add "open" memreg op
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (8 preceding siblings ...)
  2015-03-13 21:22 ` [PATCH v1 09/16] xprtrdma: Add "destroy " Chuck Lever
@ 2015-03-13 21:22 ` Chuck Lever
  2015-03-17 15:16   ` Anna Schumaker
  2015-03-13 21:23 ` [PATCH v1 11/16] xprtrdma: Handle non-SEND completions via a callout Chuck Lever
                   ` (6 subsequent siblings)
  16 siblings, 1 reply; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:22 UTC (permalink / raw)
  To: linux-nfs

The open op determines the size of various transport data structures
based on device capabilities and memory registration mode.
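
As a worked example of the FRMR sizing below (values assumed purely for
illustration): if RPCRDMA_MAX_DATA_SEGS is 64 and the device reports a
max_fast_reg_page_list_len of 16, frwr_op_open() computes

	depth = 7;			/* base WRs per RPC */
	delta = 64 - 16;		/* 48 */
	depth += 2; delta -= 16;	/* depth  9, delta 32 */
	depth += 2; delta -= 16;	/* depth 11, delta 16 */
	depth += 2; delta -= 16;	/* depth 13, delta  0: stop */

so max_send_wr becomes cdata->max_requests * 13, and max_requests is
scaled back if that product exceeds the device's max_qp_wr.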

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   22 +++++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c     |   60 ++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/physical_ops.c |   22 +++++++++++++
 net/sunrpc/xprtrdma/verbs.c        |   54 ++------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    3 ++
 5 files changed, 110 insertions(+), 51 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 3115e4b..96e6cd3 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -46,6 +46,27 @@ out_err:
 	return nsegs;
 }
 
+static int
+fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+	    struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr *devattr = &ia->ri_devattr;
+	unsigned int wrs, max_wrs;
+
+	max_wrs = devattr->max_qp_wr;
+	if (cdata->max_requests > max_wrs)
+		cdata->max_requests = max_wrs;
+
+	wrs = cdata->max_requests;
+	ep->rep_attr.cap.max_send_wr = wrs;
+	ep->rep_attr.cap.max_recv_wr = wrs;
+
+	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
+		__func__, ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr);
+	return 0;
+}
+
 /* FMR mode conveys up to 64 pages of payload per chunk segment.
  */
 static size_t
@@ -201,6 +222,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap			= fmr_op_unmap,
+	.ro_open			= fmr_op_open,
 	.ro_maxpages			= fmr_op_maxpages,
 	.ro_init			= fmr_op_init,
 	.ro_reset			= fmr_op_reset,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index fc3a228..9bb4b2d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -93,6 +93,65 @@ __frwr_release(struct rpcrdma_mw *r)
 	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
 }
 
+static int
+frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+	     struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr *devattr = &ia->ri_devattr;
+	unsigned int wrs, max_wrs;
+	int depth = 7;
+
+	max_wrs = devattr->max_qp_wr;
+	if (cdata->max_requests > max_wrs)
+		cdata->max_requests = max_wrs;
+
+	wrs = cdata->max_requests;
+	ep->rep_attr.cap.max_send_wr = wrs;
+	ep->rep_attr.cap.max_recv_wr = wrs;
+
+	ia->ri_max_frmr_depth =
+			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+			      devattr->max_fast_reg_page_list_len);
+	dprintk("RPC:       %s: device's max FR page list len = %u\n",
+		__func__, ia->ri_max_frmr_depth);
+
+	/* Add room for frmr register and invalidate WRs.
+	 * 1. FRMR reg WR for head
+	 * 2. FRMR invalidate WR for head
+	 * 3. N FRMR reg WRs for pagelist
+	 * 4. N FRMR invalidate WRs for pagelist
+	 * 5. FRMR reg WR for tail
+	 * 6. FRMR invalidate WR for tail
+	 * 7. The RDMA_SEND WR
+	 */
+
+	/* Calculate N if the device max FRMR depth is smaller than
+	 * RPCRDMA_MAX_DATA_SEGS.
+	 */
+	if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+		int delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
+
+		do {
+			depth += 2; /* FRMR reg + invalidate */
+			delta -= ia->ri_max_frmr_depth;
+		} while (delta > 0);
+	}
+
+	ep->rep_attr.cap.max_send_wr *= depth;
+	if (ep->rep_attr.cap.max_send_wr > max_wrs) {
+		cdata->max_requests = max_wrs / depth;
+		if (!cdata->max_requests)
+			return -EINVAL;
+		ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+					       depth;
+	}
+
+	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
+		__func__, ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr);
+	return 0;
+}
+
 /* FRWR mode conveys a list of pages per chunk segment. The
  * maximum length of that list is the FRWR page list depth.
  */
@@ -290,6 +349,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap			= frwr_op_unmap,
+	.ro_open			= frwr_op_open,
 	.ro_maxpages			= frwr_op_maxpages,
 	.ro_init			= frwr_op_init,
 	.ro_reset			= frwr_op_reset,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index f8da8c4..0998f4f 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -19,6 +19,27 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+static int
+physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+		 struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr *devattr = &ia->ri_devattr;
+	unsigned int wrs, max_wrs;
+
+	max_wrs = devattr->max_qp_wr;
+	if (cdata->max_requests > max_wrs)
+		cdata->max_requests = max_wrs;
+
+	wrs = cdata->max_requests;
+	ep->rep_attr.cap.max_send_wr = wrs;
+	ep->rep_attr.cap.max_recv_wr = wrs;
+
+	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
+		__func__, ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr);
+	return 0;
+}
+
 /* PHYSICAL memory registration conveys one page per chunk segment.
  */
 static size_t
@@ -75,6 +96,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
 	.ro_unmap			= physical_op_unmap,
+	.ro_open			= physical_op_open,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_init			= physical_op_init,
 	.ro_reset			= physical_op_reset,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index dcbc736..17b2a29 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -621,11 +621,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 			dprintk("RPC:       %s: FRMR registration "
 				"not supported by HCA\n", __func__);
 			memreg = RPCRDMA_MTHCAFMR;
-		} else {
-			/* Mind the ia limit on FRMR page list depth */
-			ia->ri_max_frmr_depth = min_t(unsigned int,
-				RPCRDMA_MAX_DATA_SEGS,
-				devattr->max_fast_reg_page_list_len);
 		}
 	}
 	if (memreg == RPCRDMA_MTHCAFMR) {
@@ -734,56 +729,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	struct ib_cq *sendcq, *recvcq;
 	int rc, err;
 
-	/* check provider's send/recv wr limits */
-	if (cdata->max_requests > devattr->max_qp_wr)
-		cdata->max_requests = devattr->max_qp_wr;
+	rc = ia->ri_ops->ro_open(ia, ep, cdata);
+	if (rc)
+		return rc;
 
 	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
 	ep->rep_attr.qp_context = ep;
-	/* send_cq and recv_cq initialized below */
 	ep->rep_attr.srq = NULL;
-	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR: {
-		int depth = 7;
-
-		/* Add room for frmr register and invalidate WRs.
-		 * 1. FRMR reg WR for head
-		 * 2. FRMR invalidate WR for head
-		 * 3. N FRMR reg WRs for pagelist
-		 * 4. N FRMR invalidate WRs for pagelist
-		 * 5. FRMR reg WR for tail
-		 * 6. FRMR invalidate WR for tail
-		 * 7. The RDMA_SEND WR
-		 */
-
-		/* Calculate N if the device max FRMR depth is smaller than
-		 * RPCRDMA_MAX_DATA_SEGS.
-		 */
-		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
-			int delta = RPCRDMA_MAX_DATA_SEGS -
-				    ia->ri_max_frmr_depth;
-
-			do {
-				depth += 2; /* FRMR reg + invalidate */
-				delta -= ia->ri_max_frmr_depth;
-			} while (delta > 0);
-
-		}
-		ep->rep_attr.cap.max_send_wr *= depth;
-		if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
-			cdata->max_requests = devattr->max_qp_wr / depth;
-			if (!cdata->max_requests)
-				return -EINVAL;
-			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
-						       depth;
-		}
-		break;
-	}
-	default:
-		break;
-	}
-	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a0e3c3e..a53a564 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -340,6 +340,9 @@ struct rpcrdma_memreg_ops {
 				  struct rpcrdma_mr_seg *, int, bool);
 	void		(*ro_unmap)(struct rpcrdma_xprt *,
 				    struct rpcrdma_req *, unsigned int);
+	int		(*ro_open)(struct rpcrdma_ia *,
+				   struct rpcrdma_ep *,
+				   struct rpcrdma_create_data_internal *);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	int		(*ro_init)(struct rpcrdma_xprt *);
 	void		(*ro_reset)(struct rpcrdma_xprt *);


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 11/16] xprtrdma: Handle non-SEND completions via a callout
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (9 preceding siblings ...)
  2015-03-13 21:22 ` [PATCH v1 10/16] xprtrdma: Add "open" " Chuck Lever
@ 2015-03-13 21:23 ` Chuck Lever
  2015-03-13 21:23 ` [PATCH v1 12/16] xprtrdma: Acquire FMRs in rpcrdma_fmr_register_external() Chuck Lever
                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:23 UTC (permalink / raw)
  To: linux-nfs

Allow each memory registration mode to plug in a callout that handles
the completion of a memory registration operation.
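
For illustration, the send completion handler after this patch is
roughly (a simplified sketch of the verbs.c hunk below):

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
		/* plain RDMA SEND: log anything but success or flush */
	} else {
		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		r->mw_sendcompletion(wc);	/* e.g. frwr_sendcompletion() */
	}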

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   17 +++++++++++++++++
 net/sunrpc/xprtrdma/verbs.c     |   14 +++++---------
 net/sunrpc/xprtrdma/xprt_rdma.h |    5 +++++
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 9bb4b2d..6e6d8ba 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -164,6 +164,22 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
 }
 
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+	struct rpcrdma_mw *r;
+
+	if (likely(wc->status == IB_WC_SUCCESS))
+		return;
+
+	/* WARNING: Only wr_id and status are reliable at this point */
+	r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+	dprintk("RPC:       %s: frmr %p (stale), status %d\n",
+		__func__, r, wc->status);
+	r->r.frmr.fr_state = FRMR_IS_STALE;
+}
+
 static int
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
@@ -195,6 +211,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
 
 		list_add(&r->mw_list, &buf->rb_mws);
 		list_add(&r->mw_all, &buf->rb_all);
+		r->mw_sendcompletion = frwr_sendcompletion;
 	}
 
 	return 0;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 17b2a29..f108a57 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -204,21 +204,17 @@ static const char * const wc_status[] = {
 static void
 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
 {
-	if (likely(wc->status == IB_WC_SUCCESS))
-		return;
-
 	/* WARNING: Only wr_id and status are reliable at this point */
-	if (wc->wr_id == 0ULL) {
-		if (wc->status != IB_WC_WR_FLUSH_ERR)
+	if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
+		if (wc->status != IB_WC_SUCCESS &&
+		    wc->status != IB_WC_WR_FLUSH_ERR)
 			pr_err("RPC:       %s: SEND: %s\n",
 			       __func__, COMPLETION_MSG(wc->status));
 	} else {
 		struct rpcrdma_mw *r;
 
 		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
-		r->r.frmr.fr_state = FRMR_IS_STALE;
-		pr_err("RPC:       %s: frmr %p (stale): %s\n",
-		       __func__, r, COMPLETION_MSG(wc->status));
+		r->mw_sendcompletion(wc);
 	}
 }
 
@@ -1616,7 +1612,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 	}
 
 	send_wr.next = NULL;
-	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
 	send_wr.sg_list = req->rl_send_iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a53a564..40a0ee8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -106,6 +106,10 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
+/* Force completion handler to ignore the signal
+ */
+#define RPCRDMA_IGNORE_COMPLETION	(0ULL)
+
 /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
  *
  * The below structure appears at the front of a large region of kmalloc'd
@@ -206,6 +210,7 @@ struct rpcrdma_mw {
 		struct ib_fmr		*fmr;
 		struct rpcrdma_frmr	frmr;
 	} r;
+	void			(*mw_sendcompletion)(struct ib_wc *);
 	struct list_head	mw_list;
 	struct list_head	mw_all;
 };


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 12/16] xprtrdma: Acquire FMRs in rpcrdma_fmr_register_external()
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (10 preceding siblings ...)
  2015-03-13 21:23 ` [PATCH v1 11/16] xprtrdma: Handle non-SEND completions via a callout Chuck Lever
@ 2015-03-13 21:23 ` Chuck Lever
  2015-03-13 21:23 ` [PATCH v1 13/16] xprtrdma: Acquire MRs in rpcrdma_register_external() Chuck Lever
                   ` (4 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:23 UTC (permalink / raw)
  To: linux-nfs

Acquiring 64 FMRs in rpcrdma_buffer_get() while holding the buffer
pool lock is expensive, and unnecessary because FMR mode can
transfer up to a 1MB payload using just a single ib_fmr.

Instead, acquire ib_fmrs one at a time as chunks are registered, and
return them to rb_mws immediately during deregistration.
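
The per-chunk MW lifecycle this introduces is, in sketch form
(simplified from the hunks below):

	mw = __fmr_get_mw(r_xprt);	/* one MW off rb_mws, under rb_lock */
	if (!mw)
		return -ENOMEM;
	/* ... ib_map_phys_fmr() registration using mw->r.fmr ... */
	seg1->rl_mw = mw;

	/* and later, in __fmr_unmap(): */
	seg1->rl_mw = NULL;
	rpcrdma_put_mw(r_xprt, mw);	/* back onto rb_mws, under rb_lock */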

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c   |   41 ++++++++++++++++++++++++++++++++++++---
 net/sunrpc/xprtrdma/verbs.c     |   41 ++++++++++++++-------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
 3 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 96e6cd3..9c6c2e8 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -28,10 +28,11 @@ __fmr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_mw *mw = seg1->rl_mw;
 	int rc, nsegs = seg->mr_nsegs;
 	LIST_HEAD(l);
 
-	list_add(&seg1->rl_mw->r.fmr->list, &l);
+	list_add(&mw->r.fmr->list, &l);
 	rc = ib_unmap_fmr(&l);
 	read_lock(&ia->ri_qplock);
 	while (seg1->mr_nsegs--)
@@ -39,11 +40,14 @@ __fmr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	read_unlock(&ia->ri_qplock);
 	if (rc)
 		goto out_err;
+out:
+	seg1->rl_mw = NULL;
+	rpcrdma_put_mw(r_xprt, mw);
 	return nsegs;
 
 out_err:
 	dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
-	return nsegs;
+	goto out;
 }
 
 static int
@@ -117,6 +121,27 @@ out_fmr_err:
 	return rc;
 }
 
+static struct rpcrdma_mw *
+__fmr_get_mw(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_mw *mw = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&buf->rb_lock, flags);
+
+	if (!list_empty(&buf->rb_mws)) {
+		mw = list_entry(buf->rb_mws.next,
+				struct rpcrdma_mw, mw_list);
+		list_del_init(&mw->mw_list);
+	} else {
+		pr_err("RPC:       %s: no MWs available\n", __func__);
+	}
+
+	spin_unlock_irqrestore(&buf->rb_lock, flags);
+	return mw;
+}
+
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -126,10 +151,18 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mr_seg *seg1 = seg;
-	struct rpcrdma_mw *mw = seg1->rl_mw;
+	struct rpcrdma_mw *mw;
 	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
 	int len, pageoff, i, rc;
 
+	mw = __fmr_get_mw(r_xprt);
+	if (!mw)
+		return -ENOMEM;
+	if (seg1->rl_mw) {
+		rpcrdma_put_mw(r_xprt, seg1->rl_mw);
+		seg1->rl_mw = NULL;
+	}
+
 	pageoff = offset_in_page(seg1->mr_offset);
 	seg1->mr_offset -= pageoff;	/* start of page */
 	seg1->mr_len += pageoff;
@@ -152,6 +185,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (rc)
 		goto out_maperr;
 
+	seg1->rl_mw = mw;
 	seg1->mr_rkey = mw->r.fmr->rkey;
 	seg1->mr_base = seg1->mr_dma + pageoff;
 	seg1->mr_nsegs = i;
@@ -164,6 +198,7 @@ out_maperr:
 		pageoff, i, rc);
 	while (i--)
 		rpcrdma_unmap_one(ia, --seg);
+	rpcrdma_put_mw(r_xprt, mw);
 	return rc;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f108a57..f2316d8 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1171,6 +1171,21 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	kfree(buf->rb_pool);
 }
 
+void
+rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	unsigned long flags;
+
+	if (!list_empty(&mw->mw_list))
+		pr_warn("RPC:       %s: mw %p still on a list!\n",
+			__func__, mw);
+
+	spin_lock_irqsave(&buf->rb_lock, flags);
+	list_add_tail(&mw->mw_list, &buf->rb_mws);
+	spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
  * some req segments uninitialized.
  */
@@ -1292,28 +1307,6 @@ rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
 	return NULL;
 }
 
-static struct rpcrdma_req *
-rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-	int i;
-
-	i = RPCRDMA_MAX_SEGS - 1;
-	while (!list_empty(&buf->rb_mws)) {
-		r = list_entry(buf->rb_mws.next,
-			       struct rpcrdma_mw, mw_list);
-		list_del(&r->mw_list);
-		req->rl_segments[i].rl_mw = r;
-		if (unlikely(i-- == 0))
-			return req;	/* Success */
-	}
-
-	/* Not enough entries on rb_mws for this req */
-	rpcrdma_buffer_put_sendbuf(req, buf);
-	rpcrdma_buffer_put_mrs(req, buf);
-	return NULL;
-}
-
 /*
  * Get a set of request/reply buffers.
  *
@@ -1355,9 +1348,6 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 	case RPCRDMA_FRMR:
 		req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
 		break;
-	case RPCRDMA_MTHCAFMR:
-		req = rpcrdma_buffer_get_fmrs(req, buffers);
-		break;
 	default:
 		break;
 	}
@@ -1382,7 +1372,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 	rpcrdma_buffer_put_sendbuf(req, buffers);
 	switch (ia->ri_memreg_strategy) {
 	case RPCRDMA_FRMR:
-	case RPCRDMA_MTHCAFMR:
 		rpcrdma_buffer_put_mrs(req, buffers);
 		break;
 	default:
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 40a0ee8..d5aa5b4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -426,6 +426,7 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
 void rpcrdma_map_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *, bool);
 void rpcrdma_unmap_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *);
+void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
 
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 13/16] xprtrdma: Acquire MRs in rpcrdma_register_external()
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (11 preceding siblings ...)
  2015-03-13 21:23 ` [PATCH v1 12/16] xprtrdma: Acquire FMRs in rpcrdma_fmr_register_external() Chuck Lever
@ 2015-03-13 21:23 ` Chuck Lever
  2015-03-13 21:23 ` [PATCH v1 14/16] xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy Chuck Lever
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:23 UTC (permalink / raw)
  To: linux-nfs

Acquiring 64 MRs in rpcrdma_buffer_get() while holding the buffer
pool lock is expensive, and unnecessary because most modern adapters
can transfer 100s of KBs of payload using just a single MR.

Instead, acquire MRs one at a time as chunks are registered, and
return them to rb_mws immediately during deregistration.

Note: commit 539431a437d2 ("xprtrdma: Don't invalidate FRMRs if
registration fails") is reverted: There is now a valid case where
registration can fail (with -ENOMEM) but the QP is still in RTS.
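
For illustration, acquiring an FRWR MW now looks roughly like this
(a simplified sketch of __frwr_get_mw() below; stale MWs are
re-invalidated outside the lock and then returned to rb_mws):

	spin_lock_irqsave(&buf->rb_lock, flags);
	while (!list_empty(&buf->rb_mws)) {
		mw = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list);
		list_del_init(&mw->mw_list);
		if (mw->r.frmr.fr_state == FRMR_IS_INVALID)
			goto out_unlock;		/* usable MW found */
		list_add_tail(&mw->mw_list, &stale);	/* needs LOCAL_INV retry */
	}
	mw = NULL;
out_unlock:
	spin_unlock_irqrestore(&buf->rb_lock, flags);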

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c |  126 ++++++++++++++++++++++++++++++++++++---
 net/sunrpc/xprtrdma/rpc_rdma.c |    3 -
 net/sunrpc/xprtrdma/verbs.c    |  130 ----------------------------------------
 3 files changed, 120 insertions(+), 139 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 6e6d8ba..d23e064 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -54,15 +54,16 @@ __frwr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mw *mw = seg1->rl_mw;
 	struct ib_send_wr invalidate_wr, *bad_wr;
 	int rc, nsegs = seg->mr_nsegs;
 
-	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+	mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
 	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+	invalidate_wr.wr_id = (unsigned long)(void *)mw;
 	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+	invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
 	read_lock(&ia->ri_qplock);
@@ -72,13 +73,17 @@ __frwr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	read_unlock(&ia->ri_qplock);
 	if (rc)
 		goto out_err;
+
+out:
+	seg1->rl_mw = NULL;
+	rpcrdma_put_mw(r_xprt, mw);
 	return nsegs;
 
 out_err:
 	/* Force rpcrdma_buffer_get() to retry */
-	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+	mw->r.frmr.fr_state = FRMR_IS_STALE;
 	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-	return nsegs;
+	goto out;
 }
 
 static void
@@ -217,6 +222,99 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
 	return 0;
 }
 
+/* rpcrdma_unmap_one() was already done by rpcrdma_frwr_releasesge().
+ * Redo only the ib_post_send().
+ */
+static void
+__retry_local_inv(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	pr_warn("RPC:       %s: FRMR %p is stale\n", __func__, r);
+
+	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+	r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+	invalidate_wr.wr_id = (unsigned long)(void *)r;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	pr_warn("RPC:       %s: frmr %p invalidating rkey %08x\n",
+		__func__, r, r->r.frmr.fr_mr->rkey);
+
+	read_lock(&ia->ri_qplock);
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	read_unlock(&ia->ri_qplock);
+	if (rc) {
+		/* Force __frwr_get_mw() to retry */
+		r->r.frmr.fr_state = FRMR_IS_STALE;
+		dprintk("RPC:       %s: ib_post_send status %i\n",
+			__func__, rc);
+	}
+}
+
+static void
+__retry_flushed_linv(struct rpcrdma_buffer *buf, struct list_head *stale)
+{
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	unsigned long flags;
+	unsigned count;
+
+	count = 0;
+	list_for_each(pos, stale) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_list);
+		__retry_local_inv(ia, r);
+		++count;
+	}
+
+	pr_warn("RPC:       %s: adding %u MRs to rb_mws\n",
+		__func__, count);
+
+	spin_lock_irqsave(&buf->rb_lock, flags);
+	list_splice_tail(stale, &buf->rb_mws);
+	spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_mw *
+__frwr_get_mw(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_mw *mw;
+	unsigned long flags;
+	LIST_HEAD(stale);
+	int count;
+
+	spin_lock_irqsave(&buf->rb_lock, flags);
+	count = 0;
+	while (!list_empty(&buf->rb_mws)) {
+		mw = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list);
+		list_del_init(&mw->mw_list);
+		if (mw->r.frmr.fr_state == FRMR_IS_INVALID)
+			goto out_unlock;
+		list_add_tail(&mw->mw_list, &stale);
+		count++;
+	}
+
+	pr_err("RPC:       %s: no INVALID MWs available\n", __func__);
+	mw = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&buf->rb_lock, flags);
+	if (!list_empty(&stale)) {
+		dprintk("RPC:       %s: found %d unsuitable MWs\n",
+			__func__, count);
+		__retry_flushed_linv(buf, &stale);
+	}
+	return mw;
+}
+
 /* Post a FAST_REG Work Request to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -226,9 +324,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mr_seg *seg1 = seg;
-	struct rpcrdma_mw *mw = seg1->rl_mw;
-	struct rpcrdma_frmr *frmr = &mw->r.frmr;
-	struct ib_mr *mr = frmr->fr_mr;
+	struct rpcrdma_frmr *frmr;
+	struct rpcrdma_mw *mw;
+	struct ib_mr *mr;
 	struct ib_send_wr fastreg_wr, *bad_wr;
 	u8 key;
 	int len, pageoff;
@@ -237,12 +335,21 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	u64 pa;
 	int page_no;
 
+	mw = __frwr_get_mw(r_xprt);
+	if (!mw)
+		return -ENOMEM;
+	if (seg1->rl_mw) {
+		rpcrdma_put_mw(r_xprt, seg1->rl_mw);
+		seg1->rl_mw = NULL;
+	}
+
 	pageoff = offset_in_page(seg1->mr_offset);
 	seg1->mr_offset -= pageoff;	/* start of page */
 	seg1->mr_len += pageoff;
 	len = -pageoff;
 	if (nsegs > ia->ri_max_frmr_depth)
 		nsegs = ia->ri_max_frmr_depth;
+	frmr = &mw->r.frmr;
 	for (page_no = i = 0; i < nsegs;) {
 		rpcrdma_map_one(ia, seg, writing);
 		pa = seg->mr_dma;
@@ -278,6 +385,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	fastreg_wr.wr.fast_reg.access_flags = writing ?
 				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
 				IB_ACCESS_REMOTE_READ;
+	mr = frmr->fr_mr;
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
 	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
@@ -287,6 +395,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (rc)
 		goto out_senderr;
 
+	seg1->rl_mw = mw;
 	seg1->mr_rkey = mr->rkey;
 	seg1->mr_base = seg1->mr_dma + pageoff;
 	seg1->mr_nsegs = i;
@@ -301,6 +410,7 @@ out_err:
 	frmr->fr_state = FRMR_IS_INVALID;
 	while (i--)
 		rpcrdma_unmap_one(ia, --seg);
+	rpcrdma_put_mw(r_xprt, mw);
 	return rc;
 }
 
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 7b51d9d..b9689ca 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -284,8 +284,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR)
-		r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, nchunks);
+	r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, nchunks);
 	return n;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f2316d8..5b9c104 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1186,33 +1186,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 	spin_unlock_irqrestore(&buf->rb_lock, flags);
 }
 
-/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
- * some req segments uninitialized.
- */
-static void
-rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
-{
-	if (*mw) {
-		list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
-		*mw = NULL;
-	}
-}
-
-/* Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
-static void
-rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mr_seg *seg = req->rl_segments;
-	struct rpcrdma_mr_seg *seg1 = seg;
-	int i;
-
-	for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
-		rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
-	rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
-}
-
 static void
 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
 {
@@ -1225,88 +1198,6 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
 	}
 }
 
-/* rpcrdma_unmap_one() was already done during deregistration.
- * Redo only the ib_post_send().
- */
-static void
-rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_xprt *r_xprt =
-				container_of(ia, struct rpcrdma_xprt, rx_ia);
-	struct ib_send_wr invalidate_wr, *bad_wr;
-	int rc;
-
-	dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
-
-	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
-	r->r.frmr.fr_state = FRMR_IS_INVALID;
-
-	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-	invalidate_wr.wr_id = (unsigned long)(void *)r;
-	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
-	DECR_CQCOUNT(&r_xprt->rx_ep);
-
-	dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
-		__func__, r, r->r.frmr.fr_mr->rkey);
-
-	read_lock(&ia->ri_qplock);
-	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-	read_unlock(&ia->ri_qplock);
-	if (rc) {
-		/* Force rpcrdma_buffer_get() to retry */
-		r->r.frmr.fr_state = FRMR_IS_STALE;
-		dprintk("RPC:       %s: ib_post_send failed, %i\n",
-			__func__, rc);
-	}
-}
-
-static void
-rpcrdma_retry_flushed_linv(struct list_head *stale,
-			   struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-	struct list_head *pos;
-	struct rpcrdma_mw *r;
-	unsigned long flags;
-
-	list_for_each(pos, stale) {
-		r = list_entry(pos, struct rpcrdma_mw, mw_list);
-		rpcrdma_retry_local_inv(r, ia);
-	}
-
-	spin_lock_irqsave(&buf->rb_lock, flags);
-	list_splice_tail(stale, &buf->rb_mws);
-	spin_unlock_irqrestore(&buf->rb_lock, flags);
-}
-
-static struct rpcrdma_req *
-rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
-			 struct list_head *stale)
-{
-	struct rpcrdma_mw *r;
-	int i;
-
-	i = RPCRDMA_MAX_SEGS - 1;
-	while (!list_empty(&buf->rb_mws)) {
-		r = list_entry(buf->rb_mws.next,
-			       struct rpcrdma_mw, mw_list);
-		list_del(&r->mw_list);
-		if (r->r.frmr.fr_state == FRMR_IS_STALE) {
-			list_add(&r->mw_list, stale);
-			continue;
-		}
-		req->rl_segments[i].rl_mw = r;
-		if (unlikely(i-- == 0))
-			return req;	/* Success */
-	}
-
-	/* Not enough entries on rb_mws for this req */
-	rpcrdma_buffer_put_sendbuf(req, buf);
-	rpcrdma_buffer_put_mrs(req, buf);
-	return NULL;
-}
-
 /*
  * Get a set of request/reply buffers.
  *
@@ -1319,12 +1210,11 @@ rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
-	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-	struct list_head stale;
 	struct rpcrdma_req *req;
 	unsigned long flags;
 
 	spin_lock_irqsave(&buffers->rb_lock, flags);
+
 	if (buffers->rb_send_index == buffers->rb_max_requests) {
 		spin_unlock_irqrestore(&buffers->rb_lock, flags);
 		dprintk("RPC:       %s: out of request buffers\n", __func__);
@@ -1343,17 +1233,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 	}
 	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
 
-	INIT_LIST_HEAD(&stale);
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
-		break;
-	default:
-		break;
-	}
 	spin_unlock_irqrestore(&buffers->rb_lock, flags);
-	if (!list_empty(&stale))
-		rpcrdma_retry_flushed_linv(&stale, buffers);
 	return req;
 }
 
@@ -1365,18 +1245,10 @@ void
 rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
 	struct rpcrdma_buffer *buffers = req->rl_buffer;
-	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
 	unsigned long flags;
 
 	spin_lock_irqsave(&buffers->rb_lock, flags);
 	rpcrdma_buffer_put_sendbuf(req, buffers);
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		rpcrdma_buffer_put_mrs(req, buffers);
-		break;
-	default:
-		break;
-	}
 	spin_unlock_irqrestore(&buffers->rb_lock, flags);
 }
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 14/16] xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (12 preceding siblings ...)
  2015-03-13 21:23 ` [PATCH v1 13/16] xprtrdma: Acquire MRs in rpcrdma_register_external() Chuck Lever
@ 2015-03-13 21:23 ` Chuck Lever
  2015-03-13 21:23 ` [PATCH v1 15/16] xprtrdma: Make rpcrdma_{un}map_one() into inline functions Chuck Lever
                   ` (2 subsequent siblings)
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:23 UTC (permalink / raw)
  To: linux-nfs

Clean up: This field is no longer used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xprtrdma.h |    3 ++-
 net/sunrpc/xprtrdma/verbs.c     |    3 ---
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 64a0a0a..10bc3cde 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -61,7 +61,8 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-/* memory registration strategies */
+/* Memory registration strategies, by number.
+ * This is part of a kernel / user space API. Do not remove. */
 enum rpcrdma_memreg {
 	RPCRDMA_BOUNCEBUFFERS = 0,
 	RPCRDMA_REGISTER,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 5b9c104..21b63fe 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -669,9 +669,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
 		__func__, ia->ri_ops->ro_displayname);
 
-	/* Else will do memory reg/dereg for each chunk */
-	ia->ri_memreg_strategy = memreg;
-
 	rwlock_init(&ia->ri_qplock);
 	return 0;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d5aa5b4..b167e44 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -69,7 +69,6 @@ struct rpcrdma_ia {
 	int			ri_have_dma_lkey;
 	struct completion	ri_done;
 	int			ri_async_rc;
-	enum rpcrdma_memreg	ri_memreg_strategy;
 	unsigned int		ri_max_frmr_depth;
 	struct ib_device_attr	ri_devattr;
 	struct ib_qp_attr	ri_qp_attr;


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 15/16] xprtrdma: Make rpcrdma_{un}map_one() into inline functions
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (13 preceding siblings ...)
  2015-03-13 21:23 ` [PATCH v1 14/16] xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy Chuck Lever
@ 2015-03-13 21:23 ` Chuck Lever
  2015-03-13 21:23 ` [PATCH v1 16/16] xprtrdma: Split rb_lock Chuck Lever
       [not found] ` <20150313211124.22471.14517.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:23 UTC (permalink / raw)
  To: linux-nfs

These functions are called in a loop for each page transferred via
RDMA READ or WRITE. Hoist the loop invariants out of them and convert
them to inline functions to reduce the per-page CPU overhead.
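
In rough outline (an illustrative sketch only; the real hunks are in
the diff below), the per-page loops change from:

	for (i = 0; i < nsegs; i++)
		rpcrdma_map_one(ia, &seg[i], writing);

to:

	struct ib_device *device = ia->ri_id->device;
	enum dma_data_direction direction = rpcrdma_data_dir(writing);

	for (i = 0; i < nsegs; i++)
		rpcrdma_map_one(device, &seg[i], direction);

so the device pointer and DMA direction are computed once per call
rather than once per page, and the helpers themselves become static
inlines in xprt_rdma.h.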

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |   10 ++++++--
 net/sunrpc/xprtrdma/frwr_ops.c     |   10 ++++++--
 net/sunrpc/xprtrdma/physical_ops.c |   11 ++++++---
 net/sunrpc/xprtrdma/verbs.c        |   44 ++++++-----------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |   45 ++++++++++++++++++++++++++++++++++--
 5 files changed, 73 insertions(+), 47 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 9c6c2e8..1404f20 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -29,14 +29,16 @@ __fmr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_mw *mw = seg1->rl_mw;
+	struct ib_device *device;
 	int rc, nsegs = seg->mr_nsegs;
 	LIST_HEAD(l);
 
 	list_add(&mw->r.fmr->list, &l);
 	rc = ib_unmap_fmr(&l);
 	read_lock(&ia->ri_qplock);
+	device = ia->ri_id->device;
 	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
+		rpcrdma_unmap_one(device, seg++);
 	read_unlock(&ia->ri_qplock);
 	if (rc)
 		goto out_err;
@@ -150,6 +152,8 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	   int nsegs, bool writing)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct ib_device *device = ia->ri_id->device;
+	enum dma_data_direction direction = rpcrdma_data_dir(writing);
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_mw *mw;
 	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
@@ -170,7 +174,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (nsegs > RPCRDMA_MAX_FMR_SGES)
 		nsegs = RPCRDMA_MAX_FMR_SGES;
 	for (i = 0; i < nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
+		rpcrdma_map_one(device, seg, direction);
 		physaddrs[i] = seg->mr_dma;
 		len += seg->mr_len;
 		++seg;
@@ -197,7 +201,7 @@ out_maperr:
 		__func__, len, (unsigned long long)seg1->mr_dma,
 		pageoff, i, rc);
 	while (i--)
-		rpcrdma_unmap_one(ia, --seg);
+		rpcrdma_unmap_one(device, --seg);
 	rpcrdma_put_mw(r_xprt, mw);
 	return rc;
 }
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index d23e064..4494668 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -55,6 +55,7 @@ __frwr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw = seg1->rl_mw;
+	struct ib_device *device;
 	struct ib_send_wr invalidate_wr, *bad_wr;
 	int rc, nsegs = seg->mr_nsegs;
 
@@ -67,8 +68,9 @@ __frwr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
 	read_lock(&ia->ri_qplock);
+	device = ia->ri_id->device;
 	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
+		rpcrdma_unmap_one(device, seg++);
 	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
 	read_unlock(&ia->ri_qplock);
 	if (rc)
@@ -323,6 +325,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	    int nsegs, bool writing)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct ib_device *device = ia->ri_id->device;
+	enum dma_data_direction direction = rpcrdma_data_dir(writing);
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_frmr *frmr;
 	struct rpcrdma_mw *mw;
@@ -351,7 +355,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		nsegs = ia->ri_max_frmr_depth;
 	frmr = &mw->r.frmr;
 	for (page_no = i = 0; i < nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
+		rpcrdma_map_one(device, seg, direction);
 		pa = seg->mr_dma;
 		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
 			frmr->fr_pgl->page_list[page_no++] = pa;
@@ -409,7 +413,7 @@ out_senderr:
 out_err:
 	frmr->fr_state = FRMR_IS_INVALID;
 	while (i--)
-		rpcrdma_unmap_one(ia, --seg);
+		rpcrdma_unmap_one(device, --seg);
 	rpcrdma_put_mw(r_xprt, mw);
 	return rc;
 }
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 0998f4f..9ce7dfc 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -64,7 +64,8 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	rpcrdma_map_one(ia, seg, writing);
+	rpcrdma_map_one(ia->ri_id->device, seg,
+			rpcrdma_data_dir(writing));
 	seg->mr_rkey = ia->ri_bind_mem->rkey;
 	seg->mr_base = seg->mr_dma;
 	seg->mr_nsegs = 1;
@@ -77,10 +78,14 @@ static void
 physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		  unsigned int count)
 {
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	unsigned int i;
 
-	for (i = 0; i < count; i++)
-		rpcrdma_unmap_one(&r_xprt->rx_ia, &req->rl_segments[i]);
+	for (i = 0; i < count; i++) {
+		read_lock(&ia->ri_qplock);
+		rpcrdma_unmap_one(ia->ri_id->device, &req->rl_segments[i]);
+		read_unlock(&ia->ri_qplock);
+	}
 }
 
 static void
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 21b63fe..460c917 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1288,6 +1288,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
  */
 
+void
+rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
+{
+	dprintk("RPC:       map_one: offset %p iova %llx len %zu\n",
+		seg->mr_offset,
+		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
+}
+
 static int
 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
 				struct ib_mr **mrp, struct ib_sge *iov)
@@ -1413,42 +1421,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 }
 
 /*
- * Wrappers for chunk registration, shared by read/write chunk code.
- */
-
-void
-rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, bool writing)
-{
-	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-	seg->mr_dmalen = seg->mr_len;
-	if (seg->mr_page)
-		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
-				seg->mr_page, offset_in_page(seg->mr_offset),
-				seg->mr_dmalen, seg->mr_dir);
-	else
-		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
-				seg->mr_offset,
-				seg->mr_dmalen, seg->mr_dir);
-	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
-		dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
-			__func__,
-			(unsigned long long)seg->mr_dma,
-			seg->mr_offset, seg->mr_dmalen);
-	}
-}
-
-void
-rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
-{
-	if (seg->mr_page)
-		ib_dma_unmap_page(ia->ri_id->device,
-				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-	else
-		ib_dma_unmap_single(ia->ri_id->device,
-				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
-/*
  * Prepost any receive buffer, then post send.
  *
  * Receive buffer is donated to hardware, reclaimed upon recv completion.
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b167e44..10c4fa5 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -423,11 +423,52 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 			 struct rpcrdma_regbuf *);
 
 unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
-void rpcrdma_map_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *, bool);
-void rpcrdma_unmap_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *);
 void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
 
 /*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+static inline void
+rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
+		enum dma_data_direction direction)
+{
+	seg->mr_dir = direction;
+	seg->mr_dmalen = seg->mr_len;
+
+	if (seg->mr_page)
+		seg->mr_dma = ib_dma_map_page(device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else
+		seg->mr_dma = ib_dma_map_single(device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+
+	if (ib_dma_mapping_error(device, seg->mr_dma))
+		rpcrdma_mapping_error(seg);
+}
+
+static inline void
+rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
+{
+	if (seg->mr_page)
+		ib_dma_unmap_page(device,
+				  seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_single(device,
+				    seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
+/*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
  */
 void rpcrdma_connect_worker(struct work_struct *);


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH v1 16/16] xprtrdma: Split rb_lock
  2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
                   ` (14 preceding siblings ...)
  2015-03-13 21:23 ` [PATCH v1 15/16] xprtrdma: Make rpcrdma_{un}map_one() into inline functions Chuck Lever
@ 2015-03-13 21:23 ` Chuck Lever
       [not found] ` <20150313211124.22471.14517.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>
  16 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-13 21:23 UTC (permalink / raw)
  To: linux-nfs

/proc/lock_stat showed contention between rpcrdma_buffer_get/put
and the MR allocation functions during I/O-intensive workloads.

Now that MRs are no longer allocated in rpcrdma_buffer_get(),
there's no reason the rb_mws list has to be managed using the
same lock as the send/receive buffers. Split that lock. The
new lock does not need to disable interrupts because buffer
get/put is never called in an interrupt context.

struct rpcrdma_buffer is rearranged to ensure that rb_mwlock and
rb_mws are always in a different cacheline than rb_lock and the
buffer pointers.
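
In the MW get/put paths the change reduces to this pattern (sketch
only; the real hunks follow):

	/* MW pool now has its own lock, taken without disabling
	 * interrupts:
	 */
	spin_lock(&buf->rb_mwlock);
	list_add_tail(&mw->mw_list, &buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);

while rb_lock (still irqsave) continues to cover only the send and
receive buffer arrays and their indexes.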

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c   |    6 +++---
 net/sunrpc/xprtrdma/frwr_ops.c  |   28 ++++++++++++----------------
 net/sunrpc/xprtrdma/verbs.c     |    5 ++---
 net/sunrpc/xprtrdma/xprt_rdma.h |   16 +++++++++-------
 4 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1404f20..00d362d 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -96,6 +96,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_mw *r;
 	int i, rc;
 
+	spin_lock_init(&buf->rb_mwlock);
 	INIT_LIST_HEAD(&buf->rb_mws);
 	INIT_LIST_HEAD(&buf->rb_all);
 
@@ -128,9 +129,8 @@ __fmr_get_mw(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_mw *mw = NULL;
-	unsigned long flags;
 
-	spin_lock_irqsave(&buf->rb_lock, flags);
+	spin_lock(&buf->rb_mwlock);
 
 	if (!list_empty(&buf->rb_mws)) {
 		mw = list_entry(buf->rb_mws.next,
@@ -140,7 +140,7 @@ __fmr_get_mw(struct rpcrdma_xprt *r_xprt)
 		pr_err("RPC:       %s: no MWs available\n", __func__);
 	}
 
-	spin_unlock_irqrestore(&buf->rb_lock, flags);
+	spin_unlock(&buf->rb_mwlock);
 	return mw;
 }
 
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 4494668..7973b94 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -196,6 +196,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
 	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
 	int i;
 
+	spin_lock_init(&buf->rb_mwlock);
 	INIT_LIST_HEAD(&buf->rb_mws);
 	INIT_LIST_HEAD(&buf->rb_all);
 
@@ -228,10 +229,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
  * Redo only the ib_post_send().
  */
 static void
-__retry_local_inv(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+__retry_local_inv(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *r)
 {
-	struct rpcrdma_xprt *r_xprt =
-				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct ib_send_wr invalidate_wr, *bad_wr;
 	int rc;
 
@@ -258,30 +258,27 @@ __retry_local_inv(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
 		dprintk("RPC:       %s: ib_post_send status %i\n",
 			__func__, rc);
 	}
+
+	rpcrdma_put_mw(r_xprt, r);
 }
 
 static void
-__retry_flushed_linv(struct rpcrdma_buffer *buf, struct list_head *stale)
+__retry_flushed_linv(struct rpcrdma_xprt *r_xprt, struct list_head *stale)
 {
-	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
 	struct list_head *pos;
 	struct rpcrdma_mw *r;
-	unsigned long flags;
 	unsigned count;
 
 	count = 0;
 	list_for_each(pos, stale) {
 		r = list_entry(pos, struct rpcrdma_mw, mw_list);
-		__retry_local_inv(ia, r);
+		list_del_init(&r->mw_list);
+		__retry_local_inv(r_xprt, r);
 		++count;
 	}
 
-	pr_warn("RPC:       %s: adding %u MRs to rb_mws\n",
+	pr_warn("RPC:       %s: added %u MRs to rb_mws\n",
 		__func__, count);
-
-	spin_lock_irqsave(&buf->rb_lock, flags);
-	list_splice_tail(stale, &buf->rb_mws);
-	spin_unlock_irqrestore(&buf->rb_lock, flags);
 }
 
 static struct rpcrdma_mw *
@@ -289,11 +286,10 @@ __frwr_get_mw(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_mw *mw;
-	unsigned long flags;
 	LIST_HEAD(stale);
 	int count;
 
-	spin_lock_irqsave(&buf->rb_lock, flags);
+	spin_lock(&buf->rb_mwlock);
 	count = 0;
 	while (!list_empty(&buf->rb_mws)) {
 		mw = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list);
@@ -308,11 +304,11 @@ __frwr_get_mw(struct rpcrdma_xprt *r_xprt)
 	mw = NULL;
 
 out_unlock:
-	spin_unlock_irqrestore(&buf->rb_lock, flags);
+	spin_unlock(&buf->rb_mwlock);
 	if (!list_empty(&stale)) {
 		dprintk("RPC:       %s: found %d unsuitable MWs\n",
 			__func__, count);
-		__retry_flushed_linv(buf, &stale);
+		__retry_flushed_linv(r_xprt, &stale);
 	}
 	return mw;
 }
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 460c917..d4de417 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1172,15 +1172,14 @@ void
 rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	unsigned long flags;
 
 	if (!list_empty(&mw->mw_list))
 		pr_warn("RPC:       %s: mw %p still on a list!\n",
 			__func__, mw);
 
-	spin_lock_irqsave(&buf->rb_lock, flags);
+	spin_lock(&buf->rb_mwlock);
 	list_add_tail(&mw->mw_list, &buf->rb_mws);
-	spin_unlock_irqrestore(&buf->rb_lock, flags);
+	spin_unlock(&buf->rb_mwlock);
 }
 
 static void
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 10c4fa5..998f188 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -280,15 +280,17 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
  * One of these is associated with a transport instance
  */
 struct rpcrdma_buffer {
-	spinlock_t	rb_lock;	/* protects indexes */
-	u32		rb_max_requests;/* client max requests */
-	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
-	struct list_head rb_all;
-	int		rb_send_index;
+	spinlock_t		rb_mwlock;
+	struct list_head	rb_mws;
+	struct list_head	rb_all;
+	char			*rb_pool;
+
+	spinlock_t		rb_lock;
+	u32			rb_max_requests;
+	int			rb_send_index;
+	int			rb_recv_index;
 	struct rpcrdma_req	**rb_send_bufs;
-	int		rb_recv_index;
 	struct rpcrdma_rep	**rb_recv_bufs;
-	char		*rb_pool;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
 


^ permalink raw reply related	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 06/16] xprtrdma: Add a "deregister_external" op for each memreg mode
  2015-03-13 21:22 ` [PATCH v1 06/16] xprtrdma: Add a "deregister_external" " Chuck Lever
@ 2015-03-17 14:37   ` Anna Schumaker
  2015-03-17 15:04     ` Chuck Lever
  0 siblings, 1 reply; 56+ messages in thread
From: Anna Schumaker @ 2015-03-17 14:37 UTC (permalink / raw)
  To: Chuck Lever, linux-nfs

On 03/13/2015 05:22 PM, Chuck Lever wrote:
> There is very little common processing among the different external
> memory deregistration functions.
> 
> In addition, instead of calling the deregistration function for each
> segment, have one call release all segments for a request. This makes
> the API a little asymmetrical, but a hair faster.

The common processing would be the for-each loop that you moved into the ro_unmap functions.  I'm not completely sold on this... how often do unmaps happen?

Anna

> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  net/sunrpc/xprtrdma/fmr_ops.c      |   37 ++++++++++++++++
>  net/sunrpc/xprtrdma/frwr_ops.c     |   46 ++++++++++++++++++++
>  net/sunrpc/xprtrdma/physical_ops.c |   13 ++++++
>  net/sunrpc/xprtrdma/rpc_rdma.c     |    7 +--
>  net/sunrpc/xprtrdma/transport.c    |    8 +---
>  net/sunrpc/xprtrdma/verbs.c        |   81 ------------------------------------
>  net/sunrpc/xprtrdma/xprt_rdma.h    |    5 +-
>  7 files changed, 103 insertions(+), 94 deletions(-)
> 
> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
> index 45fb646..9b983b4 100644
> --- a/net/sunrpc/xprtrdma/fmr_ops.c
> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
> @@ -20,6 +20,32 @@
>  /* Maximum scatter/gather per FMR */
>  #define RPCRDMA_MAX_FMR_SGES	(64)
>  
> +/* Use the ib_unmap_fmr() verb to prevent further remote
> + * access via RDMA READ or RDMA WRITE.
> + */
> +static int
> +__fmr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
> +{
> +	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
> +	struct rpcrdma_mr_seg *seg1 = seg;
> +	int rc, nsegs = seg->mr_nsegs;
> +	LIST_HEAD(l);
> +
> +	list_add(&seg1->rl_mw->r.fmr->list, &l);
> +	rc = ib_unmap_fmr(&l);
> +	read_lock(&ia->ri_qplock);
> +	while (seg1->mr_nsegs--)
> +		rpcrdma_unmap_one(ia, seg++);
> +	read_unlock(&ia->ri_qplock);
> +	if (rc)
> +		goto out_err;
> +	return nsegs;
> +
> +out_err:
> +	dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
> +	return nsegs;
> +}
> +
>  /* FMR mode conveys up to 64 pages of payload per chunk segment.
>   */
>  static size_t
> @@ -79,8 +105,19 @@ out_maperr:
>  	return rc;
>  }
>  
> +static void
> +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
> +	     unsigned int count)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; count--;)
> +		i += __fmr_unmap(r_xprt, &req->rl_segments[i]);
> +}
> +
>  const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
>  	.ro_map				= fmr_op_map,
> +	.ro_unmap			= fmr_op_unmap,
>  	.ro_maxpages			= fmr_op_maxpages,
>  	.ro_displayname			= "fmr",
>  };
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index 2b5ccb0..05b5761 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -17,6 +17,41 @@
>  # define RPCDBG_FACILITY	RPCDBG_TRANS
>  #endif
>  
> +/* Post a LOCAL_INV Work Request to prevent further remote access
> + * via RDMA READ or RDMA WRITE.
> + */
> +static int
> +__frwr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
> +{
> +	struct rpcrdma_mr_seg *seg1 = seg;
> +	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
> +	struct ib_send_wr invalidate_wr, *bad_wr;
> +	int rc, nsegs = seg->mr_nsegs;
> +
> +	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
> +
> +	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
> +	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
> +	invalidate_wr.opcode = IB_WR_LOCAL_INV;
> +	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
> +	DECR_CQCOUNT(&r_xprt->rx_ep);
> +
> +	read_lock(&ia->ri_qplock);
> +	while (seg1->mr_nsegs--)
> +		rpcrdma_unmap_one(ia, seg++);
> +	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
> +	read_unlock(&ia->ri_qplock);
> +	if (rc)
> +		goto out_err;
> +	return nsegs;
> +
> +out_err:
> +	/* Force rpcrdma_buffer_get() to retry */
> +	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
> +	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
> +	return nsegs;
> +}
> +
>  /* FRWR mode conveys a list of pages per chunk segment. The
>   * maximum length of that list is the FRWR page list depth.
>   */
> @@ -116,8 +151,19 @@ out_err:
>  	return rc;
>  }
>  
> +static void
> +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
> +	      unsigned int count)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; count--;)
> +		i += __frwr_unmap(r_xprt, &req->rl_segments[i]);
> +}
> +
>  const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
>  	.ro_map				= frwr_op_map,
> +	.ro_unmap			= frwr_op_unmap,
>  	.ro_maxpages			= frwr_op_maxpages,
>  	.ro_displayname			= "frwr",
>  };
> diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
> index 5a284ee..f2c15be 100644
> --- a/net/sunrpc/xprtrdma/physical_ops.c
> +++ b/net/sunrpc/xprtrdma/physical_ops.c
> @@ -44,8 +44,21 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>  	return 1;
>  }
>  
> +/* Unmap a memory region, but leave it registered.
> + */
> +static void
> +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
> +		  unsigned int count)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < count; i++)
> +		rpcrdma_unmap_one(&r_xprt->rx_ia, &req->rl_segments[i]);
> +}
> +
>  const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
>  	.ro_map				= physical_op_map,
> +	.ro_unmap			= physical_op_unmap,
>  	.ro_maxpages			= physical_op_maxpages,
>  	.ro_displayname			= "physical",
>  };
> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
> index 6ab1d03..7b51d9d 100644
> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
> @@ -284,11 +284,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
>  	return (unsigned char *)iptr - (unsigned char *)headerp;
>  
>  out:
> -	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
> -		for (pos = 0; nchunks--;)
> -			pos += rpcrdma_deregister_external(
> -					&req->rl_segments[pos], r_xprt);
> -	}
> +	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR)
> +		r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, nchunks);
>  	return n;
>  }
>  
> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> index 9a9da40..c484671 100644
> --- a/net/sunrpc/xprtrdma/transport.c
> +++ b/net/sunrpc/xprtrdma/transport.c
> @@ -571,7 +571,6 @@ xprt_rdma_free(void *buffer)
>  	struct rpcrdma_req *req;
>  	struct rpcrdma_xprt *r_xprt;
>  	struct rpcrdma_regbuf *rb;
> -	int i;
>  
>  	if (buffer == NULL)
>  		return;
> @@ -582,11 +581,8 @@ xprt_rdma_free(void *buffer)
>  
>  	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
>  
> -	for (i = 0; req->rl_nchunks;) {
> -		--req->rl_nchunks;
> -		i += rpcrdma_deregister_external(
> -			&req->rl_segments[i], r_xprt);
> -	}
> +	r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, req->rl_nchunks);
> +	req->rl_nchunks = 0;
>  
>  	rpcrdma_buffer_put(req);
>  }
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 851ed97..a1621fd 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -1509,7 +1509,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
>  	}
>  }
>  
> -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
> +/* rpcrdma_unmap_one() was already done during deregistration.
>   * Redo only the ib_post_send().
>   */
>  static void
> @@ -1889,85 +1889,6 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
>  				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
>  }
>  
> -static int
> -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
> -			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
> -{
> -	struct rpcrdma_mr_seg *seg1 = seg;
> -	struct ib_send_wr invalidate_wr, *bad_wr;
> -	int rc;
> -
> -	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
> -
> -	memset(&invalidate_wr, 0, sizeof invalidate_wr);
> -	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
> -	invalidate_wr.opcode = IB_WR_LOCAL_INV;
> -	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
> -	DECR_CQCOUNT(&r_xprt->rx_ep);
> -
> -	read_lock(&ia->ri_qplock);
> -	while (seg1->mr_nsegs--)
> -		rpcrdma_unmap_one(ia, seg++);
> -	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
> -	read_unlock(&ia->ri_qplock);
> -	if (rc) {
> -		/* Force rpcrdma_buffer_get() to retry */
> -		seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
> -		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
> -			" status %i\n", __func__, rc);
> -	}
> -	return rc;
> -}
> -
> -static int
> -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
> -			struct rpcrdma_ia *ia)
> -{
> -	struct rpcrdma_mr_seg *seg1 = seg;
> -	LIST_HEAD(l);
> -	int rc;
> -
> -	list_add(&seg1->rl_mw->r.fmr->list, &l);
> -	rc = ib_unmap_fmr(&l);
> -	read_lock(&ia->ri_qplock);
> -	while (seg1->mr_nsegs--)
> -		rpcrdma_unmap_one(ia, seg++);
> -	read_unlock(&ia->ri_qplock);
> -	if (rc)
> -		dprintk("RPC:       %s: failed ib_unmap_fmr,"
> -			" status %i\n", __func__, rc);
> -	return rc;
> -}
> -
> -int
> -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
> -		struct rpcrdma_xprt *r_xprt)
> -{
> -	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
> -	int nsegs = seg->mr_nsegs, rc;
> -
> -	switch (ia->ri_memreg_strategy) {
> -
> -	case RPCRDMA_ALLPHYSICAL:
> -		read_lock(&ia->ri_qplock);
> -		rpcrdma_unmap_one(ia, seg);
> -		read_unlock(&ia->ri_qplock);
> -		break;
> -
> -	case RPCRDMA_FRMR:
> -		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
> -		break;
> -
> -	case RPCRDMA_MTHCAFMR:
> -		rc = rpcrdma_deregister_fmr_external(seg, ia);
> -		break;
> -
> -	default:
> -		break;
> -	}
> -	return nsegs;
> -}
> -
>  /*
>   * Prepost any receive buffer, then post send.
>   *
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index 7bf077b..3aabbb2 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -338,6 +338,8 @@ struct rpcrdma_xprt;
>  struct rpcrdma_memreg_ops {
>  	int		(*ro_map)(struct rpcrdma_xprt *,
>  				  struct rpcrdma_mr_seg *, int, bool);
> +	void		(*ro_unmap)(struct rpcrdma_xprt *,
> +				    struct rpcrdma_req *, unsigned int);
>  	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
>  	const char	*ro_displayname;
>  };
> @@ -405,9 +407,6 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
>  void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
>  void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
>  
> -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
> -				struct rpcrdma_xprt *);
> -
>  struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
>  					    size_t, gfp_t);
>  void rpcrdma_free_regbuf(struct rpcrdma_ia *,
> 


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 06/16] xprtrdma: Add a "deregister_external" op for each memreg mode
  2015-03-17 14:37   ` Anna Schumaker
@ 2015-03-17 15:04     ` Chuck Lever
  0 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-17 15:04 UTC (permalink / raw)
  To: Anna Schumaker; +Cc: Linux NFS Mailing List


On Mar 17, 2015, at 7:37 AM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:

> On 03/13/2015 05:22 PM, Chuck Lever wrote:
>> There is very little common processing among the different external
>> memory deregistration functions.
>> 
>> In addition, instead of calling the deregistration function for each
>> segment, have one call release all segments for a request. This makes
>> the API a little asymmetrical, but a hair faster.
> 
> The common processing would be the for-each loop that you moved into the ro_unmap functions.  I'm not completely sold on this... how often do unmaps happen?

Once for every RPC.
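
For reference, the call site in question is xprt_rdma_free(), which
after this patch makes exactly one such call per completed RPC:

	r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, req, req->rl_nchunks);
	req->rl_nchunks = 0;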


--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com




^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 10/16] xprtrdma: Add "open" memreg op
  2015-03-13 21:22 ` [PATCH v1 10/16] xprtrdma: Add "open" " Chuck Lever
@ 2015-03-17 15:16   ` Anna Schumaker
  2015-03-17 15:19     ` Chuck Lever
  0 siblings, 1 reply; 56+ messages in thread
From: Anna Schumaker @ 2015-03-17 15:16 UTC (permalink / raw)
  To: Chuck Lever, linux-nfs

Hi Chuck,

On 03/13/2015 05:22 PM, Chuck Lever wrote:
> The open op determines the size of various transport data structures
> based on device capabilities and memory registration mode.
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  net/sunrpc/xprtrdma/fmr_ops.c      |   22 +++++++++++++
>  net/sunrpc/xprtrdma/frwr_ops.c     |   60 ++++++++++++++++++++++++++++++++++++
>  net/sunrpc/xprtrdma/physical_ops.c |   22 +++++++++++++
>  net/sunrpc/xprtrdma/verbs.c        |   54 ++------------------------------
>  net/sunrpc/xprtrdma/xprt_rdma.h    |    3 ++
>  5 files changed, 110 insertions(+), 51 deletions(-)
> 
> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
> index 3115e4b..96e6cd3 100644
> --- a/net/sunrpc/xprtrdma/fmr_ops.c
> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
> @@ -46,6 +46,27 @@ out_err:
>  	return nsegs;
>  }
>  
> +static int
> +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
> +	    struct rpcrdma_create_data_internal *cdata)
> +{
> +	struct ib_device_attr *devattr = &ia->ri_devattr;
> +	unsigned int wrs, max_wrs;
> +
> +	max_wrs = devattr->max_qp_wr;
> +	if (cdata->max_requests > max_wrs)
> +		cdata->max_requests = max_wrs;
> +
> +	wrs = cdata->max_requests;
> +	ep->rep_attr.cap.max_send_wr = wrs;
> +	ep->rep_attr.cap.max_recv_wr = wrs;
> +
> +	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
> +		__func__, ep->rep_attr.cap.max_send_wr,
> +		ep->rep_attr.cap.max_recv_wr);
> +	return 0;
> +}

It looks like all three op_open functions are using this code line-for-line.  Can we keep this in the common code, and maybe make it a noop in the fmr and physical cases?

Anna

> +
>  /* FMR mode conveys up to 64 pages of payload per chunk segment.
>   */
>  static size_t
> @@ -201,6 +222,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
>  const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
>  	.ro_map				= fmr_op_map,
>  	.ro_unmap			= fmr_op_unmap,
> +	.ro_open			= fmr_op_open,
>  	.ro_maxpages			= fmr_op_maxpages,
>  	.ro_init			= fmr_op_init,
>  	.ro_reset			= fmr_op_reset,
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index fc3a228..9bb4b2d 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -93,6 +93,65 @@ __frwr_release(struct rpcrdma_mw *r)
>  	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
>  }
>  
> +static int
> +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
> +	     struct rpcrdma_create_data_internal *cdata)
> +{
> +	struct ib_device_attr *devattr = &ia->ri_devattr;
> +	unsigned int wrs, max_wrs;
> +	int depth = 7;
> +
> +	max_wrs = devattr->max_qp_wr;
> +	if (cdata->max_requests > max_wrs)
> +		cdata->max_requests = max_wrs;
> +
> +	wrs = cdata->max_requests;
> +	ep->rep_attr.cap.max_send_wr = wrs;
> +	ep->rep_attr.cap.max_recv_wr = wrs;
> +
> +	ia->ri_max_frmr_depth =
> +			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
> +			      devattr->max_fast_reg_page_list_len);
> +	dprintk("RPC:       %s: device's max FR page list len = %u\n",
> +		__func__, ia->ri_max_frmr_depth);
> +
> +	/* Add room for frmr register and invalidate WRs.
> +	 * 1. FRMR reg WR for head
> +	 * 2. FRMR invalidate WR for head
> +	 * 3. N FRMR reg WRs for pagelist
> +	 * 4. N FRMR invalidate WRs for pagelist
> +	 * 5. FRMR reg WR for tail
> +	 * 6. FRMR invalidate WR for tail
> +	 * 7. The RDMA_SEND WR
> +	 */
> +
> +	/* Calculate N if the device max FRMR depth is smaller than
> +	 * RPCRDMA_MAX_DATA_SEGS.
> +	 */
> +	if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
> +		int delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
> +
> +		do {
> +			depth += 2; /* FRMR reg + invalidate */
> +			delta -= ia->ri_max_frmr_depth;
> +		} while (delta > 0);
> +	}
> +
> +	ep->rep_attr.cap.max_send_wr *= depth;
> +	if (ep->rep_attr.cap.max_send_wr > max_wrs) {
> +		cdata->max_requests = max_wrs / depth;
> +		if (!cdata->max_requests)
> +			return -EINVAL;
> +		ep->rep_attr.cap.max_send_wr = cdata->max_requests *
> +					       depth;
> +	}
> +
> +	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
> +		__func__, ep->rep_attr.cap.max_send_wr,
> +		ep->rep_attr.cap.max_recv_wr);
> +	return 0;
> +}
> +
>  /* FRWR mode conveys a list of pages per chunk segment. The
>   * maximum length of that list is the FRWR page list depth.
>   */
> @@ -290,6 +349,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
>  const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
>  	.ro_map				= frwr_op_map,
>  	.ro_unmap			= frwr_op_unmap,
> +	.ro_open			= frwr_op_open,
>  	.ro_maxpages			= frwr_op_maxpages,
>  	.ro_init			= frwr_op_init,
>  	.ro_reset			= frwr_op_reset,
> diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
> index f8da8c4..0998f4f 100644
> --- a/net/sunrpc/xprtrdma/physical_ops.c
> +++ b/net/sunrpc/xprtrdma/physical_ops.c
> @@ -19,6 +19,27 @@
>  # define RPCDBG_FACILITY	RPCDBG_TRANS
>  #endif
>  
> +static int
> +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
> +		 struct rpcrdma_create_data_internal *cdata)
> +{
> +	struct ib_device_attr *devattr = &ia->ri_devattr;
> +	unsigned int wrs, max_wrs;
> +
> +	max_wrs = devattr->max_qp_wr;
> +	if (cdata->max_requests > max_wrs)
> +		cdata->max_requests = max_wrs;
> +
> +	wrs = cdata->max_requests;
> +	ep->rep_attr.cap.max_send_wr = wrs;
> +	ep->rep_attr.cap.max_recv_wr = wrs;
> +
> +	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
> +		__func__, ep->rep_attr.cap.max_send_wr,
> +		ep->rep_attr.cap.max_recv_wr);
> +	return 0;
> +}
> +
>  /* PHYSICAL memory registration conveys one page per chunk segment.
>   */
>  static size_t
> @@ -75,6 +96,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
>  const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
>  	.ro_map				= physical_op_map,
>  	.ro_unmap			= physical_op_unmap,
> +	.ro_open			= physical_op_open,
>  	.ro_maxpages			= physical_op_maxpages,
>  	.ro_init			= physical_op_init,
>  	.ro_reset			= physical_op_reset,
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index dcbc736..17b2a29 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -621,11 +621,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>  			dprintk("RPC:       %s: FRMR registration "
>  				"not supported by HCA\n", __func__);
>  			memreg = RPCRDMA_MTHCAFMR;
> -		} else {
> -			/* Mind the ia limit on FRMR page list depth */
> -			ia->ri_max_frmr_depth = min_t(unsigned int,
> -				RPCRDMA_MAX_DATA_SEGS,
> -				devattr->max_fast_reg_page_list_len);
>  		}
>  	}
>  	if (memreg == RPCRDMA_MTHCAFMR) {
> @@ -734,56 +729,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
>  	struct ib_cq *sendcq, *recvcq;
>  	int rc, err;
>  
> -	/* check provider's send/recv wr limits */
> -	if (cdata->max_requests > devattr->max_qp_wr)
> -		cdata->max_requests = devattr->max_qp_wr;
> +	rc = ia->ri_ops->ro_open(ia, ep, cdata);
> +	if (rc)
> +		return rc;
>  
>  	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
>  	ep->rep_attr.qp_context = ep;
> -	/* send_cq and recv_cq initialized below */
>  	ep->rep_attr.srq = NULL;
> -	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
> -	switch (ia->ri_memreg_strategy) {
> -	case RPCRDMA_FRMR: {
> -		int depth = 7;
> -
> -		/* Add room for frmr register and invalidate WRs.
> -		 * 1. FRMR reg WR for head
> -		 * 2. FRMR invalidate WR for head
> -		 * 3. N FRMR reg WRs for pagelist
> -		 * 4. N FRMR invalidate WRs for pagelist
> -		 * 5. FRMR reg WR for tail
> -		 * 6. FRMR invalidate WR for tail
> -		 * 7. The RDMA_SEND WR
> -		 */
> -
> -		/* Calculate N if the device max FRMR depth is smaller than
> -		 * RPCRDMA_MAX_DATA_SEGS.
> -		 */
> -		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
> -			int delta = RPCRDMA_MAX_DATA_SEGS -
> -				    ia->ri_max_frmr_depth;
> -
> -			do {
> -				depth += 2; /* FRMR reg + invalidate */
> -				delta -= ia->ri_max_frmr_depth;
> -			} while (delta > 0);
> -
> -		}
> -		ep->rep_attr.cap.max_send_wr *= depth;
> -		if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
> -			cdata->max_requests = devattr->max_qp_wr / depth;
> -			if (!cdata->max_requests)
> -				return -EINVAL;
> -			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
> -						       depth;
> -		}
> -		break;
> -	}
> -	default:
> -		break;
> -	}
> -	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
>  	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
>  	ep->rep_attr.cap.max_recv_sge = 1;
>  	ep->rep_attr.cap.max_inline_data = 0;
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index a0e3c3e..a53a564 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -340,6 +340,9 @@ struct rpcrdma_memreg_ops {
>  				  struct rpcrdma_mr_seg *, int, bool);
>  	void		(*ro_unmap)(struct rpcrdma_xprt *,
>  				    struct rpcrdma_req *, unsigned int);
> +	int		(*ro_open)(struct rpcrdma_ia *,
> +				   struct rpcrdma_ep *,
> +				   struct rpcrdma_create_data_internal *);
>  	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
>  	int		(*ro_init)(struct rpcrdma_xprt *);
>  	void		(*ro_reset)(struct rpcrdma_xprt *);
> 


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 10/16] xprtrdma: Add "open" memreg op
  2015-03-17 15:16   ` Anna Schumaker
@ 2015-03-17 15:19     ` Chuck Lever
  0 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-03-17 15:19 UTC (permalink / raw)
  To: Anna Schumaker; +Cc: Linux NFS Mailing List


On Mar 17, 2015, at 8:16 AM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:

> Hi Chuck,
> 
> On 03/13/2015 05:22 PM, Chuck Lever wrote:
>> The open op determines the size of various transport data structures
>> based on device capabilities and memory registration mode.
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> net/sunrpc/xprtrdma/fmr_ops.c      |   22 +++++++++++++
>> net/sunrpc/xprtrdma/frwr_ops.c     |   60 ++++++++++++++++++++++++++++++++++++
>> net/sunrpc/xprtrdma/physical_ops.c |   22 +++++++++++++
>> net/sunrpc/xprtrdma/verbs.c        |   54 ++------------------------------
>> net/sunrpc/xprtrdma/xprt_rdma.h    |    3 ++
>> 5 files changed, 110 insertions(+), 51 deletions(-)
>> 
>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>> index 3115e4b..96e6cd3 100644
>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>> @@ -46,6 +46,27 @@ out_err:
>> 	return nsegs;
>> }
>> 
>> +static int
>> +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
>> +	    struct rpcrdma_create_data_internal *cdata)
>> +{
>> +	struct ib_device_attr *devattr = &ia->ri_devattr;
>> +	unsigned int wrs, max_wrs;
>> +
>> +	max_wrs = devattr->max_qp_wr;
>> +	if (cdata->max_requests > max_wrs)
>> +		cdata->max_requests = max_wrs;
>> +
>> +	wrs = cdata->max_requests;
>> +	ep->rep_attr.cap.max_send_wr = wrs;
>> +	ep->rep_attr.cap.max_recv_wr = wrs;
>> +
>> +	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
>> +		__func__, ep->rep_attr.cap.max_send_wr,
>> +		ep->rep_attr.cap.max_recv_wr);
>> +	return 0;
>> +}
> 
> It looks like all three op_open functions are using this code line-for-line.  Can we keep this in the common code, and maybe make it a noop in the fmr and physical cases?

The reason for this is that the FRWR open function can adjust the
results of these calculations.
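
The FRWR-only piece is the tail of frwr_op_open() in this patch,
which rescales the send WR limit by the per-request FRMR depth,
something neither of the other modes does:

	ep->rep_attr.cap.max_send_wr *= depth;
	if (ep->rep_attr.cap.max_send_wr > max_wrs) {
		cdata->max_requests = max_wrs / depth;
		if (!cdata->max_requests)
			return -EINVAL;
		ep->rep_attr.cap.max_send_wr = cdata->max_requests *
					       depth;
	}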

> Anna
> 
>> +
>> /* FMR mode conveys up to 64 pages of payload per chunk segment.
>>  */
>> static size_t
>> @@ -201,6 +222,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
>> const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
>> 	.ro_map				= fmr_op_map,
>> 	.ro_unmap			= fmr_op_unmap,
>> +	.ro_open			= fmr_op_open,
>> 	.ro_maxpages			= fmr_op_maxpages,
>> 	.ro_init			= fmr_op_init,
>> 	.ro_reset			= fmr_op_reset,
>> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
>> index fc3a228..9bb4b2d 100644
>> --- a/net/sunrpc/xprtrdma/frwr_ops.c
>> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
>> @@ -93,6 +93,65 @@ __frwr_release(struct rpcrdma_mw *r)
>> 	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
>> }
>> 
>> +static int
>> +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
>> +	     struct rpcrdma_create_data_internal *cdata)
>> +{
>> +	struct ib_device_attr *devattr = &ia->ri_devattr;
>> +	unsigned int wrs, max_wrs;
>> +	int depth = 7;
>> +
>> +	max_wrs = devattr->max_qp_wr;
>> +	if (cdata->max_requests > max_wrs)
>> +		cdata->max_requests = max_wrs;
>> +
>> +	wrs = cdata->max_requests;
>> +	ep->rep_attr.cap.max_send_wr = wrs;
>> +	ep->rep_attr.cap.max_recv_wr = wrs;
>> +
>> +	ia->ri_max_frmr_depth =
>> +			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
>> +			      devattr->max_fast_reg_page_list_len);
>> +	dprintk("RPC:       %s: device's max FR page list len = %u\n",
>> +		__func__, ia->ri_max_frmr_depth);
>> +
>> +	/* Add room for frmr register and invalidate WRs.
>> +	 * 1. FRMR reg WR for head
>> +	 * 2. FRMR invalidate WR for head
>> +	 * 3. N FRMR reg WRs for pagelist
>> +	 * 4. N FRMR invalidate WRs for pagelist
>> +	 * 5. FRMR reg WR for tail
>> +	 * 6. FRMR invalidate WR for tail
>> +	 * 7. The RDMA_SEND WR
>> +	 */
>> +
>> +	/* Calculate N if the device max FRMR depth is smaller than
>> +	 * RPCRDMA_MAX_DATA_SEGS.
>> +	 */
>> +	if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
>> +		int delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
>> +
>> +		do {
>> +			depth += 2; /* FRMR reg + invalidate */
>> +			delta -= ia->ri_max_frmr_depth;
>> +		} while (delta > 0);
>> +	}
>> +
>> +	ep->rep_attr.cap.max_send_wr *= depth;
>> +	if (ep->rep_attr.cap.max_send_wr > max_wrs) {
>> +		cdata->max_requests = max_wrs / depth;
>> +		if (!cdata->max_requests)
>> +			return -EINVAL;
>> +		ep->rep_attr.cap.max_send_wr = cdata->max_requests *
>> +					       depth;
>> +	}
>> +
>> +	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
>> +		__func__, ep->rep_attr.cap.max_send_wr,
>> +		ep->rep_attr.cap.max_recv_wr);
>> +	return 0;
>> +}
>> +
>> /* FRWR mode conveys a list of pages per chunk segment. The
>>  * maximum length of that list is the FRWR page list depth.
>>  */
>> @@ -290,6 +349,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
>> const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
>> 	.ro_map				= frwr_op_map,
>> 	.ro_unmap			= frwr_op_unmap,
>> +	.ro_open			= frwr_op_open,
>> 	.ro_maxpages			= frwr_op_maxpages,
>> 	.ro_init			= frwr_op_init,
>> 	.ro_reset			= frwr_op_reset,
>> diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
>> index f8da8c4..0998f4f 100644
>> --- a/net/sunrpc/xprtrdma/physical_ops.c
>> +++ b/net/sunrpc/xprtrdma/physical_ops.c
>> @@ -19,6 +19,27 @@
>> # define RPCDBG_FACILITY	RPCDBG_TRANS
>> #endif
>> 
>> +static int
>> +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
>> +		 struct rpcrdma_create_data_internal *cdata)
>> +{
>> +	struct ib_device_attr *devattr = &ia->ri_devattr;
>> +	unsigned int wrs, max_wrs;
>> +
>> +	max_wrs = devattr->max_qp_wr;
>> +	if (cdata->max_requests > max_wrs)
>> +		cdata->max_requests = max_wrs;
>> +
>> +	wrs = cdata->max_requests;
>> +	ep->rep_attr.cap.max_send_wr = wrs;
>> +	ep->rep_attr.cap.max_recv_wr = wrs;
>> +
>> +	dprintk("RPC:       %s: pre-allocating %u send WRs, %u recv WRs\n",
>> +		__func__, ep->rep_attr.cap.max_send_wr,
>> +		ep->rep_attr.cap.max_recv_wr);
>> +	return 0;
>> +}
>> +
>> /* PHYSICAL memory registration conveys one page per chunk segment.
>>  */
>> static size_t
>> @@ -75,6 +96,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
>> const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
>> 	.ro_map				= physical_op_map,
>> 	.ro_unmap			= physical_op_unmap,
>> +	.ro_open			= physical_op_open,
>> 	.ro_maxpages			= physical_op_maxpages,
>> 	.ro_init			= physical_op_init,
>> 	.ro_reset			= physical_op_reset,
>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>> index dcbc736..17b2a29 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -621,11 +621,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>> 			dprintk("RPC:       %s: FRMR registration "
>> 				"not supported by HCA\n", __func__);
>> 			memreg = RPCRDMA_MTHCAFMR;
>> -		} else {
>> -			/* Mind the ia limit on FRMR page list depth */
>> -			ia->ri_max_frmr_depth = min_t(unsigned int,
>> -				RPCRDMA_MAX_DATA_SEGS,
>> -				devattr->max_fast_reg_page_list_len);
>> 		}
>> 	}
>> 	if (memreg == RPCRDMA_MTHCAFMR) {
>> @@ -734,56 +729,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
>> 	struct ib_cq *sendcq, *recvcq;
>> 	int rc, err;
>> 
>> -	/* check provider's send/recv wr limits */
>> -	if (cdata->max_requests > devattr->max_qp_wr)
>> -		cdata->max_requests = devattr->max_qp_wr;
>> +	rc = ia->ri_ops->ro_open(ia, ep, cdata);
>> +	if (rc)
>> +		return rc;
>> 
>> 	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
>> 	ep->rep_attr.qp_context = ep;
>> -	/* send_cq and recv_cq initialized below */
>> 	ep->rep_attr.srq = NULL;
>> -	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
>> -	switch (ia->ri_memreg_strategy) {
>> -	case RPCRDMA_FRMR: {
>> -		int depth = 7;
>> -
>> -		/* Add room for frmr register and invalidate WRs.
>> -		 * 1. FRMR reg WR for head
>> -		 * 2. FRMR invalidate WR for head
>> -		 * 3. N FRMR reg WRs for pagelist
>> -		 * 4. N FRMR invalidate WRs for pagelist
>> -		 * 5. FRMR reg WR for tail
>> -		 * 6. FRMR invalidate WR for tail
>> -		 * 7. The RDMA_SEND WR
>> -		 */
>> -
>> -		/* Calculate N if the device max FRMR depth is smaller than
>> -		 * RPCRDMA_MAX_DATA_SEGS.
>> -		 */
>> -		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
>> -			int delta = RPCRDMA_MAX_DATA_SEGS -
>> -				    ia->ri_max_frmr_depth;
>> -
>> -			do {
>> -				depth += 2; /* FRMR reg + invalidate */
>> -				delta -= ia->ri_max_frmr_depth;
>> -			} while (delta > 0);
>> -
>> -		}
>> -		ep->rep_attr.cap.max_send_wr *= depth;
>> -		if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
>> -			cdata->max_requests = devattr->max_qp_wr / depth;
>> -			if (!cdata->max_requests)
>> -				return -EINVAL;
>> -			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
>> -						       depth;
>> -		}
>> -		break;
>> -	}
>> -	default:
>> -		break;
>> -	}
>> -	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
>> 	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
>> 	ep->rep_attr.cap.max_recv_sge = 1;
>> 	ep->rep_attr.cap.max_inline_data = 0;
>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>> index a0e3c3e..a53a564 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -340,6 +340,9 @@ struct rpcrdma_memreg_ops {
>> 				  struct rpcrdma_mr_seg *, int, bool);
>> 	void		(*ro_unmap)(struct rpcrdma_xprt *,
>> 				    struct rpcrdma_req *, unsigned int);
>> +	int		(*ro_open)(struct rpcrdma_ia *,
>> +				   struct rpcrdma_ep *,
>> +				   struct rpcrdma_create_data_internal *);
>> 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
>> 	int		(*ro_init)(struct rpcrdma_xprt *);
>> 	void		(*ro_reset)(struct rpcrdma_xprt *);
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
> 
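
To make the work request accounting above concrete, here is a worked
example (the numbers are purely illustrative, not taken from any
particular HCA):

  Suppose RPCRDMA_MAX_DATA_SEGS is 64 and the device reports a
  max_fast_reg_page_list_len of 16, so ri_max_frmr_depth becomes 16.
  Starting from the base depth of 7, delta = 64 - 16 = 48, and the loop
  adds 2 three more times (48 -> 32 -> 16 -> 0), giving depth = 13.
  With max_requests at 128, max_send_wr becomes 128 * 13 = 1664; if the
  device's max_qp_wr were only 1024, max_requests would be scaled back
  to 1024 / 13 = 78 and max_send_wr to 78 * 13 = 1014.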

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com




^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 15:44     ` Christoph Hellwig
  0 siblings, 0 replies; 56+ messages in thread
From: Christoph Hellwig @ 2015-05-05 15:44 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-nfs, linux-rdma

On Fri, Mar 13, 2015 at 05:21:17PM -0400, Chuck Lever wrote:
> This is a series of client-side patches for NFS/RDMA. In preparation
> for increasing the transport credit limit and maximum rsize/wsize,
> I've re-factored the memory registration logic into separate files,
> invoked via a method API.

Just curious if you ever thought of moving this into the generic
rdma layer?

I've been working on an RDMA-based storage driver recently, and the
various different memory registration methods are a complete pain in the
ass, and impossible to test in an ULD without having access to all kinds
of different hardware.

And from what I see we literally don't use them much differently from the
generic DMA mapping API helpers (at a very high level), so it seems it
should be easy to move a slightly expanded version of your API to the core
code.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 16:04         ` Chuck Lever
  0 siblings, 0 replies; 56+ messages in thread
From: Chuck Lever @ 2015-05-05 16:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Linux NFS Mailing List, linux-rdma


On May 5, 2015, at 11:44 AM, Christoph Hellwig <hch@infradead.org> wrote:

> On Fri, Mar 13, 2015 at 05:21:17PM -0400, Chuck Lever wrote:
>> This is a series of client-side patches for NFS/RDMA. In preparation
>> for increasing the transport credit limit and maximum rsize/wsize,
>> I've re-factored the memory registration logic into separate files,
>> invoked via a method API.
> 
> Just curious if you ever thought of moving this into the generic
> rdma layer?

Not really. The new files are really just shims that adapt the RPC/RDMA
transport to memory registration verbs. There’s only a few hundred lines
per registration mode, and it’s all fairly specific to RPC/RDMA.

> I've been working on an RDMA-based storage driver recently, and the
> various different memory registration methods are a complete pain in the
> ass, and impossible to test in an ULD without having access to all kinds
> of different hardware.

Agree that the test matrix grows exponentially in complexity and
expense as more MR modes are introduced. We have strategies for
managing this when there’s a community involved, but when there’s
just one developer it’s a challenge.

> And from what I see we literally don't use them much differently from the
> generic DMA mapping API helpers (at a very high level), so it seems it
> should be easy to move a slightly expanded version of your API to the core
> code.

IMO FRWR is the only registration mode that has legs for the long term,
and is specifically designed for storage.

If you are not working on a legacy piece of code that has to support
older HCAs, why not stay with FRWR?

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com




^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 17:25             ` Christoph Hellwig
  0 siblings, 0 replies; 56+ messages in thread
From: Christoph Hellwig @ 2015-05-05 17:25 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Linux NFS Mailing List, linux-rdma

On Tue, May 05, 2015 at 12:04:00PM -0400, Chuck Lever wrote:
> > Just curious if you ever thought of moving this into the generic
> > rdma layer?
> 
> Not really. The new files are really just shims that adapt the RPC/RDMA
> transport to memory registration verbs. There’s only a few hundred lines
> per registration mode, and it’s all fairly specific to RPC/RDMA.

While it's using RPC/RDMA-specific data structures, it basically
abstracts out the action of mapping a number of pages onto the RDMA
hardware.  There isn't a whole lot of interaction with the actual
sunrpc layer except for a few hardcoded limits.

Btw, this is not a critique of the code; it's an obvious major
improvement over what was there before. It just seems like it would be
nice to move it up to a higher layer.

> > And from what I see we literally don't use them much differently from the
> > generic DMA mapping API helpers (at a very high level), so it seems it
> > should be easy to move a slightly expanded version of your API to the core
> > code.
> 
> IMO FRWR is the only registration mode that has legs for the long term,
> and is specifically designed for storage.
> 
> If you are not working on a legacy piece of code that has to support
> older HCAs, why not stay with FRWR?

The raw FRWR API seems like an absolute nightmare, and I'm bound to
get it wrong at first :)  This is only half joking, but despite that
it's the first target for sure.  It's just very frustrating that there
is no usable common API.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 18:14                 ` Tom Talpey
  0 siblings, 0 replies; 56+ messages in thread
From: Tom Talpey @ 2015-05-05 18:14 UTC (permalink / raw)
  To: Christoph Hellwig, Chuck Lever; +Cc: Linux NFS Mailing List, linux-rdma

On 5/5/2015 1:25 PM, Christoph Hellwig wrote:
> On Tue, May 05, 2015 at 12:04:00PM -0400, Chuck Lever wrote:
>> IMO FRWR is the only registration mode that has legs for the long term,
>> and is specifically designed for storage.
>>
>> If you are not working on a legacy piece of code that has to support
>> older HCAs, why not stay with FRWR?
>
> The raw FRWR API seems like an absolute nightmare, and I'm bound to
> get it wrong at first :)  This is only half joking, but despite that
> it's the first target for sure.  It's just very frustrating that there
> is no usable common API.

Memory registration is quite subtle with dependencies on the memory
being registered (user, kernel, physical), the requirements of the
upper layer (storage, etc) and the scope of the registration (scatter/
gather, memory protection, etc). I don't think you *want* a common API.

As you might guess, I can go on at length about this. :-) But, if
you have a kernel service, the ability to pin memory, and you
want it to go fast, you want FRWR.

Tom.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 19:10                     ` Christoph Hellwig
  0 siblings, 0 replies; 56+ messages in thread
From: Christoph Hellwig @ 2015-05-05 19:10 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List, linux-rdma

On Tue, May 05, 2015 at 02:14:30PM -0400, Tom Talpey wrote:
> As you might guess, I can go on at length about this. :-) But, if
> you have a kernel service, the ability to pin memory, and you
> want it to go fast, you want FRWR.

Basically most in-kernel consumers seem to have the same requirements:

 - register a struct page, which can be kernel or user memory (it's
   probably pinned in your terms, but we don't really use that term much
   in kernelspace).
 - In many but not all cases we might need an offset/length for each
   page (think struct bvec, paged sk_buffs, or scatterlists of some
   sort), in others an offset/len for the whole set of pages is fine,
   but that's a superset of the one above.
 - we usually want it to be as fast as possible
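
Just to give the discussion something concrete, a common entry point
covering the cases above might look roughly like this (the names and the
signature are invented here for illustration; nothing like it exists
today):

	struct rdma_reg_ctx;		/* opaque, provider-specific state */

	/*
	 * Sketch only: register a set of pages, each with its own
	 * offset/length, for a single I/O, and hand back one rkey
	 * via the context.
	 */
	int rdma_reg_pages(struct ib_pd *pd, struct scatterlist *sgl,
			   int nents, int access,
			   struct rdma_reg_ctx **ctx);

	/* Tear the registration down once the peer is done with it. */
	void rdma_dereg_pages(struct rdma_reg_ctx *ctx);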

While my grep skills aren't the best I can't really find an in-kernel
user that doesn't fit the above, although I might have missed some
long-term registrations where we don't really care about the fast
part.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 20:57                         ` Tom Talpey
  0 siblings, 0 replies; 56+ messages in thread
From: Tom Talpey @ 2015-05-05 20:57 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Chuck Lever, Linux NFS Mailing List, linux-rdma

On 5/5/2015 3:10 PM, Christoph Hellwig wrote:
> On Tue, May 05, 2015 at 02:14:30PM -0400, Tom Talpey wrote:
>> As you might guess, I can go on at length about this. :-) But, if
>> you have a kernel service, the ability to pin memory, and you
>> want it to go fast, you want FRWR.
>
> Basically most in-kernel consumers seem to have the same requirements:
>
>   - register a struct page, which can be kernel or user memory (it's
>     probably pinned in your terms, but we don't really use that term much
>     in kernelspace).

Actually, I strongly disagree that the in-kernel consumers want to
register a struct page. They want to register a list of pages, often
a rather long one. They want this because it allows the RDMA layer to
address the list with a single memory handle. This is where things
get tricky.

So the "pinned" or "wired" term is because in order to do RDMA, the
page needs to have a fixed mapping to this handle. Usually, that means
a physical address. There are some new approaches that allow the NIC
to raise a fault and/or walk kernel page tables, but one way or the
other the page had better be resident. RDMA NICs, generally speaking,
don't buffer in-flight RDMA data, nor do you want them to.

>   - In many but not all cases we might need an offset/length for each
>     page (think struct bvec, paged sk_buffs, or scatterlists of some
>     sort), in others an offset/len for the whole set of pages is fine,
>     but that's a superset of the one above.

Yep, RDMA calls this FBO and length, and further, the protocol requires
that the data itself be contiguous within the registration, that is, the
FBO can be non-zero, but no other holes be present.
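
(Purely as an illustration: registering three pages where the payload
starts 512 bytes into the first page gives an FBO of 512, and from there
the data must run without a gap through the remaining pages; a hole in
the middle would force a second registration.)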

>   - we usually want it to be as fast as possible

In the case of file protocols such as NFS/RDMA and SMB Direct, as well
as block protocols such as iSER, these registrations are set up and
torn down on a per-I/O basis, in order to protect the data from
misbehaving peers or misbehaving hardware. So to me as a storage
protocol provider, "usually" means "always".

I totally get where you're coming from, my main question is whether
it's possible to nail the requirements of some useful common API.
It has been tried before, shall I say.

Tom.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 21:06                             ` Christoph Hellwig
  0 siblings, 0 replies; 56+ messages in thread
From: Christoph Hellwig @ 2015-05-05 21:06 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List, linux-rdma

On Tue, May 05, 2015 at 04:57:55PM -0400, Tom Talpey wrote:
> Actually, I strongly disagree that the in-kernel consumers want to
> register a struct page. They want to register a list of pages, often
> a rather long one. They want this because it allows the RDMA layer to
> address the list with a single memory handle. This is where things
> get tricky.

Yes, I agree - my wording was wrong and if you look at the next point
it should be obvious that I meant multiple struct pages.

> So the "pinned" or "wired" term is because in order to do RDMA, the
> page needs to have a fixed mapping to this handle. Usually, that means
> a physical address. There are some new approaches that allow the NIC
> to raise a fault and/or walk kernel page tables, but one way or the
> other the page had better be resident. RDMA NICs, generally speaking,
> don't buffer in-flight RDMA data, nor do you want them to.

But that whole pain point only exists for userspace ib verbs consumers.
Any in-kernel consumer fits into the "pinned" or "wired" category,
as any local DMA requires it.

> >  - In many but not all cases we might need an offset/length for each
> >    page (think struct bvec, paged sk_buffs, or scatterlists of some
> >    sort), in others an offset/len for the whole set of pages is fine,
> >    but that's a superset of the one above.
> 
> Yep, RDMA calls this FBO and length, and further, the protocol requires
> that the data itself be contiguous within the registration, that is, the
> FBO can be non-zero, but no other holes be present.

The contiguous requirement isn't something we can always guarantee.
While a lot of I/O will have that form, the form where there are holes
can happen, although it's not common.

> >  - we usually want it to be as fast as possible
> 
> In the case of file protocols such as NFS/RDMA and SMB Direct, as well
> as block protocols such as iSER, these registrations are set up and
> torn down on a per-I/O basis, in order to protect the data from
> misbehaving peers or misbehaving hardware. So to me as a storage
> protocol provider, "usually" means "always".

Yes.  As I said I haven't actually found anything yet that doesn't fit
the pattern, but the RDMA in-kernel API is such a mess that I didn't
want to put my hand in the fire and say always.

> I totally get where you're coming from, my main question is whether
> it's possible to nail the requirements of some useful common API.
> It has been tried before, shall I say.

Do you have any information on these attempts and why they failed?  Note
that the only interesting ones would be for in-kernel consumers.
Userspace verbs are another order of magnitude more problems, so they're
not too interesting.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 21:32                                 ` Tom Talpey
  0 siblings, 0 replies; 56+ messages in thread
From: Tom Talpey @ 2015-05-05 21:32 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Chuck Lever, Linux NFS Mailing List, linux-rdma

On 5/5/2015 5:06 PM, Christoph Hellwig wrote:
> On Tue, May 05, 2015 at 04:57:55PM -0400, Tom Talpey wrote:
>> Actually, I strongly disagree that the in-kernel consumers want to
>> register a struct page. They want to register a list of pages, often
>> a rather long one. They want this because it allows the RDMA layer to
>> address the list with a single memory handle. This is where things
>> get tricky.
>
> Yes, I agree - my wording was wrong and if you look at the next point
> it should be obvious that I meant multiple struct pages.

Ok sounds good.

>
>> So the "pinned" or "wired" term is because in order to do RDMA, the
>> page needs to have a fixed mapping to this handle. Usually, that means
>> a physical address. There are some new approaches that allow the NIC
>> to raise a fault and/or walk kernel page tables, but one way or the
>> other the page had better be resident. RDMA NICs, generally speaking,
>> don't buffer in-flight RDMA data, nor do you want them to.
>
> But that whole pain point only exists for userspace ib verbs consumers.
> Any in-kernel consumer fits into the "pinned" or "wired" category,
> as any local DMA requires it.

True, but I think there's a bit more to it. For example, the buffer
cache is pinned, but the data on the page isn't dedicated to an i/o,
it's shared among file-layer stuff. Of course, a file-layer RDMA
protocol needs to play by those rules, but I'll use it as a warning
that it's not always simple.

Totally agree that kernel memory handling is easier than userspace,
and also that userspace APIs need to have appropriate kernel setup.
Note, this wasn't always the case. In the 2.4 days, when we first
coded the NFS/RDMA client, there was some rather ugly stuff.

>>>   - In many but not all cases we might need an offset/length for each
>>>     page (think struct bvec, paged sk_buffs, or scatterlists of some
>>>     sort), in others an offset/len for the whole set of pages is fine,
>>>     but that's a superset of the one above.
>>
>> Yep, RDMA calls this FBO and length, and further, the protocol requires
>> that the data itself be contiguous within the registration, that is, the
>> FBO can be non-zero, but no other holes be present.
>
> The contiguous requirement isn't something we can always guarantee.
> While a lot of I/O will have that form, the form where there are holes
> can happen, although it's not common.

Yeah, and the important takeaway is that a memory registration API
can't hide this - meaning, the upper layer needs to address it (hah!).
Often, once an upper layer has to do this, it can do better by doing
it itself. But that's perhaps too philosophical here. Let me just
say that transparency has proved to be the enemy of performance.

>
>>>   - we usually want it to be as fast as possible
>>
>> In the case of file protocols such as NFS/RDMA and SMB Direct, as well
>> as block protocols such as iSER, these registrations are set up and
>> torn down on a per-I/O basis, in order to protect the data from
>> misbehaving peers or misbehaving hardware. So to me as a storage
>> protocol provider, "usually" means "always".
>
> Yes.  As I said I haven't actually found anything yet that doesn't fit
> the pattern, but the RDMA in-kernel API is such a mess that I didn't
> want to put my hand in the fire and say always.
>
>> I totally get where you're coming from, my main question is whether
>> it's possible to nail the requirements of some useful common API.
>> It has been tried before, shall I say.
>
> Do you have any information on these attempts and why they failed?  Note
> that the only interesting ones would be for in-kernel consumers.
> Userspace verbs are another order of magnitude more problems, so they're
> not too interesting.

Hmm, most of these are userspace API experiences, and I would not be
so quick as to dismiss their applicability, or their lessons. There
was the old "kvipl" (kernel VI Provider Library), which had certain
simple memreg functions, but I'm not sure that API was ever in the
public domain (it was Intel's). There's kDAPL, based on DAPL which is
actually successful but exposes a somewhat different memory registration
model. And Solaris has an abstraction, which I haven't looked at
in years.

Up a layer, you might look into Portals, the many MPI implementations,
and maybe even some network shared memory stuff like clusters. Most of
these have been implemented as layers atop verbs (among others).

Tom.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-05 22:38                                     ` Jason Gunthorpe
  0 siblings, 0 replies; 56+ messages in thread
From: Jason Gunthorpe @ 2015-05-05 22:38 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List, linux-rdma

On Tue, May 05, 2015 at 05:32:21PM -0400, Tom Talpey wrote:

> >Do you have any information on these attempts and why they failed?  Note
> >that the only interesting ones would be for in-kernel consumers.
> >Userspace verbs are another order of magnitude more problems, so they're
> >not too interesting.
> 
> Hmm, most of these are userspace API experiences, and I would not be
> so quick as to dismiss their applicability, or their lessons.

The specific use-case of RDMA to/from a logical linear region broken
up into HW pages is incredibly kernel-specific, and very friendly to
hardware support.

Heck, on modern systems 100% of these requirements can be solved just
by using the IOMMU. No need for the HCA at all. (HCA may be more
performant, of course)

This is a huge pain for everyone. ie The Lustre devs were talking
about how Lustre is not performant on newer HCAs because their code
doesn't support the new MR scheme.

It makes sense to me to have a dedicated API for this work load:

'post outbound rdma send/write of page region'
'prepare inbound rdma write of page region'
'post rdma read, result into page region'
'complete X'
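
A very rough sketch of the shape I have in mind (every name below is
made up for illustration, none of it exists in the tree):

	struct rdma_page_region {
		struct page	**pages;	/* logically linear buffer */
		unsigned int	npages;
		unsigned int	offset;		/* byte offset into pages[0] */
		size_t		length;
	};

	int rdma_post_region_write(struct ib_qp *qp,
				   struct rdma_page_region *r,
				   u64 remote_addr, u32 rkey);
	int rdma_prepare_region_for_write(struct ib_pd *pd,
					  struct rdma_page_region *r,
					  u32 *rkey);
	int rdma_post_region_read(struct ib_qp *qp,
				  struct rdma_page_region *r,
				  u64 remote_addr, u32 rkey);
	void rdma_complete_region(struct rdma_page_region *r);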

I'd love to see someone propose some patches :)

Jason

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06  0:16                                         ` Tom Talpey
  0 siblings, 0 replies; 56+ messages in thread
From: Tom Talpey @ 2015-05-06  0:16 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List,
	linux-rdma, Steve French

I'm adding Steve French because I'm about to talk about SMB.

On 5/5/2015 6:38 PM, Jason Gunthorpe wrote:
> On Tue, May 05, 2015 at 05:32:21PM -0400, Tom Talpey wrote:
>
>>> Do you have any information on these attempts and why the failed?  Note
>>> that the only interesting ones would be for in-kernel consumers.
>>> Userspace verbs are another order of magnitude more problems, so they're
>>> not too interesting.
>>
>> Hmm, most of these are userspace API experiences, and I would not be
>> so quick as to dismiss their applicability, or their lessons.
>
> The specific use-case of RDMA to/from a logical linear region broken
> up into HW pages is incredibly kernel-specific, and very friendly to
> hardware support.
>
> Heck, on modern systems 100% of these requirements can be solved just
> by using the IOMMU. No need for the HCA at all. (HCA may be more
> performant, of course)

I don't agree on "100%", because IOMMUs don't have the same protection
attributes as RDMA adapters (local R, local W, remote R, remote W). Also
they don't support handles for page lists quite like STags/RMRs, so they
require additional (R)DMA scatter/gather. But, I agree with your point
that they translate addresses just great.

> This is a huge pain for everyone. ie The Lustre devs were talking
> about how Lustre is not performant on newer HCAs because their code
> doesn't support the new MR scheme.
>
> It makes sense to me to have a dedicated API for this work load:
>
> 'post outbound rdma send/write of page region'

A bunch of writes followed by a send is a common sequence, but not
very complex (I think).

> 'prepare inbound rdma write of page region'

This is memory registration, with remote writability. That's what
the rpcrdma_register_external() API in xprtrdma/verbs.c does. It
takes a private rpcrdma structure, but it supports multiple memreg
strategies and pretty much does what you expect. I'm sure someone
could abstract it upward.

> 'post rdma read, result into page region'

The svcrdma stuff in the NFS RDMA server has this; it's called from
the XDR decoding.

> 'complete X'

This is trickier - invalidation has many interesting error cases.
But, on a sunny day with the breeze at our backs, sure.

> I'd love to see someone propose some patches :)

I'd like to mention something else. Many upper layers basically want
a socket, but memory registration and explicit RDMA break that. There
have been some relatively awful solutions to make it all transparent;
let's not go there.

The RPC/RDMA protocol was designed to tuck underneath RPC and
XDR, so, while not socket-like, it allowed RPC to hide RDMA
from (for example) NFS. NFS therefore did not have to change.
I thought transparency was a good idea at the time.

SMB Direct, in Windows, presents a socket-like interface for messaging
(connection, send/receive, etc), but makes memory registration and
RDMA Read / Write explicit. It's the SMB3 protocol that drives RDMA,
which it does only for SMB_READ and SMB_WRITE. The SMB3 upper layer
knows it's on an RDMA-capable connection, and "sets up" the transfer
by explicitly deciding to do an RDMA, which it does by asking the
SMB Direct driver to register memory. It then gets back one or more
handles, which it sends to the server in the SMB3 layer message.
The server performs the RDMA, and the reply indicates the result.
After which, the SMB3 upper layer explicitly de-registers.
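
In Linux terms, that explicit pattern would look something like the
following from the upper layer's side (the helper names are invented,
only to show the shape of the calls):

	/* Purely illustrative -- none of these helpers exist. */
	static int ulp_read_via_rdma(struct ulp_conn *conn,
				     struct page **pages,
				     unsigned int npages, size_t len)
	{
		u32 handle;
		int rc;

		/* The upper layer decides this transfer is worth an RDMA
		 * and registers the receive buffer for remote write. */
		rc = ulp_rdma_register(conn, pages, npages,
				       ULP_REMOTE_WRITE, &handle);
		if (rc)
			return rc;

		/* The handle travels in the upper-layer request; the peer
		 * RDMA Writes the payload into 'pages' and then replies. */
		rc = ulp_send_read_request(conn, handle, len);

		/* Explicitly tear the registration down either way. */
		ulp_rdma_deregister(conn, handle);
		return rc;
	}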

If Linux upper layers considered adopting a similar approach by
carefully inserting RDMA operations conditionally, it could make
the lower layer's job much more efficient. And, efficiency is speed.
And in the end, the API throughout the stack would be simpler.

MHO.

Tom.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06  7:01                                         ` Bart Van Assche
  0 siblings, 0 replies; 56+ messages in thread
From: Bart Van Assche @ 2015-05-06  7:01 UTC (permalink / raw)
  To: Jason Gunthorpe, Tom Talpey
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List, linux-rdma

On 05/06/15 00:38, Jason Gunthorpe wrote:
> Heck, on modern systems 100% of these requirements can be solved just
> by using the IOMMU. No need for the HCA at all. (HCA may be more
> performant, of course)

Hello Jason,

Any performance tests I have run so far with the IOMMU enabled show much
worse results than the same tests with the IOMMU disabled. The perf tool
showed me that this performance difference is due to lock contention
caused by the IOMMU kernel code. I have not yet tried to verify whether
this is an implementation issue or something fundamental.

This is why I'm not enthusiastic about any approach that relies on the
IOMMU being enabled.

Bart.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06  7:09                                 ` Bart Van Assche
  0 siblings, 0 replies; 56+ messages in thread
From: Bart Van Assche @ 2015-05-06  7:09 UTC (permalink / raw)
  To: Christoph Hellwig, Tom Talpey
  Cc: Chuck Lever, Linux NFS Mailing List, linux-rdma

On 05/05/15 23:06, Christoph Hellwig wrote:
> The contiguous requirements isn't something we can alway guarantee.
> While a lot of I/O will have that form the form where there are holes
> can happen, although it's not common.

Indeed. That is why there is code in the SRP initiator that uses 
multiple FRWR registrations when a discontiguous SG-list is passed to 
that driver by the SCSI mid-layer. Some time ago I posted a test 
program that makes the SCSI mid-layer submit a discontiguous I/O request 
to a SCSI LLD (see also 
https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg21224.html).
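
To illustrate the discontiguity problem, here is a small self-contained
C model of the check involved: a single page-list registration can only
cover entries that butt up against page boundaries, so each hole forces
another registration. This is a sketch of the idea, not the SRP
initiator's actual code.

#define MODEL_PAGE_SIZE 4096u

/* Simplified stand-in for one scatterlist entry. */
struct sg_ent {
	unsigned int offset;	/* offset within its first page */
	unsigned int length;
};

/* Count how many separate page-list registrations this list would need. */
static int count_registrations(const struct sg_ent *sg, int nents)
{
	int i, regions;

	if (nents <= 0)
		return 0;

	regions = 1;
	for (i = 1; i < nents; i++) {
		int prev_fills_page =
			(sg[i - 1].offset + sg[i - 1].length) % MODEL_PAGE_SIZE == 0;
		int cur_starts_page = sg[i].offset == 0;

		/* A hole in the middle: start a new registration. */
		if (!prev_fills_page || !cur_starts_page)
			regions++;
	}
	return regions;
}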

Bart.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06  7:29                                     ` Christoph Hellwig
  0 siblings, 0 replies; 56+ messages in thread
From: Christoph Hellwig @ 2015-05-06  7:29 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, Tom Talpey, Chuck Lever,
	Linux NFS Mailing List, linux-rdma

On Wed, May 06, 2015 at 09:09:36AM +0200, Bart Van Assche wrote:
> On 05/05/15 23:06, Christoph Hellwig wrote:
> >The contiguous requirements isn't something we can alway guarantee.
> >While a lot of I/O will have that form the form where there are holes
> >can happen, although it's not common.
> 
> Indeed. That is why there is code in the SRP initiator that uses multiple
> FRWR registrations when a discontiguous SG-list is passed to that driver by
> the SCSI mid-layer. Some time ago I had posted a test program that makes the
> SCSI mid-layer submit a discontiguous I/O request to a SCSI LLD (see also
> https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg21224.html).

Note that the block layer now has the (fairly misnamed) QUEUE_FLAG_SG_GAPS
flag, which ensures drivers don't see S/G lists like this one.  I
haven't ever seen normal TYPE_FS producers submit bios like that, so in
practice the effect is to reject vectored SG_IO uses like your test
program.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06  7:33                                     ` Christoph Hellwig
  0 siblings, 0 replies; 56+ messages in thread
From: Christoph Hellwig @ 2015-05-06  7:33 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List, linux-rdma

On Tue, May 05, 2015 at 05:32:21PM -0400, Tom Talpey wrote:
> >But that whole painpoint only existist for userspace ib verbs consumers.
> >And in-kernel consumer fits into the "pinned" or "wired" categegory,
> >as any local DMA requires it.
> 
> True, but I think there's a bit more to it. For example, the buffer
> cache is pinned, but the data on the page isn't dedicated to an i/o,
> it's shared among file-layer stuff. Of course, a file-layer RDMA
> protocol needs to play by those rules, but I'll use it as a warning
> that it's not always simple.

Actually there is no buffer cache anymore, and the page cache now
keeps pages under writeback stable, that is, it doesn't allow modifications
while the page is being written back.

Not that it matters, as both direct I/O for filesystems and SG_IO for
block devices allow I/O to user pages that can be modified while
DMA is in progress.  So pretty much every in-kernel user has to deal
with this case, maybe with the exception of network protocols.

> >The contiguous requirements isn't something we can alway guarantee.
> >While a lot of I/O will have that form the form where there are holes
> >can happen, although it's not common.
> 
> Yeah, and the important takeaway is that a memory registration API
> can't hide this - meaning, the upper layer needs to address it (hah!).
> Often, once an upper layer has to do this, it can do better by doing
> it itself. But that's perhaps too philosophical here. Let me just
> say that transparency has proved to be the enemy of performance.

Yes, I don't think that's something that should be worked around in
an API at a low level.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06 12:15                 ` Sagi Grimberg
  0 siblings, 0 replies; 56+ messages in thread
From: Sagi Grimberg @ 2015-05-06 12:15 UTC (permalink / raw)
  To: Christoph Hellwig, Chuck Lever; +Cc: Linux NFS Mailing List, linux-rdma

On 5/5/2015 8:25 PM, Christoph Hellwig wrote:
> On Tue, May 05, 2015 at 12:04:00PM -0400, Chuck Lever wrote:
>>> Just curious if you ever though of moving this into the generic
>>> rdma layer?
>>
>> Not really. The new files are really just shims that adapt the RPC/RDMA
>> transport to memory registration verbs. There's only a few hundred lines
>> per registration mode, and it's all fairly specific to RPC/RDMA.
>
> While it's using RPC/RDMA specific data structures it basically
> abstracts out the action of mapping a number of pages onto the rdma
> hardware.   There isn't a whole lot of interaction with the actual
> sunrpc layer except for a few hardcoded limits.
>
> Btw, this is not a critique of the code, it's an obvious major
> improvement of what was there before, it justs seems like it would be
> nice to move it up to a higher layer.
>
>>> And from I see we litterly dont use them much different from the generic
>>> dma mapping API helpers (at a very high level) so it seems it should
>>> be easy to move a slightly expanded version of your API to the core
>>> code.
>>
>> IMO FRWR is the only registration mode that has legs for the long term,
>> and is specifically designed for storage.
>>
>> If you are not working on a legacy piece of code that has to support
>> older HCAs, why not stay with FRWR?

Hey Christoph,

I agree here,

FMRs (and FMR pools) are not supported over VFs. Also, I've seen some
unpredictable performance in certain workloads because the FMR pool
maintains a flush thread that executes a HW sync (which blocks terribly
on mlx4 devices) whenever it hits the dirty_watermark...

If you are writing a driver, I suggest sticking with FRWR as well.

>
> The raw FRWR API seems like an absolute nightmare, and I'm bound to
> get it wrong at first :)  This is only half joking, but despite that
> it's the first target for sure.  It's just very frustrating that there
> is no usable common API.

The FRWR API is a work request interface. The advantage here is the
option to concatenate it with a send/rdma work request and save an extra
send queue lock and, more importantly, a doorbell. This matters under
high workloads. The iSER target is doing this, and I have a patch for
the initiator code as well.
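
As an illustration of the chaining idea (not the real ib_send_wr layout
or a real provider), a self-contained C model might look like this:

#include <stdint.h>

/* Simplified model of a send work request that can be chained via ->next. */
enum wr_op { WR_FAST_REG, WR_SEND };

struct model_wr {
	struct model_wr *next;
	enum wr_op opcode;
	uint64_t wr_id;
};

/* Stand-in for the provider's post-send verb: it takes a whole chain,
 * so registration plus data transfer cost one SQ lock and one doorbell. */
static int model_post_send(struct model_wr *first)
{
	(void)first;	/* a real provider would ring the doorbell here */
	return 0;
}

/* Chain a fast-registration WR in front of the SEND that uses the newly
 * registered memory and post both together, as described above. */
static int register_and_send(void)
{
	struct model_wr reg = { .opcode = WR_FAST_REG, .wr_id = 1 };
	struct model_wr snd = { .opcode = WR_SEND,     .wr_id = 2 };

	reg.next = &snd;	/* registration executes before the SEND */
	return model_post_send(&reg);
}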

I'm not sure that providing an API that still lets you post such
work-request lists would be any simpler...

Sagi.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06 16:20                                             ` Jason Gunthorpe
  0 siblings, 0 replies; 56+ messages in thread
From: Jason Gunthorpe @ 2015-05-06 16:20 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Christoph Hellwig, Chuck Lever, Linux NFS Mailing List,
	linux-rdma, Steve French

On Tue, May 05, 2015 at 08:16:01PM -0400, Tom Talpey wrote:

> >The specific use-case of a RDMA to/from a logical linear region broken
> >up into HW pages is incredibly kernel specific, and very friendly to
> >hardware support.
> >
> >Heck, on modern systems 100% of these requirements can be solved just
> >by using the IOMMU. No need for the HCA at all. (HCA may be more
> >performant, of course)
> 
> I don't agree on "100%", because IOMMUs don't have the same protection
> attributes as RDMA adapters (local R, local W, remote R, remote W).

No, you do get protection - the IOMMU isn't the only resource; it would
still have to be combined with several pre-setup MRs that have the
proper protection attributes. You'd map the page list into the address
space that is covered by an MR that has the protection attributes
needed.
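
As a thought experiment only, that combination could look roughly like
the following self-contained C model: a handful of pre-created MRs, one
per set of access rights, with the IOMMU supplying the address
translation and the MR supplying the protection. All names here are
invented for illustration.

#include <stddef.h>

enum prot {
	PROT_LOCAL_READ   = 1 << 0,
	PROT_LOCAL_WRITE  = 1 << 1,
	PROT_REMOTE_READ  = 1 << 2,
	PROT_REMOTE_WRITE = 1 << 3,
};

/* One pre-set-up MR covering the IOMMU-mapped range with fixed rights. */
struct premade_mr {
	unsigned int access;	/* combination of PROT_* bits */
	unsigned int rkey;
};

/* Pick the pre-created MR whose rights cover what this operation needs. */
static const struct premade_mr *pick_mr(const struct premade_mr *tbl,
					int n, unsigned int needed)
{
	int i;

	for (i = 0; i < n; i++)
		if ((tbl[i].access & needed) == needed)
			return &tbl[i];
	return NULL;	/* no MR with sufficient rights */
}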

> Also they don't support handles for page lists quite like
> STags/RMRs, so they require additional (R)DMA scatter/gather. But, I
> agree with your point that they translate addresses just great.

??? the entire point of using the IOMMU in a context like this is to
linearize the page list into a DMA'able address. How could you ever need
to scatter/gather when your memory is linear?

> >'post outbound rdma send/write of page region'
> 
> A bunch of writes followed by a send is a common sequence, but not
> very complex (I think).

So, I wasn't clear, I mean a general API that can post a SEND or RDMA
WRITE using a logically linear page list as the data source.

So this results in one of:
 1) A SEND with a gather list
 2) A SEND with a temporary linearized MR
 3) A series of RDMA WRITE with gather lists
 4) A RDMA WRITE with a temporary linearized MR

Picking one depends on the performance of the HCA and the various
features it supports. Even just the really simple options of #1 and #3
become a bit more complex when you want to take advantage of
transparent huge pages to reduce gather list length.

For instance, deciding when to trade off 3 vs 4 is going to be very
driver specific.
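
To make the decision space concrete, here is a self-contained C sketch
of such a driver-side choice among the four shapes listed above. The
thresholds are placeholders; as noted, the real cut-over points are
device and driver specific.

#include <stdbool.h>

/* The four shapes listed above, as a hypothetical driver-side decision. */
enum xfer_strategy {
	SEND_GATHER,	/* 1: SEND with a gather list */
	SEND_TEMP_MR,	/* 2: SEND with a temporary linearized MR */
	WRITE_GATHER,	/* 3: series of RDMA WRITEs with gather lists */
	WRITE_TEMP_MR,	/* 4: RDMA WRITE with a temporary linearized MR */
};

static enum xfer_strategy pick_strategy(bool is_send, int sge_needed,
					int max_sge, int writes_worth_posting)
{
	if (is_send)
		return sge_needed <= max_sge ? SEND_GATHER : SEND_TEMP_MR;

	/* A few gather-list WRITEs are cheap; past some point one
	 * temporary MR beats posting many work requests. */
	if ((sge_needed + max_sge - 1) / max_sge <= writes_worth_posting)
		return WRITE_GATHER;
	return WRITE_TEMP_MR;
}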

> >'prepare inbound rdma write of page region'
> 
> This is memory registration, with remote writability. That's what
> the rpcrdma_register_external() API in xprtrdma/verbs.c does. It
> takes a private rpcrdma structure, but it supports multiple memreg
> strategies and pretty much does what you expect. I'm sure someone
> could abstract it upward.

Right, most likely an implementation would just pull the NFS code into
the core; I think it is the broadest version we have?

> >'complete X'
> 
> This is trickier - invalidation has many interesting error cases.
> But, on a sunny day with the breeze at our backs, sure.

I don't mean send+invalidate, this is the 'free' for the 'alloc' the
above APIs might need (ie the temporary MR). You can't fail to free
the MR - that would be an insane API :)

> If Linux upper layers considered adopting a similar approach by
> carefully inserting RDMA operations conditionally, it can make
> the lower layer's job much more efficient. And, efficiency is speed.
> And in the end, the API throughout the stack will be simpler.

No idea for Linux. It seems to me most of the use cases we are talking
about here are not actually assuming a socket: NFS-RDMA, SRP, iSER, and
Lustre are all explicitly driving verbs and explicitly working with page
lists for their high-speed side.

Does that mean we are already doing what you are talking about?

Jason

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1
@ 2015-05-06 16:38                                             ` Jason Gunthorpe
  0 siblings, 0 replies; 56+ messages in thread
From: Jason Gunthorpe @ 2015-05-06 16:38 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Tom Talpey, Christoph Hellwig, Chuck Lever,
	Linux NFS Mailing List, linux-rdma

On Wed, May 06, 2015 at 09:01:07AM +0200, Bart Van Assche wrote:
> On 05/06/15 00:38, Jason Gunthorpe wrote:
> >Heck, on modern systems 100% of these requirements can be solved just
> >by using the IOMMU. No need for the HCA at all. (HCA may be more
> >performant, of course)
> 
> Hello Jason,
> 
> Any performance tests I have run so far with the IOMMU enabled show
> much worse results than the same test with the IOMMU disabled. The
> perf tool learned me that this performance difference is due to lock
> contention caused by the IOMMU kernel code. I have not yet tried to
> verify whether this is an implementation issue or something
> fundamental.

I'm not surprised, I think that is well known.

Just to be clear I'm not saying we should rely on the IOMMU, or even
implement anything that uses it - but as a thought exercise, the fact
we could implement a page list API entirely with the dumbest HCA and
the IOMMU suggests strongly to me it is a sane API direction to look
at.

If you did have a dumb HCA, using the IOMMU is probably a lot faster
than doing a heavy MR registration or doing operations 'page at a
time'. Which would still be slower than using a smart HCA with the
IOMMU turned off, for that workload.
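
For what it's worth, the "linearize via the IOMMU" idea maps onto the
existing DMA API: with a capable IOMMU, dma_map_sg() may hand back a
single merged segment for a multi-page list. The kernel-style fragment
below is only a sketch of that effect, not code taken from any driver.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/errno.h>

static int map_linearized(struct device *dev, struct scatterlist *sgl,
			  int nents, dma_addr_t *addr, unsigned int *len)
{
	int mapped = dma_map_sg(dev, sgl, nents, DMA_BIDIRECTIONAL);

	if (mapped == 0)
		return -EIO;

	/* With an IOMMU the whole list may come back as one DMA segment,
	 * i.e. already "linearized" without involving the HCA at all. */
	if (mapped == 1) {
		*addr = sg_dma_address(sgl);
		*len  = sg_dma_len(sgl);
		return 0;
	}

	dma_unmap_sg(dev, sgl, nents, DMA_BIDIRECTIONAL);
	return -EAGAIN;	/* caller falls back to per-segment posting or an MR */
}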

Jason

^ permalink raw reply	[flat|nested] 56+ messages in thread

end of thread, other threads:[~2015-05-06 16:38 UTC | newest]

Thread overview: 56+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-13 21:21 [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Chuck Lever
2015-03-13 21:21 ` [PATCH v1 01/16] xprtrdma: Display IPv6 addresses and port numbers correctly Chuck Lever
2015-03-13 21:21 ` [PATCH v1 02/16] xprtrdma: Perform a full marshal on retransmit Chuck Lever
2015-03-13 21:21 ` [PATCH v1 03/16] xprtrdma: Add vector of ops for each memory registration strategy Chuck Lever
2015-03-13 21:21 ` [PATCH v1 04/16] xprtrdma: Add a "max_payload" op for each memreg mode Chuck Lever
2015-03-13 21:22 ` [PATCH v1 05/16] xprtrdma: Add a "register_external" " Chuck Lever
2015-03-13 21:22 ` [PATCH v1 06/16] xprtrdma: Add a "deregister_external" " Chuck Lever
2015-03-17 14:37   ` Anna Schumaker
2015-03-17 15:04     ` Chuck Lever
2015-03-13 21:22 ` [PATCH v1 07/16] xprtrdma: Add "init MRs" memreg op Chuck Lever
2015-03-13 21:22 ` [PATCH v1 08/16] xprtrdma: Add "reset " Chuck Lever
2015-03-13 21:22 ` [PATCH v1 09/16] xprtrdma: Add "destroy " Chuck Lever
2015-03-13 21:22 ` [PATCH v1 10/16] xprtrdma: Add "open" " Chuck Lever
2015-03-17 15:16   ` Anna Schumaker
2015-03-17 15:19     ` Chuck Lever
2015-03-13 21:23 ` [PATCH v1 11/16] xprtrdma: Handle non-SEND completions via a callout Chuck Lever
2015-03-13 21:23 ` [PATCH v1 12/16] xprtrdma: Acquire FMRs in rpcrdma_fmr_register_external() Chuck Lever
2015-03-13 21:23 ` [PATCH v1 13/16] xprtrdma: Acquire MRs in rpcrdma_register_external() Chuck Lever
2015-03-13 21:23 ` [PATCH v1 14/16] xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy Chuck Lever
2015-03-13 21:23 ` [PATCH v1 15/16] xprtrdma: Make rpcrdma_{un}map_one() into inline functions Chuck Lever
2015-03-13 21:23 ` [PATCH v1 16/16] xprtrdma: Split rb_lock Chuck Lever
     [not found] ` <20150313211124.22471.14517.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>
2015-05-05 15:44   ` [PATCH v1 00/16] NFS/RDMA patches proposed for 4.1 Christoph Hellwig
2015-05-05 15:44     ` Christoph Hellwig
     [not found]     ` <20150505154411.GA16729-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-05-05 16:04       ` Chuck Lever
2015-05-05 16:04         ` Chuck Lever
     [not found]         ` <5E1B32EA-9803-49AA-856D-BF0E1A5DFFF4-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2015-05-05 17:25           ` Christoph Hellwig
2015-05-05 17:25             ` Christoph Hellwig
     [not found]             ` <20150505172540.GA19442-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-05-05 18:14               ` Tom Talpey
2015-05-05 18:14                 ` Tom Talpey
     [not found]                 ` <55490886.4070502-CLs1Zie5N5HQT0dZR+AlfA@public.gmane.org>
2015-05-05 19:10                   ` Christoph Hellwig
2015-05-05 19:10                     ` Christoph Hellwig
     [not found]                     ` <20150505191012.GA21164-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-05-05 20:57                       ` Tom Talpey
2015-05-05 20:57                         ` Tom Talpey
     [not found]                         ` <55492ED3.7000507-CLs1Zie5N5HQT0dZR+AlfA@public.gmane.org>
2015-05-05 21:06                           ` Christoph Hellwig
2015-05-05 21:06                             ` Christoph Hellwig
     [not found]                             ` <20150505210627.GA5941-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-05-05 21:32                               ` Tom Talpey
2015-05-05 21:32                                 ` Tom Talpey
     [not found]                                 ` <554936E5.80607-CLs1Zie5N5HQT0dZR+AlfA@public.gmane.org>
2015-05-05 22:38                                   ` Jason Gunthorpe
2015-05-05 22:38                                     ` Jason Gunthorpe
     [not found]                                     ` <20150505223855.GA7696-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-05-06  0:16                                       ` Tom Talpey
2015-05-06  0:16                                         ` Tom Talpey
     [not found]                                         ` <55495D41.5090502-CLs1Zie5N5HQT0dZR+AlfA@public.gmane.org>
2015-05-06 16:20                                           ` Jason Gunthorpe
2015-05-06 16:20                                             ` Jason Gunthorpe
2015-05-06  7:01                                       ` Bart Van Assche
2015-05-06  7:01                                         ` Bart Van Assche
     [not found]                                         ` <5549BC33.30905-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-05-06 16:38                                           ` Jason Gunthorpe
2015-05-06 16:38                                             ` Jason Gunthorpe
2015-05-06  7:33                                   ` Christoph Hellwig
2015-05-06  7:33                                     ` Christoph Hellwig
2015-05-06  7:09                               ` Bart Van Assche
2015-05-06  7:09                                 ` Bart Van Assche
     [not found]                                 ` <5549BE30.8020505-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-05-06  7:29                                   ` Christoph Hellwig
2015-05-06  7:29                                     ` Christoph Hellwig
2015-05-06 12:15               ` Sagi Grimberg
2015-05-06 12:15                 ` Sagi Grimberg
2015-03-13 21:26 Chuck Lever
