* [PATCH v1 0/6] Sometimes pull up the Send buffer
@ 2019-10-17 18:31 Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 1/6] xprtrdma: Ensure ri_id is stable during MR recycling Chuck Lever
                   ` (5 more replies)
  0 siblings, 6 replies; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

This series contains several clean-up/refactoring patches; the last
two patches re-implement buffer pull-up for Send buffers. After this
series is applied, the client may choose between using DMA-mapped
Send buffers and pull-up, since sometimes one of these methods is
strictly more efficient than the other.

---

Chuck Lever (6):
      xprtrdma: Ensure ri_id is stable during MR recycling
      xprtrdma: Remove rpcrdma_sendctx::sc_xprt
      xprtrdma: Remove rpcrdma_sendctx::sc_device
      xprtrdma: Move the rpcrdma_sendctx::sc_wr field
      xprtrdma: Refactor rpcrdma_prepare_msg_sges()
      xprtrdma: Pull up sometimes


 net/sunrpc/xprtrdma/backchannel.c |    2 
 net/sunrpc/xprtrdma/frwr_ops.c    |   25 +--
 net/sunrpc/xprtrdma/rpc_rdma.c    |  352 ++++++++++++++++++++++++-------------
 net/sunrpc/xprtrdma/verbs.c       |   26 +--
 net/sunrpc/xprtrdma/xprt_rdma.h   |   14 -
 5 files changed, 251 insertions(+), 168 deletions(-)

--
Chuck Lever

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v1 1/6] xprtrdma: Ensure ri_id is stable during MR recycling
  2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
@ 2019-10-17 18:31 ` Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 2/6] xprtrdma: Remove rpcrdma_sendctx::sc_xprt Chuck Lever
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

ia->ri_id is replaced during a reconnect. The connect_worker runs
with the transport send lock held to prevent ri_id from being
dereferenced by the send_request path during this process.

Currently, however, there is no guarantee that ia->ri_id is stable
in the MR recycling worker, which operates in the background and is
not serialized with the connect_worker in any way.

But now that LOCAL_INV completions are handled in process context,
the recycling operation can be done there directly instead of being
deferred to a separate worker. Because the disconnect path drains
all work before allowing teardown to proceed, Local Invalidations
are guaranteed to complete only while the ri_id pointer is stable.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   23 ++++++-----------------
 net/sunrpc/xprtrdma/xprt_rdma.h |    7 -------
 2 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 37ba82d..5cd8715 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -88,8 +88,10 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
 	kfree(mr);
 }
 
-static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
 {
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
 	trace_xprtrdma_mr_recycle(mr);
 
 	if (mr->mr_dir != DMA_NONE) {
@@ -107,18 +109,6 @@ static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 	frwr_release_mr(mr);
 }
 
-/* MRs are dynamically allocated, so simply clean up and release the MR.
- * A replacement MR will subsequently be allocated on demand.
- */
-static void
-frwr_mr_recycle_worker(struct work_struct *work)
-{
-	struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
-					     mr_recycle);
-
-	frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
 /* frwr_reset - Place MRs back on the free list
  * @req: request to reset
  *
@@ -163,7 +153,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 	mr->frwr.fr_mr = frmr;
 	mr->mr_dir = DMA_NONE;
 	INIT_LIST_HEAD(&mr->mr_list);
-	INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
 	init_completion(&mr->frwr.fr_linv_done);
 
 	sg_init_table(sg, depth);
@@ -448,7 +437,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
 {
 	if (wc->status != IB_WC_SUCCESS)
-		rpcrdma_mr_recycle(mr);
+		frwr_mr_recycle(mr);
 	else
 		rpcrdma_mr_put(mr);
 }
@@ -570,7 +559,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		bad_wr = bad_wr->next;
 
 		list_del_init(&mr->mr_list);
-		rpcrdma_mr_recycle(mr);
+		frwr_mr_recycle(mr);
 	}
 }
 
@@ -664,7 +653,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		mr = container_of(frwr, struct rpcrdma_mr, frwr);
 		bad_wr = bad_wr->next;
 
-		rpcrdma_mr_recycle(mr);
+		frwr_mr_recycle(mr);
 	}
 
 	/* The final LOCAL_INV WR in the chain is supposed to
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a7ef965..b8e768d 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -257,7 +257,6 @@ struct rpcrdma_mr {
 	u32			mr_handle;
 	u32			mr_length;
 	u64			mr_offset;
-	struct work_struct	mr_recycle;
 	struct list_head	mr_all;
 };
 
@@ -490,12 +489,6 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
 void rpcrdma_mr_put(struct rpcrdma_mr *mr);
 void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
 
-static inline void
-rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
-{
-	schedule_work(&mr->mr_recycle);
-}
-
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
 			struct rpcrdma_req *req);


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v1 2/6] xprtrdma: Remove rpcrdma_sendctx::sc_xprt
  2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 1/6] xprtrdma: Ensure ri_id is stable during MR recycling Chuck Lever
@ 2019-10-17 18:31 ` Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 3/6] xprtrdma: Remove rpcrdma_sendctx::sc_device Chuck Lever
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

Micro-optimization: Save eight bytes in a frequently allocated
structure.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/verbs.c     |   19 ++++++++++---------
 net/sunrpc/xprtrdma/xprt_rdma.h |    2 --
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c79d862..3ab086a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -74,7 +74,8 @@
 /*
  * internal functions
  */
-static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+				       struct rpcrdma_sendctx *sc);
 static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf);
 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
@@ -124,7 +125,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
 
 /**
  * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
- * @cq:	completion queue (ignored)
+ * @cq:	completion queue
  * @wc:	completed WR
  *
  */
@@ -137,7 +138,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	trace_xprtrdma_wc_send(sc, wc);
-	rpcrdma_sendctx_put_locked(sc);
+	rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
 }
 
 /**
@@ -518,7 +519,7 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	init_waitqueue_head(&ep->rep_connect_wait);
 	ep->rep_receive_count = 0;
 
-	sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
+	sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
 				 ep->rep_attr.cap.max_send_wr + 1,
 				 IB_POLL_WORKQUEUE);
 	if (IS_ERR(sendcq)) {
@@ -840,7 +841,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
 		if (!sc)
 			return -ENOMEM;
 
-		sc->sc_xprt = r_xprt;
 		buf->rb_sc_ctxs[i] = sc;
 	}
 
@@ -903,6 +903,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
 
 /**
  * rpcrdma_sendctx_put_locked - Release a send context
+ * @r_xprt: controlling transport instance
  * @sc: send context to release
  *
  * Usage: Called from Send completion to return a sendctxt
@@ -910,10 +911,10 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
  *
  * The caller serializes calls to this function (per transport).
  */
-static void
-rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+				       struct rpcrdma_sendctx *sc)
 {
-	struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	unsigned long next_tail;
 
 	/* Unmap SGEs of previously completed but unsignaled
@@ -931,7 +932,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
 	/* Paired with READ_ONCE */
 	smp_store_release(&buf->rb_sc_tail, next_tail);
 
-	xprt_write_space(&sc->sc_xprt->rx_xprt);
+	xprt_write_space(&r_xprt->rx_xprt);
 }
 
 static void
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b8e768d..4897c09 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -218,12 +218,10 @@ enum {
 /* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
  */
 struct rpcrdma_req;
-struct rpcrdma_xprt;
 struct rpcrdma_sendctx {
 	struct ib_send_wr	sc_wr;
 	struct ib_cqe		sc_cqe;
 	struct ib_device	*sc_device;
-	struct rpcrdma_xprt	*sc_xprt;
 	struct rpcrdma_req	*sc_req;
 	unsigned int		sc_unmap_count;
 	struct ib_sge		sc_sges[];


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v1 3/6] xprtrdma: Remove rpcrdma_sendctx::sc_device
  2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 1/6] xprtrdma: Ensure ri_id is stable during MR recycling Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 2/6] xprtrdma: Remove rpcrdma_sendctx::sc_xprt Chuck Lever
@ 2019-10-17 18:31 ` Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 4/6] xprtrdma: Move the rpcrdma_sendctx::sc_wr field Chuck Lever
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

Micro-optimization: Save eight bytes in a frequently allocated
structure.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ++--
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 7b13582..1941b22 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -564,6 +564,7 @@ static void rpcrdma_sendctx_done(struct kref *kref)
  */
 void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
 {
+	struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
 	struct ib_sge *sge;
 
 	if (!sc->sc_unmap_count)
@@ -575,7 +576,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
 	 */
 	for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
 	     ++sge, --sc->sc_unmap_count)
-		ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
+		ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
 				  DMA_TO_DEVICE);
 
 	kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
@@ -625,7 +626,6 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
 	 */
 	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
 		goto out_regbuf;
-	sc->sc_device = rdmab_device(rb);
 	sge_no = 1;
 	sge[sge_no].addr = rdmab_addr(rb);
 	sge[sge_no].length = xdr->head[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 4897c09..0e5b7f3 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -221,7 +221,6 @@ enum {
 struct rpcrdma_sendctx {
 	struct ib_send_wr	sc_wr;
 	struct ib_cqe		sc_cqe;
-	struct ib_device	*sc_device;
 	struct rpcrdma_req	*sc_req;
 	unsigned int		sc_unmap_count;
 	struct ib_sge		sc_sges[];


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v1 4/6] xprtrdma: Move the rpcrdma_sendctx::sc_wr field
  2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
                   ` (2 preceding siblings ...)
  2019-10-17 18:31 ` [PATCH v1 3/6] xprtrdma: Remove rpcrdma_sendctx::sc_device Chuck Lever
@ 2019-10-17 18:31 ` Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 5/6] xprtrdma: Refactor rpcrdma_prepare_msg_sges() Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 6/6] xprtrdma: Pull up sometimes Chuck Lever
  5 siblings, 0 replies; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

Clean up: This field is not needed in the Send completion handler,
so it can be moved to struct rpcrdma_req to reduce the size of
struct rpcrdma_sendctx, and to reduce the amount of memory that
is sloshed between the sending process and the Send completion
process.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |    2 +-
 net/sunrpc/xprtrdma/rpc_rdma.c  |    9 ++++++---
 net/sunrpc/xprtrdma/verbs.c     |    5 +----
 net/sunrpc/xprtrdma/xprt_rdma.h |    2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 31681cb..16572ae7 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -667,9 +667,8 @@
 		__entry->client_id = rqst->rq_task->tk_client ?
 				     rqst->rq_task->tk_client->cl_clid : -1;
 		__entry->req = req;
-		__entry->num_sge = req->rl_sendctx->sc_wr.num_sge;
-		__entry->signaled = req->rl_sendctx->sc_wr.send_flags &
-				    IB_SEND_SIGNALED;
+		__entry->num_sge = req->rl_wr.num_sge;
+		__entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED;
 		__entry->status = status;
 	),
 
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 5cd8715..523722b 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -396,7 +396,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 	struct ib_send_wr *post_wr;
 	struct rpcrdma_mr *mr;
 
-	post_wr = &req->rl_sendctx->sc_wr;
+	post_wr = &req->rl_wr;
 	list_for_each_entry(mr, &req->rl_registered, mr_list) {
 		struct rpcrdma_frwr *frwr;
 
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 1941b22..53cd2e3 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -599,7 +599,7 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
 
 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
 				      DMA_TO_DEVICE);
-	sc->sc_wr.num_sge++;
+	req->rl_wr.num_sge++;
 	return true;
 
 out_regbuf:
@@ -711,7 +711,7 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
 	}
 
 out:
-	sc->sc_wr.num_sge += sge_no;
+	req->rl_wr.num_sge += sge_no;
 	if (sc->sc_unmap_count)
 		kref_get(&req->rl_kref);
 	return true;
@@ -752,10 +752,13 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
 	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
 	if (!req->rl_sendctx)
 		goto err;
-	req->rl_sendctx->sc_wr.num_sge = 0;
 	req->rl_sendctx->sc_unmap_count = 0;
 	req->rl_sendctx->sc_req = req;
 	kref_init(&req->rl_kref);
+	req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
+	req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
+	req->rl_wr.num_sge = 0;
+	req->rl_wr.opcode = IB_WR_SEND;
 
 	ret = -EIO;
 	if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3ab086a..2f46582 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -811,9 +811,6 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
 	if (!sc)
 		return NULL;
 
-	sc->sc_wr.wr_cqe = &sc->sc_cqe;
-	sc->sc_wr.sg_list = sc->sc_sges;
-	sc->sc_wr.opcode = IB_WR_SEND;
 	sc->sc_cqe.done = rpcrdma_wc_send;
 	return sc;
 }
@@ -1469,7 +1466,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_req *req)
 {
-	struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
+	struct ib_send_wr *send_wr = &req->rl_wr;
 	int rc;
 
 	if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0e5b7f3..cdd6a3d 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -219,7 +219,6 @@ enum {
  */
 struct rpcrdma_req;
 struct rpcrdma_sendctx {
-	struct ib_send_wr	sc_wr;
 	struct ib_cqe		sc_cqe;
 	struct rpcrdma_req	*sc_req;
 	unsigned int		sc_unmap_count;
@@ -314,6 +313,7 @@ struct rpcrdma_req {
 	struct rpcrdma_rep	*rl_reply;
 	struct xdr_stream	rl_stream;
 	struct xdr_buf		rl_hdrbuf;
+	struct ib_send_wr	rl_wr;
 	struct rpcrdma_sendctx	*rl_sendctx;
 	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
 	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v1 5/6] xprtrdma: Refactor rpcrdma_prepare_msg_sges()
  2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
                   ` (3 preceding siblings ...)
  2019-10-17 18:31 ` [PATCH v1 4/6] xprtrdma: Move the rpcrdma_sendctx::sc_wr field Chuck Lever
@ 2019-10-17 18:31 ` Chuck Lever
  2019-10-17 18:31 ` [PATCH v1 6/6] xprtrdma: Pull up sometimes Chuck Lever
  5 siblings, 0 replies; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

Refactor: Replace spaghetti with code that makes it plain what needs
to be done for each rtype. This makes it easier to add features and
optimizations.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c |  263 ++++++++++++++++++++++------------------
 1 file changed, 146 insertions(+), 117 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 53cd2e3..a441dbf 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -589,148 +589,162 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
 {
 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
 	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
-	struct ib_sge *sge = sc->sc_sges;
+	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
 
 	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
-		goto out_regbuf;
+		return false;
 	sge->addr = rdmab_addr(rb);
 	sge->length = len;
 	sge->lkey = rdmab_lkey(rb);
 
 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
 				      DMA_TO_DEVICE);
-	req->rl_wr.num_sge++;
 	return true;
-
-out_regbuf:
-	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
-	return false;
 }
 
-/* Prepare the Send SGEs. The head and tail iovec, and each entry
- * in the page list, gets its own SGE.
+/* The head iovec is straightforward, as it is usually already
+ * DMA-mapped. Sync the content that has changed.
  */
-static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
-				     struct rpcrdma_req *req,
-				     struct xdr_buf *xdr,
-				     enum rpcrdma_chunktype rtype)
+static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
+				     struct rpcrdma_req *req, unsigned int len)
 {
 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
-	unsigned int sge_no, page_base, len, remaining;
+	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
 	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
-	struct ib_sge *sge = sc->sc_sges;
-	struct page *page, **ppages;
 
-	/* The head iovec is straightforward, as it is already
-	 * DMA-mapped. Sync the content that has changed.
-	 */
 	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
-		goto out_regbuf;
-	sge_no = 1;
-	sge[sge_no].addr = rdmab_addr(rb);
-	sge[sge_no].length = xdr->head[0].iov_len;
-	sge[sge_no].lkey = rdmab_lkey(rb);
-	ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
-				      sge[sge_no].length, DMA_TO_DEVICE);
-
-	/* If there is a Read chunk, the page list is being handled
-	 * via explicit RDMA, and thus is skipped here. However, the
-	 * tail iovec may include an XDR pad for the page list, as
-	 * well as additional content, and may not reside in the
-	 * same page as the head iovec.
-	 */
-	if (rtype == rpcrdma_readch) {
-		len = xdr->tail[0].iov_len;
+		return false;
 
-		/* Do not include the tail if it is only an XDR pad */
-		if (len < 4)
-			goto out;
+	sge->addr = rdmab_addr(rb);
+	sge->length = len;
+	sge->lkey = rdmab_lkey(rb);
 
-		page = virt_to_page(xdr->tail[0].iov_base);
-		page_base = offset_in_page(xdr->tail[0].iov_base);
+	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+				      DMA_TO_DEVICE);
+	return true;
+}
 
-		/* If the content in the page list is an odd length,
-		 * xdr_write_pages() has added a pad at the beginning
-		 * of the tail iovec. Force the tail's non-pad content
-		 * to land at the next XDR position in the Send message.
-		 */
-		page_base += len & 3;
-		len -= len & 3;
-		goto map_tail;
-	}
+/* If there is a page list present, DMA map and prepare an
+ * SGE for each page to be sent.
+ */
+static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
+				     struct xdr_buf *xdr)
+{
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+	unsigned int page_base, len, remaining;
+	struct page **ppages;
+	struct ib_sge *sge;
 
-	/* If there is a page list present, temporarily DMA map
-	 * and prepare an SGE for each page to be sent.
-	 */
-	if (xdr->page_len) {
-		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
-		page_base = offset_in_page(xdr->page_base);
-		remaining = xdr->page_len;
-		while (remaining) {
-			sge_no++;
-			if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
-				goto out_mapping_overflow;
-
-			len = min_t(u32, PAGE_SIZE - page_base, remaining);
-			sge[sge_no].addr =
-				ib_dma_map_page(rdmab_device(rb), *ppages,
-						page_base, len, DMA_TO_DEVICE);
-			if (ib_dma_mapping_error(rdmab_device(rb),
-						 sge[sge_no].addr))
-				goto out_mapping_err;
-			sge[sge_no].length = len;
-			sge[sge_no].lkey = rdmab_lkey(rb);
-
-			sc->sc_unmap_count++;
-			ppages++;
-			remaining -= len;
-			page_base = 0;
-		}
-	}
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	page_base = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		sge = &sc->sc_sges[req->rl_wr.num_sge++];
+		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+		sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
+					    page_base, len, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+			goto out_mapping_err;
 
-	/* The tail iovec is not always constructed in the same
-	 * page where the head iovec resides (see, for example,
-	 * gss_wrap_req_priv). To neatly accommodate that case,
-	 * DMA map it separately.
-	 */
-	if (xdr->tail[0].iov_len) {
-		page = virt_to_page(xdr->tail[0].iov_base);
-		page_base = offset_in_page(xdr->tail[0].iov_base);
-		len = xdr->tail[0].iov_len;
+		sge->length = len;
+		sge->lkey = rdmab_lkey(rb);
 
-map_tail:
-		sge_no++;
-		sge[sge_no].addr =
-			ib_dma_map_page(rdmab_device(rb), page, page_base, len,
-					DMA_TO_DEVICE);
-		if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
-			goto out_mapping_err;
-		sge[sge_no].length = len;
-		sge[sge_no].lkey = rdmab_lkey(rb);
 		sc->sc_unmap_count++;
+		ppages++;
+		remaining -= len;
+		page_base = 0;
 	}
 
-out:
-	req->rl_wr.num_sge += sge_no;
-	if (sc->sc_unmap_count)
-		kref_get(&req->rl_kref);
 	return true;
 
-out_regbuf:
-	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+out_mapping_err:
+	trace_xprtrdma_dma_maperr(sge->addr);
 	return false;
+}
 
-out_mapping_overflow:
-	rpcrdma_sendctx_unmap(sc);
-	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
-	return false;
+/* The tail iovec may include an XDR pad for the page list,
+ * as well as additional content, and may not reside in the
+ * same page as the head iovec.
+ */
+static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
+				     struct xdr_buf *xdr,
+				     unsigned int page_base, unsigned int len)
+{
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
+	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+	struct page *page = virt_to_page(xdr->tail[0].iov_base);
+
+	sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+				    DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+		goto out_mapping_err;
+
+	sge->length = len;
+	sge->lkey = rdmab_lkey(rb);
+	++sc->sc_unmap_count;
+	return true;
 
 out_mapping_err:
-	rpcrdma_sendctx_unmap(sc);
-	trace_xprtrdma_dma_maperr(sge[sge_no].addr);
+	trace_xprtrdma_dma_maperr(sge->addr);
 	return false;
 }
 
+static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
+					struct rpcrdma_req *req,
+					struct xdr_buf *xdr)
+{
+	struct kvec *tail = &xdr->tail[0];
+
+	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+		return false;
+	if (xdr->page_len)
+		if (!rpcrdma_prepare_pagelist(req, xdr))
+			return false;
+	if (tail->iov_len)
+		if (!rpcrdma_prepare_tail_iov(req, xdr,
+					      offset_in_page(tail->iov_base),
+					      tail->iov_len))
+			return false;
+
+	if (req->rl_sendctx->sc_unmap_count)
+		kref_get(&req->rl_kref);
+	return true;
+}
+
+static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
+				   struct rpcrdma_req *req,
+				   struct xdr_buf *xdr)
+{
+	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+		return false;
+
+	/* If there is a Read chunk, the page list is being handled
+	 * via explicit RDMA, and thus is skipped here.
+	 */
+
+	/* Do not include the tail if it is only an XDR pad */
+	if (xdr->tail[0].iov_len > 3) {
+		unsigned int page_base, len;
+
+		/* If the content in the page list is an odd length,
+		 * xdr_write_pages() adds a pad at the beginning of
+		 * the tail iovec. Force the tail's non-pad content to
+		 * land at the next XDR position in the Send message.
+		 */
+		page_base = offset_in_page(xdr->tail[0].iov_base);
+		len = xdr->tail[0].iov_len;
+		page_base += len & 3;
+		len -= len & 3;
+		if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
+			return false;
+		kref_get(&req->rl_kref);
+	}
+
+	return true;
+}
+
 /**
  * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
  * @r_xprt: controlling transport
@@ -741,17 +755,17 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
  *
  * Returns 0 on success; otherwise a negative errno is returned.
  */
-int
-rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
-			  struct rpcrdma_req *req, u32 hdrlen,
-			  struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+				     struct rpcrdma_req *req, u32 hdrlen,
+				     struct xdr_buf *xdr,
+				     enum rpcrdma_chunktype rtype)
 {
 	int ret;
 
 	ret = -EAGAIN;
 	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
 	if (!req->rl_sendctx)
-		goto err;
+		goto out_nosc;
 	req->rl_sendctx->sc_unmap_count = 0;
 	req->rl_sendctx->sc_req = req;
 	kref_init(&req->rl_kref);
@@ -762,13 +776,28 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
 
 	ret = -EIO;
 	if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
-		goto err;
-	if (rtype != rpcrdma_areadch)
-		if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
-			goto err;
+		goto out_unmap;
+
+	switch (rtype) {
+	case rpcrdma_noch:
+		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
+			goto out_unmap;
+		break;
+	case rpcrdma_readch:
+		if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
+			goto out_unmap;
+		break;
+	case rpcrdma_areadch:
+		break;
+	default:
+		goto out_unmap;
+	}
+
 	return 0;
 
-err:
+out_unmap:
+	rpcrdma_sendctx_unmap(req->rl_sendctx);
+out_nosc:
 	trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
 	return ret;
 }


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v1 6/6] xprtrdma: Pull up sometimes
  2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
                   ` (4 preceding siblings ...)
  2019-10-17 18:31 ` [PATCH v1 5/6] xprtrdma: Refactor rpcrdma_prepare_msg_sges() Chuck Lever
@ 2019-10-17 18:31 ` Chuck Lever
  2019-10-18 20:17   ` Tom Talpey
  5 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2019-10-17 18:31 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

On some platforms, DMA mapping part of a page is more costly than
copying bytes. Restore the pull-up code and use that when we
think it's going to be faster. The heuristic for now is to pull-up
when the size of the RPC message body fits in the buffer underlying
the head iovec.

Indeed, not involving the I/O MMU can help the RPC/RDMA transport
scale better for tiny I/Os across more RDMA devices. This is because
interaction with the I/O MMU is eliminated, as is handling a Send
completion, for each of these small I/Os. Without the explicit
unmapping, the NIC no longer needs to do a costly internal TLB shoot
down for buffers that are just a handful of bytes.
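
In code, the heuristic comes down to a single test in the marshaling
path (see the rpc_rdma.c hunk below, where buf is &rqst->rq_snd_buf):

	rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
		rpcrdma_noch_pullup : rpcrdma_noch_mapped;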

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/backchannel.c |    2 -
 net/sunrpc/xprtrdma/rpc_rdma.c    |   82 +++++++++++++++++++++++++++++++++++--
 net/sunrpc/xprtrdma/verbs.c       |    2 -
 net/sunrpc/xprtrdma/xprt_rdma.h   |    2 +
 4 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 16572ae7..336a65d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -532,6 +532,8 @@
 DEFINE_WRCH_EVENT(reply);
 
 TRACE_DEFINE_ENUM(rpcrdma_noch);
+TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
+TRACE_DEFINE_ENUM(rpcrdma_noch_mapped);
 TRACE_DEFINE_ENUM(rpcrdma_readch);
 TRACE_DEFINE_ENUM(rpcrdma_areadch);
 TRACE_DEFINE_ENUM(rpcrdma_writech);
@@ -540,6 +542,8 @@
 #define xprtrdma_show_chunktype(x)					\
 		__print_symbolic(x,					\
 				{ rpcrdma_noch, "inline" },		\
+				{ rpcrdma_noch_pullup, "pullup" },	\
+				{ rpcrdma_noch_mapped, "mapped" },	\
 				{ rpcrdma_readch, "read list" },	\
 				{ rpcrdma_areadch, "*read list" },	\
 				{ rpcrdma_writech, "write list" },	\
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 50e075f..1168524 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 	*p = xdr_zero;
 
 	if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
-				      &rqst->rq_snd_buf, rpcrdma_noch))
+				      &rqst->rq_snd_buf, rpcrdma_noch_pullup))
 		return -EIO;
 
 	trace_xprtrdma_cb_reply(rqst);
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a441dbf..4ad8889 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 	unsigned int pos;
 	int nsegs;
 
-	if (rtype == rpcrdma_noch)
+	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
 		goto done;
 
 	pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -691,6 +691,72 @@ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
 	return false;
 }
 
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req,
+				    struct xdr_buf *xdr)
+{
+	unsigned char *dst;
+
+	dst = (unsigned char *)xdr->head[0].iov_base;
+	dst += xdr->head[0].iov_len + xdr->page_len;
+	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
+
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req,
+				    struct xdr_buf *xdr)
+{
+	unsigned int len, page_base, remaining;
+	struct page **ppages;
+	unsigned char *src, *dst;
+
+	dst = (unsigned char *)xdr->head[0].iov_base;
+	dst += xdr->head[0].iov_len;
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	page_base = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		src = page_address(*ppages);
+		src += page_base;
+		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+		memcpy(dst, src, len);
+		r_xprt->rx_stats.pullup_copy_count += len;
+
+		ppages++;
+		dst += len;
+		remaining -= len;
+		page_base = 0;
+	}
+}
+
+/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
+ * When the head, pagelist, and tail are small, a pull-up copy
+ * is considerably less costly than DMA mapping the components
+ * of @xdr.
+ *
+ * Assumptions:
+ *  - the caller has already verified that the total length
+ *    of the RPC Call body will fit into @rl_sendbuf.
+ */
+static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
+					struct rpcrdma_req *req,
+					struct xdr_buf *xdr)
+{
+	if (unlikely(xdr->tail[0].iov_len))
+		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
+
+	if (unlikely(xdr->page_len))
+		rpcrdma_pullup_pagelist(r_xprt, req, xdr);
+
+	/* The whole RPC message resides in the head iovec now */
+	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
+}
+
 static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
 					struct rpcrdma_req *req,
 					struct xdr_buf *xdr)
@@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 		goto out_unmap;
 
 	switch (rtype) {
-	case rpcrdma_noch:
+	case rpcrdma_noch_pullup:
+		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
+			goto out_unmap;
+		break;
+	case rpcrdma_noch_mapped:
 		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
 			goto out_unmap;
 		break;
@@ -827,6 +897,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct xdr_stream *xdr = &req->rl_stream;
 	enum rpcrdma_chunktype rtype, wtype;
+	struct xdr_buf *buf = &rqst->rq_snd_buf;
 	bool ddp_allowed;
 	__be32 *p;
 	int ret;
@@ -884,8 +955,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 	 */
 	if (rpcrdma_args_inline(r_xprt, rqst)) {
 		*p++ = rdma_msg;
-		rtype = rpcrdma_noch;
-	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
+			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
+	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
 		*p++ = rdma_msg;
 		rtype = rpcrdma_readch;
 	} else {
@@ -927,7 +999,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 		goto out_err;
 
 	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
-					&rqst->rq_snd_buf, rtype);
+					buf, rtype);
 	if (ret)
 		goto out_err;
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 2f46582..a514e2c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	for (i = 0; i < buf->rb_max_requests; i++) {
 		struct rpcrdma_req *req;
 
-		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
+		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
 					 GFP_KERNEL);
 		if (!req)
 			goto out;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cdd6a3d..5d15140 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -554,6 +554,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 
 enum rpcrdma_chunktype {
 	rpcrdma_noch = 0,
+	rpcrdma_noch_pullup,
+	rpcrdma_noch_mapped,
 	rpcrdma_readch,
 	rpcrdma_areadch,
 	rpcrdma_writech,


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 6/6] xprtrdma: Pull up sometimes
  2019-10-17 18:31 ` [PATCH v1 6/6] xprtrdma: Pull up sometimes Chuck Lever
@ 2019-10-18 20:17   ` Tom Talpey
  2019-10-18 23:34     ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: Tom Talpey @ 2019-10-18 20:17 UTC (permalink / raw)
  To: Chuck Lever, linux-rdma, linux-nfs

On 10/17/2019 2:31 PM, Chuck Lever wrote:
> On some platforms, DMA mapping part of a page is more costly than
> copying bytes. Restore the pull-up code and use that when we
> think it's going to be faster. The heuristic for now is to pull-up
> when the size of the RPC message body fits in the buffer underlying
> the head iovec.
> 
> Indeed, not involving the I/O MMU can help the RPC/RDMA transport
> scale better for tiny I/Os across more RDMA devices. This is because
> interaction with the I/O MMU is eliminated, as is handling a Send
> completion, for each of these small I/Os. Without the explicit
> unmapping, the NIC no longer needs to do a costly internal TLB shoot
> down for buffers that are just a handful of bytes.

This is good stuff. Do you have any performance data for the new
strategy, especially latencies and local CPU cycles per byte?

Tom.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 6/6] xprtrdma: Pull up sometimes
  2019-10-18 20:17   ` Tom Talpey
@ 2019-10-18 23:34     ` Chuck Lever
  2019-10-19 16:36       ` Tom Talpey
  0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2019-10-18 23:34 UTC (permalink / raw)
  To: Tom Talpey; +Cc: linux-rdma, Linux NFS Mailing List

Hi Tom-

> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote:
> 
> On 10/17/2019 2:31 PM, Chuck Lever wrote:
>> On some platforms, DMA mapping part of a page is more costly than
>> copying bytes. Restore the pull-up code and use that when we
>> think it's going to be faster. The heuristic for now is to pull-up
>> when the size of the RPC message body fits in the buffer underlying
>> the head iovec.
>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport
>> scale better for tiny I/Os across more RDMA devices. This is because
>> interaction with the I/O MMU is eliminated, as is handling a Send
>> completion, for each of these small I/Os. Without the explicit
>> unmapping, the NIC no longer needs to do a costly internal TLB shoot
>> down for buffers that are just a handful of bytes.
> 
> This is good stuff. Do you have any performance data for the new
> strategy, especially latencies and local CPU cycles per byte?

Saves almost a microsecond of RT latency on my NFS client that uses
a real Intel IOMMU. On my other NFS client, the DMA map operations
are always a no-op. This savings applies only to NFS WRITE, of course.

I don't have a good benchmark for cycles per byte. Do you have any
suggestions? Not sure how I would account for cycles spent handling
Send completions, for example.


> Tom.
> 

--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 6/6] xprtrdma: Pull up sometimes
  2019-10-18 23:34     ` Chuck Lever
@ 2019-10-19 16:36       ` Tom Talpey
  2019-10-22 16:08         ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: Tom Talpey @ 2019-10-19 16:36 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On 10/18/2019 7:34 PM, Chuck Lever wrote:
> Hi Tom-
> 
>> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote:
>>
>> On 10/17/2019 2:31 PM, Chuck Lever wrote:
>>> On some platforms, DMA mapping part of a page is more costly than
>>> copying bytes. Restore the pull-up code and use that when we
>>> think it's going to be faster. The heuristic for now is to pull-up
>>> when the size of the RPC message body fits in the buffer underlying
>>> the head iovec.
>>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport
>>> scale better for tiny I/Os across more RDMA devices. This is because
>>> interaction with the I/O MMU is eliminated, as is handling a Send
>>> completion, for each of these small I/Os. Without the explicit
>>> unmapping, the NIC no longer needs to do a costly internal TLB shoot
>>> down for buffers that are just a handful of bytes.
>>
>> This is good stuff. Do you have any performance data for the new
>> strategy, especially latencies and local CPU cycles per byte?
> 
> Saves almost a microsecond of RT latency on my NFS client that uses
> a real Intel IOMMU. On my other NFS client, the DMA map operations
> are always a no-op. This savings applies only to NFS WRITE, of course.
> 
> I don't have a good benchmark for cycles per byte. Do you have any
> suggestions? Not sure how I would account for cycles spent handling
> Send completions, for example.

Cycles per byte is fairly simple, but like all performance measurement
the trick is in the setup. Because of platform variations, it's best
to compare results on the same hardware; the absolute value isn't as
meaningful. Here's a rough sketch of one approach.

- Configure BIOS and OS to hold CPU frequency constant:
   - ACPI C-states off
   - Turbo mode off
   - Power management off (OS needs this too)
   - Anything else relevant to clock variation
- Hyperthreading off
   - (hyperthreads don't add work linearly)
- Calculate core count X clock frequency
   - (e.g. 8 X 3GHz = 24G cycles/sec)

Now, use a benchmark which runs the desired workload and reports %CPU.
For a given interval, record the total bytes transferred, time spent,
and CPU load. (e.g. 100GB, 100 sec, 20%).

Finally, compute CpB (the per-second terms cancel out):
  20% x 24 Gcps = 4.8 Gcps
  100 GB / 100 s = 1 GBps
  4.8 Gcps / 1 GBps = 4.8 cpb

Like I said, it's rough, but surprisingly telling. A similar metric
is cycles per IOP, and since you're focusing on small i/o with this
change, it might also be an interesting calculation. Simply replace
total bytes/sec with IOPS.
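
For reference, the arithmetic above as a minimal, self-contained C
program, using the example figures from this sketch:

	#include <stdio.h>

	int main(void)
	{
		double cores = 8, hz = 3e9;	/* 8 cores at 3 GHz */
		double cpu_load = 0.20;		/* 20% CPU during the run */
		double bytes = 100e9;		/* 100 GB transferred */
		double seconds = 100;		/* over 100 seconds */

		double cycles_per_sec = cpu_load * cores * hz;
		double bytes_per_sec = bytes / seconds;

		/* The per-second terms cancel, leaving cycles per byte. */
		printf("%.1f cycles per byte\n",
		       cycles_per_sec / bytes_per_sec);
		return 0;
	}

For cycles per IOP, divide cycles_per_sec by the measured IOPS instead.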

Tom.


>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>>   net/sunrpc/xprtrdma/backchannel.c |    2 -
>>>   net/sunrpc/xprtrdma/rpc_rdma.c    |   82 +++++++++++++++++++++++++++++++++++--
>>>   net/sunrpc/xprtrdma/verbs.c       |    2 -
>>>   net/sunrpc/xprtrdma/xprt_rdma.h   |    2 +
>>>   4 files changed, 81 insertions(+), 7 deletions(-)
>>> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
>>> index 16572ae7..336a65d 100644
>>> --- a/include/trace/events/rpcrdma.h
>>> +++ b/include/trace/events/rpcrdma.h
>>> @@ -532,6 +532,8 @@
>>>   DEFINE_WRCH_EVENT(reply);
>>>     TRACE_DEFINE_ENUM(rpcrdma_noch);
>>> +TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
>>> +TRACE_DEFINE_ENUM(rpcrdma_noch_mapped);
>>>   TRACE_DEFINE_ENUM(rpcrdma_readch);
>>>   TRACE_DEFINE_ENUM(rpcrdma_areadch);
>>>   TRACE_DEFINE_ENUM(rpcrdma_writech);
>>> @@ -540,6 +542,8 @@
>>>   #define xprtrdma_show_chunktype(x)					\
>>>   		__print_symbolic(x,					\
>>>   				{ rpcrdma_noch, "inline" },		\
>>> +				{ rpcrdma_noch_pullup, "pullup" },	\
>>> +				{ rpcrdma_noch_mapped, "mapped" },	\
>>>   				{ rpcrdma_readch, "read list" },	\
>>>   				{ rpcrdma_areadch, "*read list" },	\
>>>   				{ rpcrdma_writech, "write list" },	\
>>> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
>>> index 50e075f..1168524 100644
>>> --- a/net/sunrpc/xprtrdma/backchannel.c
>>> +++ b/net/sunrpc/xprtrdma/backchannel.c
>>> @@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
>>>   	*p = xdr_zero;
>>>     	if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
>>> -				      &rqst->rq_snd_buf, rpcrdma_noch))
>>> +				      &rqst->rq_snd_buf, rpcrdma_noch_pullup))
>>>   		return -EIO;
>>>     	trace_xprtrdma_cb_reply(rqst);
>>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
>>> index a441dbf..4ad8889 100644
>>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>>> @@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
>>>   	unsigned int pos;
>>>   	int nsegs;
>>>   -	if (rtype == rpcrdma_noch)
>>> +	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
>>>   		goto done;
>>>     	pos = rqst->rq_snd_buf.head[0].iov_len;
>>> @@ -691,6 +691,72 @@ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
>>>   	return false;
>>>   }
>>>   +/* Copy the tail to the end of the head buffer.
>>> + */
>>> +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
>>> +				    struct rpcrdma_req *req,
>>> +				    struct xdr_buf *xdr)
>>> +{
>>> +	unsigned char *dst;
>>> +
>>> +	dst = (unsigned char *)xdr->head[0].iov_base;
>>> +	dst += xdr->head[0].iov_len + xdr->page_len;
>>> +	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
>>> +	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
>>> +}
>>> +
>>> +/* Copy pagelist content into the head buffer.
>>> + */
>>> +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
>>> +				    struct rpcrdma_req *req,
>>> +				    struct xdr_buf *xdr)
>>> +{
>>> +	unsigned int len, page_base, remaining;
>>> +	struct page **ppages;
>>> +	unsigned char *src, *dst;
>>> +
>>> +	dst = (unsigned char *)xdr->head[0].iov_base;
>>> +	dst += xdr->head[0].iov_len;
>>> +	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
>>> +	page_base = offset_in_page(xdr->page_base);
>>> +	remaining = xdr->page_len;
>>> +	while (remaining) {
>>> +		src = page_address(*ppages);
>>> +		src += page_base;
>>> +		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
>>> +		memcpy(dst, src, len);
>>> +		r_xprt->rx_stats.pullup_copy_count += len;
>>> +
>>> +		ppages++;
>>> +		dst += len;
>>> +		remaining -= len;
>>> +		page_base = 0;
>>> +	}
>>> +}
>>> +
>>> +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
>>> + * When the head, pagelist, and tail are small, a pull-up copy
>>> + * is considerably less costly than DMA mapping the components
>>> + * of @xdr.
>>> + *
>>> + * Assumptions:
>>> + *  - the caller has already verified that the total length
>>> + *    of the RPC Call body will fit into @rl_sendbuf.
>>> + */
>>> +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
>>> +					struct rpcrdma_req *req,
>>> +					struct xdr_buf *xdr)
>>> +{
>>> +	if (unlikely(xdr->tail[0].iov_len))
>>> +		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
>>> +
>>> +	if (unlikely(xdr->page_len))
>>> +		rpcrdma_pullup_pagelist(r_xprt, req, xdr);
>>> +
>>> +	/* The whole RPC message resides in the head iovec now */
>>> +	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
>>> +}
>>> +
>>>   static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
>>>   					struct rpcrdma_req *req,
>>>   					struct xdr_buf *xdr)
>>> @@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
>>>   		goto out_unmap;
>>>     	switch (rtype) {
>>> -	case rpcrdma_noch:
>>> +	case rpcrdma_noch_pullup:
>>> +		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
>>> +			goto out_unmap;
>>> +		break;
>>> +	case rpcrdma_noch_mapped:
>>>   		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
>>>   			goto out_unmap;
>>>   		break;
>>> @@ -827,6 +897,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
>>>   	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
>>>   	struct xdr_stream *xdr = &req->rl_stream;
>>>   	enum rpcrdma_chunktype rtype, wtype;
>>> +	struct xdr_buf *buf = &rqst->rq_snd_buf;
>>>   	bool ddp_allowed;
>>>   	__be32 *p;
>>>   	int ret;
>>> @@ -884,8 +955,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
>>>   	 */
>>>   	if (rpcrdma_args_inline(r_xprt, rqst)) {
>>>   		*p++ = rdma_msg;
>>> -		rtype = rpcrdma_noch;
>>> -	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
>>> +		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
>>> +			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
>>> +	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
>>>   		*p++ = rdma_msg;
>>>   		rtype = rpcrdma_readch;
>>>   	} else {
>>> @@ -927,7 +999,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
>>>   		goto out_err;
>>>     	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
>>> -					&rqst->rq_snd_buf, rtype);
>>> +					buf, rtype);
>>>   	if (ret)
>>>   		goto out_err;
>>>   diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>>> index 2f46582..a514e2c 100644
>>> --- a/net/sunrpc/xprtrdma/verbs.c
>>> +++ b/net/sunrpc/xprtrdma/verbs.c
>>> @@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
>>>   	for (i = 0; i < buf->rb_max_requests; i++) {
>>>   		struct rpcrdma_req *req;
>>>   -		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
>>> +		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
>>>   					 GFP_KERNEL);
>>>   		if (!req)
>>>   			goto out;
>>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> index cdd6a3d..5d15140 100644
>>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> @@ -554,6 +554,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
>>>     enum rpcrdma_chunktype {
>>>   	rpcrdma_noch = 0,
>>> +	rpcrdma_noch_pullup,
>>> +	rpcrdma_noch_mapped,
>>>   	rpcrdma_readch,
>>>   	rpcrdma_areadch,
>>>   	rpcrdma_writech,
> 
> --
> Chuck Lever
> 
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 6/6] xprtrdma: Pull up sometimes
  2019-10-19 16:36       ` Tom Talpey
@ 2019-10-22 16:08         ` Chuck Lever
  2019-10-26  2:04           ` Tom Talpey
  0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2019-10-22 16:08 UTC (permalink / raw)
  To: Tom Talpey; +Cc: linux-rdma, Linux NFS Mailing List



> On Oct 19, 2019, at 12:36 PM, Tom Talpey <tom@talpey.com> wrote:
> 
> On 10/18/2019 7:34 PM, Chuck Lever wrote:
>> Hi Tom-
>>> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote:
>>> 
>>> On 10/17/2019 2:31 PM, Chuck Lever wrote:
>>>> On some platforms, DMA mapping part of a page is more costly than
>>>> copying bytes. Restore the pull-up code and use that when we
>>>> think it's going to be faster. The heuristic for now is to pull-up
>>>> when the size of the RPC message body fits in the buffer underlying
>>>> the head iovec.
>>>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport
>>>> scale better for tiny I/Os across more RDMA devices. This is because
>>>> interaction with the I/O MMU is eliminated, as is handling a Send
>>>> completion, for each of these small I/Os. Without the explicit
>>>> unmapping, the NIC no longer needs to do a costly internal TLB shoot
>>>> down for buffers that are just a handful of bytes.
>>> 
>>> This is good stuff. Do you have any performance data for the new
>>> strategy, especially latencies and local CPU cycles per byte?
>> Saves almost a microsecond of RT latency on my NFS client that uses
>> a real Intel IOMMU. On my other NFS client, the DMA map operations
>> are always a no-op. This savings applies only to NFS WRITE, of course.
>> I don't have a good benchmark for cycles per byte. Do you have any
>> suggestions? Not sure how I would account for cycles spent handling
>> Send completions, for example.
> 
> Cycles per byte is fairly simple but like all performance measurement
> the trick is in the setup. Because of platform variations, it's best
> to compare results on the same hardware. The absolute value isn't as
> meaningful. Here's a rough sketch of one approach.
> 
> - Configure BIOS and OS to hold CPU frequency constant:
>  - ACPI C-states off
>  - Turbo mode off
>  - Power management off (OS needs this too)
>  - Anything else relevant to clock variation
> - Hyperthreading off
>  - (hyperthreads don't add work linearly)
> - Calculate core count X clock frequency
>  - (e.g. 8 X 3GHz = 24G cycles/sec)
> 
> Now, use a benchmark which runs the desired workload and reports %CPU.
> For a given interval, record the total bytes transferred, time spent,
> and CPU load. (e.g. 100GB, 100 sec, 20%).
> 
> Finally, compute CpB (the 1/sec terms cancel out):
> 20% x 24Gcps = 4.8G cps
> 100GB / 100s = 1 GBps
> 4.8Gcps / 1 GBps = 4.8cpb
> 
> Like I said, it's rough, but surprisingly telling. A similar metric
> is cycles per IOP, and since you're focusing on small i/o with this
> change, it might also be an interesting calculation. Simply replace
> total bytes/sec with IOPS.

Systems under test:

	• 12 Haswell cores x 1.6GHz = 19.2 billion cps
	• Server is exporting a tmpfs filesystem
	• Client and server using CX-3 Pro on 56Gbps InfiniBand
	• Kernel is v5.4-rc4
	• iozone -M -+u -i0 -i1 -s1g -r1k -t12 -I

The purpose of this test is to compare the two kernels, not to publish an absolute performance value. Both kernels below have a number of CPU-intensive debugging options enabled, which might tend to increase CPU cycles per byte or per I/O, and might also amplify the differences between the two kernels.



*** With DMA-mapping kernel (confirmed after test - total pull-up was zero bytes):

WRITE tests:

	• Write test: CPU Utilization: Wall time  496.136    CPU time  812.879    CPU utilization 163.84 %
	• Re-write test: CPU utilization: Wall time  500.266    CPU time  822.810    CPU utilization 164.47 %

Final mountstats results:

WRITE:
    25161863 ops (50%)
    avg bytes sent per op: 1172    avg bytes received per op: 136
    backlog wait: 0.094913     RTT: 0.048245     total execute time: 0.213270 (milliseconds)

Based solely on the iozone Write test:
12 threads x 1GB file = 12 GB transferred
12 GB / 496 s = 25973227 Bps
19.2 billion cps / 25973227 Bps = 740 cpB @ 1KB I/O

Based on both the iozone Write and Re-write tests:
25161863 ops / 996 s = 25263 IOps
19.2 billion cps / 25263 IOps = 760004 cpIO


READ tests:

	• Read test: CPU utilization: Wall time  451.762    CPU time  826.888    CPU utilization 183.04 %
	• Re-read test: CPU utilization: Wall time  452.543    CPU time  827.575    CPU utilization 182.87 %

Final mountstats results:

READ:
    25146066 ops (49%)
    avg bytes sent per op: 140    avg bytes received per op: 1152
    backlog wait: 0.092140     RTT: 0.045202     total execute time: 0.205996 (milliseconds)

Based solely on the iozone Read test:
12 threads x 1GB file = 12 GB transferred
12 GB / 451 s = 28569627 Bps
19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O

Based on both the iozone Read and Re-read tests:
25146066 ops / 903 s = 27847 IOps
19.2 billion cps / 27847 IOps = 689481 cpIO



*** With pull-up kernel (confirmed after test - total pull-up was 25763734528 bytes):

WRITE tests:
	• Write test: CPU Utilization: Wall time  453.318    CPU time  839.581    CPU utilization 185.21 %
	• Re-write test: CPU utilization: Wall time  458.717    CPU time  850.335    CPU utilization 185.37 %

Final mountstats results:

WRITE:
          25159897 ops (50%)
        avg bytes sent per op: 1172     avg bytes received per op: 136
        backlog wait: 0.080036  RTT: 0.049674   total execute time: 0.183426 (milliseconds)

Based solely on the iozone Write test:
12 threads x 1GB file = 12 GB transferred
12 GB / 453 s = 28443492 Bps
19.2 billion cps / 28443492 Bps = 675 cpB @ 1KB I/O

Based on both the iozone Write and Re-write tests:
25159897 ops / 911 s = 27617 IOps
19.2 billion cps / 27617 IOps = 695223 cpIO


READ tests:

	• Read test: CPU utilization: Wall time  451.248    CPU time  834.203    CPU utilization 184.87 %
	• Re-read test: CPU utilization: Wall time  451.113    CPU time  834.302    CPU utilization 184.94 %

Final mountstats results:

READ:
    25149527 ops (49%)
    avg bytes sent per op: 140    avg bytes received per op: 1152
    backlog wait: 0.091011     RTT: 0.045790     total execute time: 0.203793 (milliseconds)

Based solely on the iozone Read test:
12 threads x 1GB file = 12 GB transferred
12 GB / 451 s = 28569627 Bps
19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O

Based on both the iozone Read and Re-read tests:
25149527 ops / 902 s = 27881 IOps
19.2 billion cps / 27881 IOps = 688641 cpIO



*** Analysis:

For both kernels, the READ tests are close. This demonstrates that the patch does not have any gross effects on the READ path, as expected.

The WRITE tests are more remarkable.
	• Mean total execute time per WRITE RPC decreases by about 30 microseconds. Almost half of that is decreased backlog wait.
	• Mean round-trip time increases by a microsecond and a half. My earlier report that RT decreased by a microsecond was based on a QD=1 direct latency measure.
	• For 1KB WRITE: IOPS, Cycles per byte written and Cycles per I/O are now within spitting distance of the same metrics for 1KB READ.
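
For reference, those deltas fall straight out of the mountstats figures
above; a throwaway C calculation (per-op values hard-coded from this
run, in milliseconds) makes them explicit:

/* Check of the WRITE deltas quoted above.  The constants are the
 * per-op mountstats values from this run, in milliseconds. */
#include <stdio.h>

int main(void)
{
	double exec_mapped = 0.213270, exec_pullup = 0.183426;
	double blog_mapped = 0.094913, blog_pullup = 0.080036;
	double rtt_mapped  = 0.048245, rtt_pullup  = 0.049674;

	printf("execute time saved : %.1f us\n",
	       (exec_mapped - exec_pullup) * 1000.0);  /* ~29.8 us */
	printf("  backlog wait part: %.1f us\n",
	       (blog_mapped - blog_pullup) * 1000.0);  /* ~14.9 us */
	printf("RTT increase       : %.1f us\n",
	       (rtt_pullup - rtt_mapped) * 1000.0);    /* ~1.4 us */
	return 0;
}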


--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 6/6] xprtrdma: Pull up sometimes
  2019-10-22 16:08         ` Chuck Lever
@ 2019-10-26  2:04           ` Tom Talpey
  0 siblings, 0 replies; 12+ messages in thread
From: Tom Talpey @ 2019-10-26  2:04 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On 10/22/2019 12:08 PM, Chuck Lever wrote:
> 
> 
>> On Oct 19, 2019, at 12:36 PM, Tom Talpey <tom@talpey.com> wrote:
>>
>> On 10/18/2019 7:34 PM, Chuck Lever wrote:
>>> Hi Tom-
>>>> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote:
>>>>
>>>> On 10/17/2019 2:31 PM, Chuck Lever wrote:
>>>>> On some platforms, DMA mapping part of a page is more costly than
>>>>> copying bytes. Restore the pull-up code and use that when we
>>>>> think it's going to be faster. The heuristic for now is to pull-up
>>>>> when the size of the RPC message body fits in the buffer underlying
>>>>> the head iovec.
>>>>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport
>>>>> scale better for tiny I/Os across more RDMA devices. This is because
>>>>> interaction with the I/O MMU is eliminated, as is handling a Send
>>>>> completion, for each of these small I/Os. Without the explicit
>>>>> unmapping, the NIC no longer needs to do a costly internal TLB shoot
>>>>> down for buffers that are just a handful of bytes.
>>>>
>>>> This is good stuff. Do you have any performance data for the new
>>>> strategy, especially latencies and local CPU cycles per byte?
>>> Saves almost a microsecond of RT latency on my NFS client that uses
>>> a real Intel IOMMU. On my other NFS client, the DMA map operations
>>> are always a no-op. This savings applies only to NFS WRITE, of course.
>>> I don't have a good benchmark for cycles per byte. Do you have any
>>> suggestions? Not sure how I would account for cycles spent handling
>>> Send completions, for example.
>>
>> Cycles per byte is fairly simple but like all performance measurement
>> the trick is in the setup. Because of platform variations, it's best
>> to compare results on the same hardware. The absolute value isn't as
>> meaningful. Here's a rough sketch of one approach.
>>
>> - Configure BIOS and OS to hold CPU frequency constant:
>>   - ACPI C-states off
>>   - Turbo mode off
>>   - Power management off (OS needs this too)
>>   - Anything else relevant to clock variation
>> - Hyperthreading off
>>   - (hyperthreads don't add work linearly)
>> - Calculate core count X clock frequency
>>   - (e.g. 8 X 3GHz = 24G cycles/sec)
>>
>> Now, use a benchmark which runs the desired workload and reports %CPU.
>> For a given interval, record the total bytes transferred, time spent,
>> and CPU load. (e.g. 100GB, 100 sec, 20%).
>>
>> Finally, compute CpB (the 1/sec terms cancel out):
>> 20% x 24Gcps = 4.8G cps
>> 100GB / 100s = 1 GBps
>> 4.8Gcps / 1 GBps = 4.8cpb
>>
>> Like I said, it's rough, but surprisingly telling. A similar metric
>> is cycles per IOP, and since you're focusing on small i/o with this
>> change, it might also be an interesting calculation. Simply replace
>> total bytes/sec with IOPS.
> 
> Systems under test:
> 
> 	• 12 Haswell cores x 1.6GHz = 19.2 billion cps
> 	• Server is exporting a tmpfs filesystem
> 	• Client and server using CX-3 Pro on 56Gbps InfiniBand
> 	• Kernel is v5.4-rc4
> 	• iozone -M -+u -i0 -i1 -s1g -r1k -t12 -I
> 
> The purpose of this test is to compare the two kernels, not to publish an absolute performance value. Both kernels below have a number of CPU-intensive debugging options enabled, which might tend to increase CPU cycles per byte or per I/O, and might also amplify the differences between the two kernels.
> 
> 
> 
> *** With DMA-mapping kernel (confirmed after test - total pull-up was zero bytes):
> 
> WRITE tests:
> 
> 	• Write test: CPU Utilization: Wall time  496.136    CPU time  812.879    CPU utilization 163.84 %
> 	• Re-write test: CPU utilization: Wall time  500.266    CPU time  822.810    CPU utilization 164.47 %

Ah, the math I suggested earlier needs a different approach for these
absolute utilizations. The >100% numbers indicate these are per-core,
so 164% means 1.64 cores' worth of load. The math I suggested assumes
relative utilization, where the core count is normalized to 100% total.

For these, ignore the core count; just take the frequency and multiply
by the percent. This means your cpb and cpio numbers are 12X the value.
It's no big deal since we're just comparing before/after though.

> Final mountstats results:
> 
> WRITE:
>      25161863 ops (50%)
>      avg bytes sent per op: 1172    avg bytes received per op: 136
>      backlog wait: 0.094913     RTT: 0.048245     total execute time: 0.213270 (milliseconds)
> 
> Based solely on the iozone Write test:
> 12 threads x 1GB file = 12 GB transferred
> 12 GB / 496 s = 25973227 Bps
> 19.2 billion cps / 25973227 Bps = 740 cpB @ 1KB I/O
> 
> Based on both the iozone Write and Re-write tests:
> 25161863 ops / 996 s = 25263 IOps
> 19.2 billion cps / 25263 IOps = 760004 cpIO

So, 62 cpb and 63K cpio, corrected. Seems a bit high, but as I said
earlier, it's hard to compare across platforms.

> READ tests:
> 
> 	• Read test: CPU utilization: Wall time  451.762    CPU time  826.888    CPU utilization 183.04 %
> 	• Re-read test: CPU utilization: Wall time  452.543    CPU time  827.575    CPU utilization 182.87 %
> 
> Final mountstats results:
> 
> READ:
>      25146066 ops (49%)
>      avg bytes sent per op: 140    avg bytes received per op: 1152
>      backlog wait: 0.092140     RTT: 0.045202     total execute time: 0.205996 (milliseconds)
> 
> Based solely on the iozone Read test:
> 12 threads x 1GB file = 12 GB transferred
> 12 GB / 451 s = 28569627 Bps
> 19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O
> 
> Based on both the iozone Read and Re-read tests:
> 25146066 ops / 903 s = 27847 IOps
> 19.2 billion cps / 27847 IOps = 689481 cpIO
> 
> 
> 
> *** With pull-up kernel (confirmed after test - total pull-up was 25763734528 bytes):
> 
> WRITE tests:
> 	• Write test: CPU Utilization: Wall time  453.318    CPU time  839.581    CPU utilization 185.21 %
> 	• Re-write test: CPU utilization: Wall time  458.717    CPU time  850.335    CPU utilization 185.37 %
> 
> Final mountstats results:
> 
> WRITE:
>            25159897 ops (50%)
>          avg bytes sent per op: 1172     avg bytes received per op: 136
>          backlog wait: 0.080036  RTT: 0.049674   total execute time: 0.183426 (milliseconds)
> 
> Based solely on the iozone Write test:
> 12 threads x 1GB file = 12 GB transferred
> 12 GB / 453 s = 28443492 Bps
> 19.2 billion cps / 28443492 Bps = 675 cpB @ 1KB I/O
> 
> Based on both the iozone Write and Re-write tests:
> 25159897 ops / 911 s = 27617 IOps
> 19.2 billion cps / 27617 IOps = 695223 cpIO
> 
> 
> READ tests:
> 
> 	• Read test: CPU utilization: Wall time  451.248    CPU time  834.203    CPU utilization 184.87 %
> 	• Re-read test: CPU utilization: Wall time  451.113    CPU time  834.302    CPU utilization 184.94 %
> 
> Final mountstats results:
> 
> READ:
>      25149527 ops (49%)
>      avg bytes sent per op: 140    avg bytes received per op: 1152
>      backlog wait: 0.091011     RTT: 0.045790     total execute time: 0.203793 (milliseconds)
> 
> Based solely on the iozone Read test:
> 12 threads x 1GB file = 12 GB transferred
> 12 GB / 451 s = 28569627 Bps
> 19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O
> 
> Based on both the iozone Read and Re-read tests:
> 25149527 ops / 902 s = 27881 IOps
> 19.2 billion cps / 27881 IOps = 688641 cpIO
> 
> 
> 
> *** Analysis:
> 
> For both kernels, the READ tests are close. This demonstrates that the patch does not have any gross effects on the READ path, as expected.

Well, close, but noticeably lower in the pullup approach. This bears
out your suspicion that the IOMMU is not a trivial cost. It's also
likely to be a single-threading point, which will cause queuing as
well as overhead. Of course, the cost may be worth it, for security,
or scatter/gather optimization, etc. But for small i/o, the pullup
is a simple and effective approach.

> The WRITE tests are more remarkable.
> 	• Mean total execute time per WRITE RPC decreases by about 30 microseconds. Almost half of that is decreased backlog wait.
> 	• Mean round-trip time increases by a microsecond and a half. My earlier report that RT decreased by a microsecond was based on a QD=1 direct latency measure.
> 	• For 1KB WRITE: IOPS, Cycles per byte written and Cycles per I/O are now within spitting distance of the same metrics for 1KB READ.

Excellent result. The lower backlog wait may come in part from avoiding the
IOMMU programming, and queuing on it before mapping. Bringing the numbers
close to READ corroborates this. I assume you are using RoCE or IB,
which doesn't require remote registration of the source buffers? The
pullup would improve those even more.

Curious that the RTT goes up by such a large number. It seems really
high. This might become a noticeable penalty on single-threaded
(metadata) workloads.

Tom.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2019-10-26  2:04 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-17 18:31 [PATCH v1 0/6] Sometimes pull up the Send buffer Chuck Lever
2019-10-17 18:31 ` [PATCH v1 1/6] xprtrdma: Ensure ri_id is stable during MR recycling Chuck Lever
2019-10-17 18:31 ` [PATCH v1 2/6] xprtrdma: Remove rpcrdma_sendctx::sc_xprt Chuck Lever
2019-10-17 18:31 ` [PATCH v1 3/6] xprtrdma: Remove rpcrdma_sendctx::sc_device Chuck Lever
2019-10-17 18:31 ` [PATCH v1 4/6] xprtrdma: Move the rpcrdma_sendctx::sc_wr field Chuck Lever
2019-10-17 18:31 ` [PATCH v1 5/6] xprtrdma: Refactor rpcrdma_prepare_msg_sges() Chuck Lever
2019-10-17 18:31 ` [PATCH v1 6/6] xprtrdma: Pull up sometimes Chuck Lever
2019-10-18 20:17   ` Tom Talpey
2019-10-18 23:34     ` Chuck Lever
2019-10-19 16:36       ` Tom Talpey
2019-10-22 16:08         ` Chuck Lever
2019-10-26  2:04           ` Tom Talpey
