* [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: Chuck Lever @ 2018-11-27 16:11 UTC
  To: linux-rdma, linux-nfs

o Select the R_key to invalidate while the CPU cache still contains
  the received RPC Call transport header, rather than waiting until
  we're about to send the RPC Reply.

o Choose Send With Invalidate if there is exactly one distinct R_key
  in the received transport header. If there's more than one, the
  client will have to perform local invalidation after it has
  already waited for remote invalidation.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
Hi-

Please consider this NFS server-side patch for v4.21.

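For reviewers, a quick sketch of the transport header layout that the
new parser walks (RPC-over-RDMA Version One, per RFC 8166; simplified,
each field one XDR word unless noted):

    xid | vers | credits | proc        <- rpcrdma_fixed_maxsz (4 words)
    Read list:   0 or more of { 1 | position | handle | length | offset (2) }, then 0
    Write list:  0 or more of { 1 | segcount | segcount x ( handle | length | offset (2) ) }, then 0
    Reply chunk: either 0, or { 1 | segcount | segcount x ( handle | length | offset (2) ) }

A plain segment is four XDR words (handle, length, 64-bit offset),
which is why the parser below advances with "p += 4"; read segments
carry an extra position word.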

 include/linux/sunrpc/svc_rdma.h         |    1 
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |   63 +++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_sendto.c   |   53 ++++++--------------------
 3 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index e6e2691..7e22681 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -135,6 +135,7 @@ struct svc_rdma_recv_ctxt {
 	u32			rc_byte_len;
 	unsigned int		rc_page_count;
 	unsigned int		rc_hdr_count;
+	u32			rc_inv_rkey;
 	struct page		*rc_pages[RPCSVC_MAXPAGES];
 };
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index b24d5b8..828b149 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -485,6 +485,68 @@ static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end)
 	return p;
 }
 
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one R_key to invalidate.
+ *
+ * If there is exactly one distinct R_key in the received transport
+ * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
+ *
+ * Perform this operation while the received transport header is
+ * still in the CPU cache.
+ */
+static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
+				  struct svc_rdma_recv_ctxt *ctxt)
+{
+	__be32 inv_rkey, *p;
+	u32 i, segcount;
+
+	ctxt->rc_inv_rkey = 0;
+
+	if (!rdma->sc_snd_w_inv)
+		return;
+
+	inv_rkey = xdr_zero;
+	p = ctxt->rc_recv_buf;
+	p += rpcrdma_fixed_maxsz;
+
+	/* Read list */
+	while (*p++ != xdr_zero) {
+		p++;	/* position */
+		if (inv_rkey == xdr_zero)
+			inv_rkey = *p;
+		else if (inv_rkey != *p)
+			return;
+		p += 4;
+	}
+
+	/* Write list */
+	while (*p++ != xdr_zero) {
+		segcount = be32_to_cpup(p++);
+		for (i = 0; i < segcount; i++) {
+			if (inv_rkey == xdr_zero)
+				inv_rkey = *p;
+			else if (inv_rkey != *p)
+				return;
+			p += 4;
+		}
+	}
+
+	/* Reply chunk */
+	if (*p++ != xdr_zero) {
+		segcount = be32_to_cpup(p++);
+		for (i = 0; i < segcount; i++) {
+			if (inv_rkey == xdr_zero)
+				inv_rkey = *p;
+			else if (inv_rkey != *p)
+				return;
+			p += 4;
+		}
+	}
+
+	ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+}
+
 /* On entry, xdr->head[0].iov_base points to first byte in the
  * RPC-over-RDMA header.
  *
@@ -746,6 +808,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
 		return ret;
 	}
+	svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
 
 	p += rpcrdma_fixed_maxsz;
 	if (*p != xdr_zero)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 8602a5f..d48bc6d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -484,32 +484,6 @@ static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
 		*reply = NULL;
 }
 
-/* RPC-over-RDMA Version One private extension: Remote Invalidation.
- * Responder's choice: requester signals it can handle Send With
- * Invalidate, and responder chooses one rkey to invalidate.
- *
- * Find a candidate rkey to invalidate when sending a reply.  Picks the
- * first R_key it finds in the chunk lists.
- *
- * Returns zero if RPC's chunk lists are empty.
- */
-static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
-				 __be32 *wr_lst, __be32 *rp_ch)
-{
-	__be32 *p;
-
-	p = rdma_argp + rpcrdma_fixed_maxsz;
-	if (*p != xdr_zero)
-		p += 2;
-	else if (wr_lst && be32_to_cpup(wr_lst + 1))
-		p = wr_lst + 2;
-	else if (rp_ch && be32_to_cpup(rp_ch + 1))
-		p = rp_ch + 2;
-	else
-		return 0;
-	return be32_to_cpup(p);
-}
-
 static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
 				 struct svc_rdma_send_ctxt *ctxt,
 				 struct page *page,
@@ -672,7 +646,7 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
  *
  * RDMA Send is the last step of transmitting an RPC reply. Pages
  * involved in the earlier RDMA Writes are here transferred out
- * of the rqstp and into the ctxt's page array. These pages are
+ * of the rqstp and into the sctxt's page array. These pages are
  * DMA unmapped by each Write completion, but the subsequent Send
  * completion finally releases these pages.
  *
@@ -680,32 +654,31 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
  * - The Reply's transport header will never be larger than a page.
  */
 static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
-				   struct svc_rdma_send_ctxt *ctxt,
-				   __be32 *rdma_argp,
+				   struct svc_rdma_send_ctxt *sctxt,
+				   struct svc_rdma_recv_ctxt *rctxt,
 				   struct svc_rqst *rqstp,
 				   __be32 *wr_lst, __be32 *rp_ch)
 {
 	int ret;
 
 	if (!rp_ch) {
-		ret = svc_rdma_map_reply_msg(rdma, ctxt,
+		ret = svc_rdma_map_reply_msg(rdma, sctxt,
 					     &rqstp->rq_res, wr_lst);
 		if (ret < 0)
 			return ret;
 	}
 
-	svc_rdma_save_io_pages(rqstp, ctxt);
+	svc_rdma_save_io_pages(rqstp, sctxt);
 
-	ctxt->sc_send_wr.opcode = IB_WR_SEND;
-	if (rdma->sc_snd_w_inv) {
-		ctxt->sc_send_wr.ex.invalidate_rkey =
-			svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
-		if (ctxt->sc_send_wr.ex.invalidate_rkey)
-			ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
+	if (rctxt->rc_inv_rkey) {
+		sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
+		sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
+	} else {
+		sctxt->sc_send_wr.opcode = IB_WR_SEND;
 	}
 	dprintk("svcrdma: posting Send WR with %u sge(s)\n",
-		ctxt->sc_send_wr.num_sge);
-	return svc_rdma_send(rdma, &ctxt->sc_send_wr);
+		sctxt->sc_send_wr.num_sge);
+	return svc_rdma_send(rdma, &sctxt->sc_send_wr);
 }
 
 /* Given the client-provided Write and Reply chunks, the server was not
@@ -809,7 +782,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	}
 
 	svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp));
-	ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp,
+	ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp,
 				      wr_lst, rp_ch);
 	if (ret < 0)
 		goto err1;



* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: J. Bruce Fields @ 2018-11-27 16:29 UTC
  To: Chuck Lever; +Cc: linux-rdma, linux-nfs

On Tue, Nov 27, 2018 at 11:11:35AM -0500, Chuck Lever wrote:
> o Select the R_key to invalidate while the CPU cache still contains
>   the received RPC Call transport header, rather than waiting until
>   we're about to send the RPC Reply.
> 
> o Choose Send With Invalidate if there is exactly one distinct R_key
>   in the received transport header. If there's more than one, the
>   client will have to perform local invalidation after it has
>   already waited for remote invalidation.
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> Hi-
> 
> Please consider this NFS server-side patch for v4.21.

OK, thanks, applying.

(By the way, I'd appreciate it if patch submissions had
bfields@fieldses.org on the To: line; my filters handle that a little
differently than mailing list traffic.)

--b.


* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: Chuck Lever @ 2018-11-27 16:31 UTC
  To: Bruce Fields; +Cc: linux-rdma, Linux NFS Mailing List



> On Nov 27, 2018, at 11:29 AM, bfields@fieldses.org wrote:
> 
> On Tue, Nov 27, 2018 at 11:11:35AM -0500, Chuck Lever wrote:
>> o Select the R_key to invalidate while the CPU cache still contains
>>  the received RPC Call transport header, rather than waiting until
>>  we're about to send the RPC Reply.
>> 
>> o Choose Send With Invalidate if there is exactly one distinct R_key
>>  in the received transport header. If there's more than one, the
>>  client will have to perform local invalidation after it has
>>  already waited for remote invalidation.
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> Hi-
>> 
>> Please consider this NFS server-side patch for v4.21.
> 
> OK, thanks, applying.
> 
> (By the way, I'd appreciate it if patch submissions had
> bfields@fieldses.org on the To: line; my filters handle that a little
> differently than mailing list traffic.)

I've been told not to include To: when the patch is being
presented for review. This is a v1. If you feel it is ready
to go in, great! But I purposely did not include To: Bruce
because it has not had any review yet.


--
Chuck Lever





* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: Bruce Fields @ 2018-11-27 17:21 UTC
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On Tue, Nov 27, 2018 at 11:31:23AM -0500, Chuck Lever wrote:
> 
> 
> > On Nov 27, 2018, at 11:29 AM, bfields@fieldses.org wrote:
> > 
> > On Tue, Nov 27, 2018 at 11:11:35AM -0500, Chuck Lever wrote:
> >> o Select the R_key to invalidate while the CPU cache still contains
> >>  the received RPC Call transport header, rather than waiting until
> >>  we're about to send the RPC Reply.
> >> 
> >> o Choose Send With Invalidate if there is exactly one distinct R_key
> >>  in the received transport header. If there's more than one, the
> >>  client will have to perform local invalidation after it has
> >>  already waited for remote invalidation.
> >> 
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> Hi-
> >> 
> >> Please consider this NFS server-side patch for v4.21.
> > 
> > OK, thanks, applying.
> > 
> > (By the way, I'd appreciate it if patch submissions had
> > bfields@fieldses.org on the To: line; my filters handle that a little
> > differently than mailing list traffic.)
> 
> I've been told not to include To: when the patch is being
> presented for review.  This is a v1. If you feel it is ready
> to go in, great! But I purposely did not include To: Bruce
> because it has not had any review yet.

Oh.  The "Please consider this..." made it sound like you wanted it in
now.  I can wait.

--b.


* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: Tom Talpey @ 2018-11-27 21:16 UTC
  To: Chuck Lever, linux-rdma, linux-nfs

On 11/27/2018 11:11 AM, Chuck Lever wrote:
> o Select the R_key to invalidate while the CPU cache still contains
>    the received RPC Call transport header, rather than waiting until
>    we're about to send the RPC Reply.
> 
> o Choose Send With Invalidate if there is exactly one distinct R_key
>    in the received transport header. If there's more than one, the
>    client will have to perform local invalidation after it has
>    already waited for remote invalidation.

What's the reason for remote-invalidating only if exactly one
region is targeted? It seems valuable to save the client the work,
no matter how many regions are used.

Put another way, why the change?

Tom.


* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: Chuck Lever @ 2018-11-27 21:21 UTC
  To: Tom Talpey; +Cc: linux-rdma, Linux NFS Mailing List



> On Nov 27, 2018, at 4:16 PM, Tom Talpey <tom@talpey.com> wrote:
> 
> On 11/27/2018 11:11 AM, Chuck Lever wrote:
>> o Select the R_key to invalidate while the CPU cache still contains
>>   the received RPC Call transport header, rather than waiting until
>>   we're about to send the RPC Reply.
>> o Choose Send With Invalidate if there is exactly one distinct R_key
>>   in the received transport header. If there's more than one, the
>>   client will have to perform local invalidation after it has
>>   already waited for remote invalidation.
> 
> What's the reason for remote-invalidating only if exactly one
> region is targeted? It seems valuable to save the client the work,
> no matter how many regions are used.

Because remote invalidation delays the Receive completion.
If the client has to do local invalidation as well, that
means two delays, and the client already has to do a context
switch for the local invalidation.

Also, some cards are not especially efficient at remote
invalidation. If the server requests it less frequently (for
example, when it's not actually going to be beneficial),
that's good for those cards.

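To put the client-side cost in concrete terms, here is a minimal
sketch of a Receive completion handler (the struct, bound, and helper
shape are hypothetical; only the ib_wc fields and flags are real
ib_verbs API):

#include <rdma/ib_verbs.h>

#define MAX_RPC_RKEYS 8		/* hypothetical bound for this sketch */

/* Hypothetical per-RPC bookkeeping; not the real rpcrdma structures. */
struct reply_ctxt {
	u32		pending_rkeys[MAX_RPC_RKEYS];
	unsigned int	num_pending;
};

/* If the responder used Send With Invalidate, the invalidated R_key
 * arrives in the work completion, and that one MR needs no LOCAL_INV
 * work request. */
static void reply_recv_done(struct ib_wc *wc, struct reply_ctxt *rep)
{
	unsigned int i;

	if (wc->status != IB_WC_SUCCESS)
		return;

	if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
		for (i = 0; i < rep->num_pending; i++) {
			if (rep->pending_rkeys[i] == wc->ex.invalidate_rkey) {
				rep->pending_rkeys[i] =
					rep->pending_rkeys[--rep->num_pending];
				break;
			}
		}
	}

	/* Every R_key still pending must be invalidated locally before
	 * the reply may be processed: a second wait, on top of the
	 * remote invalidation this Receive already waited for. With
	 * exactly one distinct R_key per RPC, num_pending drops to
	 * zero here and no LOCAL_INV is needed at all. */
}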


--
Chuck Lever





* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
From: Tom Talpey @ 2018-11-27 21:30 UTC
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On 11/27/2018 4:21 PM, Chuck Lever wrote:
> 
> 
>> On Nov 27, 2018, at 4:16 PM, Tom Talpey <tom@talpey.com> wrote:
>>
>> On 11/27/2018 11:11 AM, Chuck Lever wrote:
>>> o Select the R_key to invalidate while the CPU cache still contains
>>>    the received RPC Call transport header, rather than waiting until
>>>    we're about to send the RPC Reply.
>>> o Choose Send With Invalidate if there is exactly one distinct R_key
>>>    in the received transport header. If there's more than one, the
>>>    client will have to perform local invalidation after it has
>>>    already waited for remote invalidation.
>>
>> What's the reason for remote-invalidating only if exactly one
>> region is targeted? It seems valuable to save the client the work,
>> no matter how many regions are used.
> 
> Because remote invalidation delays the Receive completion.

Well yes, but the invalidations have to happen before the reply is
processed, and remote invalidation saves a local work request plus
its completion.

> If the client has to do local invalidation as well, that
> means two delays, and the client already has to do a context
> switch for the local invalidation.

Have you measured the difference?

> Also, some cards are not especially efficient at remote
> invalidation. If the server requests it less frequently (for
> example, when it's not actually going to be beneficial),
> that's good for those cards.

Hmm. I'd say get a better card and don't hobble the rpcrdma
protocol design. Maybe that's just me.

I think it would be best to capture some or all of this
explanation in the commit message, in any case.

Tom.



* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
  2018-11-27 21:30     ` Tom Talpey
@ 2018-11-27 22:23       ` Chuck Lever
  2018-11-28  1:13         ` Tom Talpey
  0 siblings, 1 reply; 9+ messages in thread
From: Chuck Lever @ 2018-11-27 22:23 UTC (permalink / raw)
  To: Tom Talpey; +Cc: linux-rdma, Linux NFS Mailing List



> On Nov 27, 2018, at 4:30 PM, Tom Talpey <tom@talpey.com> wrote:
> 
> On 11/27/2018 4:21 PM, Chuck Lever wrote:
>>> On Nov 27, 2018, at 4:16 PM, Tom Talpey <tom@talpey.com> wrote:
>>> 
>>> On 11/27/2018 11:11 AM, Chuck Lever wrote:
>>>> o Select the R_key to invalidate while the CPU cache still contains
>>>>   the received RPC Call transport header, rather than waiting until
>>>>   we're about to send the RPC Reply.
>>>> o Choose Send With Invalidate if there is exactly one distinct R_key
>>>>   in the received transport header. If there's more than one, the
>>>>   client will have to perform local invalidation after it has
>>>>   already waited for remote invalidation.
>>> 
>>> What's the reason for remote-invalidating only if exactly one
>>> region is targeted? It seems valuable to save the client the work,
>>> no matter how many regions are used.
>> Because remote invalidation delays the Receive completion.
> 
> Well yes, but the invalidations have to happen before the reply is
> processed, and remote invalidation saves a local work request plus
> its completion.

That is true only if remote invalidation can knock down all the
R_keys for that RPC. If there's more than one R_key, a local
invalidation is needed anyway, so there is no saving; instead the
client pays the extra latency of waiting twice.

A couple of details to note:
- remote invalidation is only available with FRWR, which
  invalidates asynchronously
- a smart FRWR client implementation will post a chain of LOCAL_INV
  WRs, then wait for only the last one to signal completion (see
  the sketch below). That's just one doorbell, one interrupt, and
  one context switch no matter how many LOCAL_INV WRs are needed.

So if the client still has to do even one local invalidation, it's
not worth the trouble to remotely invalidate.
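
To make that concrete, here's a minimal sketch of the chained
posting, assuming the core verbs API, count >= 1, and a QP created
with IB_SIGNAL_REQ_WR so unsignaled WRs are legal. frwr_unmap_chain()
is a hypothetical helper, not actual xprtrdma code; wr_cqe setup and
completion handling are elided.

#include <linux/slab.h>
#include <rdma/ib_verbs.h>

/* Invalidate @count MRs with one chain of LOCAL_INV WRs. Only the
 * final WR is signaled, so the whole chain costs a single doorbell
 * and generates a single completion.
 */
static int frwr_unmap_chain(struct ib_qp *qp, struct ib_mr **mrs,
			    int count)
{
	struct ib_send_wr *wrs, *prev = NULL, *first = NULL;
	const struct ib_send_wr *bad_wr;
	int i, ret;

	wrs = kcalloc(count, sizeof(*wrs), GFP_KERNEL);
	if (!wrs)
		return -ENOMEM;
	for (i = 0; i < count; i++) {
		wrs[i].opcode = IB_WR_LOCAL_INV;
		wrs[i].ex.invalidate_rkey = mrs[i]->rkey;
		if (prev)
			prev->next = &wrs[i];
		else
			first = &wrs[i];
		prev = &wrs[i];
	}
	prev->send_flags = IB_SEND_SIGNALED;	/* signal only the tail */

	ret = ib_post_send(qp, first, &bad_wr);
	/* Caller waits for the tail's completion, then frees @wrs. */
	return ret;
}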


>> If the client has to do local invalidation as well, that
>> means two delays, and the client already has to do a context
>> switch for the local invalidation.
> 
> Have you measured the difference?

Yes, as reported in the patch description. Perhaps I can include
some interesting iozone results.


>> Also, some cards are not especially efficient at remote
>> invalidation. If the server requests it less frequently (for
>> example, when it's not actually going to be beneficial),
>> that's good for those cards.
> 
> Hmm. I'd say get a better card and don't hobble the rpcrdma
> protocol design. Maybe that's just me.

This behavior seems typical of most recent hardware. I suspect
there's some locking of the hardware Send queue to handle remote
invalidation, and that locking contends with actual posted WRs
from the host.

With cards that have a shallow FR depth, multiple MRs/R_keys are
required to register a single 1MB NFS READ or WRITE. Here's where
squelching remote invalidation really pays off.
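
To put numbers on it: assuming a hypothetical device limit of
max_fast_reg_page_list_len = 64, one MR covers at most 64 pages,
or 256KB with 4KB pages, so a 1MB payload needs four MRs and four
distinct R_keys. With this patch, the server then skips remote
invalidation entirely for that RPC.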


This is an implementation change, not a protocol change, btw. The
current RPC-over-RDMA v1 protocol extension is Responder's Choice,
which means the server gets to pick an R_key or decide not to
invalidate remotely at all. I'm simply optimizing that choice.


> I think it would be best to capture some or all of this
> explanation in the commit message, in any case.

You mean you want my patch description to explain _why_? ;-)


> Tom.

--
Chuck Lever




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate
  2018-11-27 22:23       ` Chuck Lever
@ 2018-11-28  1:13         ` Tom Talpey
  0 siblings, 0 replies; 9+ messages in thread
From: Tom Talpey @ 2018-11-28  1:13 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On 11/27/2018 5:23 PM, Chuck Lever wrote:
> 
> 
>> On Nov 27, 2018, at 4:30 PM, Tom Talpey <tom@talpey.com> wrote:
>>
>> On 11/27/2018 4:21 PM, Chuck Lever wrote:
>>>> On Nov 27, 2018, at 4:16 PM, Tom Talpey <tom@talpey.com> wrote:
>>>>
>>>> On 11/27/2018 11:11 AM, Chuck Lever wrote:
>>>>> o Select the R_key to invalidate while the CPU cache still contains
>>>>>    the received RPC Call transport header, rather than waiting until
>>>>>    we're about to send the RPC Reply.
>>>>> o Choose Send With Invalidate if there is exactly one distinct R_key
>>>>>    in the received transport header. If there's more than one, the
>>>>>    client will have to perform local invalidation after it has
>>>>>    already waited for remote invalidation.
>>>>
>>>> What's the reason for remote-invalidating only if exactly one
>>>> region is targeted? It seems valuable to save the client the work,
>>>> no matter how many regions are used.
>>> Because remote invalidation delays the Receive completion.
>>
>> Well yes, but the invalidations have to happen before the reply is
>> processed, and remote invalidation saves a local work request plus
>> its completion.
> 
> That is true only if remote invalidation can knock down all the
> R_keys for that RPC. If there's more than one R_key for that RPC,
> a local invalidation is needed anyway, and there's no savings but
> rather there is a cost of the extra latency of waiting twice.
> 
> A couple of details to note:
> - remote invalidation is only available with FRWR, which
>    invalidates asynchronously
> - a smart FRWR client implementation will post a chain of LOCAL
>    INV WRs, then wait for the last one to signal completion. That's
>    just one doorbell, one interrupt, and one context switch no
>    matter how many LOCAL INV WRs are needed.
> 
> So if the client still has to do even one local invalidation, it's
> not worth the trouble to remotely invalidate.

I still don't agree that it's "not worth it", but it's a choice.

Just a couple of other notes:

>> Have you measured the difference?
> 
> Yes, as reported in the patch description. Perhaps I can include
> some interesting iozone results.

I didn't see anything about this in the patch description. But I
wasn't arguing for including this kind of detail, just asking
whether you had actually measured it. I'm interested in that, btw.

> This behavior seems to be a typical feature of most recent hardware.
> I suspect there's some locking of the hardware Send queue to handle
> RI that contends with actual posted WRs from the host.

That would be really bad. Have you reported this to the vendors?

> With cards that have a shallow FR depth, multiple MRs/R_keys are
> required to register a single 1MB NFS READ or WRITE. Here's where
> squelching remote invalidation really pays off.

Sure, but such a bandwidth-dominated workload isn't very interesting
performance-wise. With 1MB ops I would expect you to be wire-limited,
right?
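
(For scale: assuming a 100 Gb/s link, a 1MB transfer occupies the
wire for roughly 84 microseconds, so a few microseconds of
invalidation latency would be lost in the noise.)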

>> I think it would be best to capture some or all of this
>> explanation in the commit message, in any case.
> 
> You mean you want my patch description to explain _why_ ? ;-)

Sort of. My belief is that this decision represents a micro-optimization
and is unlikely to be forever true. More significantly, it's not a
bug fix or correctness issue. So, capturing the reasoning behind
it is useful for the future, in case someone thinks to unwind it.

Tom.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2018-11-28  1:13 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-27 16:11 [PATCH v1] svcrdma: Optimize the logic that selects the R_key to invalidate Chuck Lever
2018-11-27 16:29 ` J. Bruce Fields
2018-11-27 16:31   ` Chuck Lever
2018-11-27 17:21     ` Bruce Fields
2018-11-27 21:16 ` Tom Talpey
2018-11-27 21:21   ` Chuck Lever
2018-11-27 21:30     ` Tom Talpey
2018-11-27 22:23       ` Chuck Lever
2018-11-28  1:13         ` Tom Talpey
