All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again
@ 2020-12-11 22:33 Chuck Lever
  2020-12-12 19:16 ` Chuck Lever
  0 siblings, 1 reply; 2+ messages in thread
From: Chuck Lever @ 2020-12-11 22:33 UTC (permalink / raw)
  To: linux-nfs

Daire Byrne reports a ~50% aggregate throughput regression on his
Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to
use xprt_sock_sendmsg for socket sends"), which replaced
kernel_sendpage() calls in NFSD's socket send path with calls to
sock_sendmsg() using iov_iter.

Investigation showed that tcp_sendmsg() was not using zero-copy to
send the xdr_buf's bvec pages, but instead was relying on memcpy.
This means copying every byte of a large NFS READ payload.

It looks like TLS sockets do indeed support a ->sendpage method,
so it's really not necessary to use xprt_sock_sendmsg() to support
TLS fully on the server. A mechanical reversion of da1661b93bf4 is
not possible at this point, but we can re-implement the server's
TCP socket sendmsg path using kernel_sendpage().

No Fixes: tag. If needed, please backport this fix by hand.

Reported-by: Daire Byrne <daire@dneg.com>
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svcsock.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 4 deletions(-)


This replaces the SVC zero-copy send patch I posted a couple of
weeks ago.


diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b248f2349437..30332111bd37 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	return 0;	/* record not complete */
 }
 
+/**
+ * svc_tcp_sendmsg - Send an RPC message on a TCP socket
+ * @sock: socket to write the RPC message onto
+ * @xdr: XDR buffer containing the RPC message
+ * @marker: TCP record marker
+ * @sentp: OUT: number of bytes actually written
+ *
+ * Caller serializes calls on this @sock, and ensures the pages
+ * backing @xdr are unchanging. In addition, it is assumed that
+ * no .bv_len is larger than PAGE_SIZE.
+ *
+ * Returns zero on success or a negative errno value.
+ */
+static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr,
+			   rpc_fraghdr marker, unsigned int *sentp)
+{
+	struct kvec vec[2] = {
+		[0] = {
+			.iov_base	= &marker,
+			.iov_len	= sizeof(marker),
+		},
+		[1] = *xdr->head,
+	};
+	size_t len = vec[0].iov_len + vec[1].iov_len;
+	const struct kvec *tail = xdr->tail;
+	struct msghdr msg = {
+		.msg_flags	= 0,
+	};
+	int ret;
+
+	*sentp = 0;
+
+	/*
+	 * Optimized for the common case where we have just the record
+	 * marker and xdr->head.
+	 */
+	if (xdr->head[0].iov_len < xdr->len)
+		msg.msg_flags = MSG_MORE;
+	iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len);
+	ret = sock_sendmsg(sock, &msg);
+	if (ret < 0)
+		return ret;
+	*sentp += ret;
+	if (*sentp != len)
+		goto out;
+
+	if (xdr->page_len) {
+		unsigned int offset, len, remaining;
+		struct bio_vec *bvec;
+		int flags, ret;
+
+		bvec = xdr->bvec;
+		offset = xdr->page_base;
+		remaining = xdr->page_len;
+		flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		while (remaining > 0) {
+			if (remaining <= PAGE_SIZE && tail->iov_len == 0)
+				flags = 0;
+			len = min(remaining, bvec->bv_len);
+			ret = kernel_sendpage(sock, bvec->bv_page,
+						bvec->bv_offset + offset,
+						len, flags);
+			if (ret < 0)
+				return ret;
+			*sentp += ret;
+			if (ret != len)
+				goto out;
+			remaining -= len;
+			offset = 0;
+			bvec++;
+		}
+	}
+
+	if (tail->iov_len) {
+		ret = kernel_sendpage(sock, virt_to_page(tail->iov_base),
+				      offset_in_page(tail->iov_base),
+				      tail->iov_len, 0);
+		if (ret < 0)
+			return ret;
+		*sentp += ret;
+	}
+
+out:
+	return 0;
+}
+
 /**
  * svc_tcp_sendto - Send out a reply on a TCP socket
  * @rqstp: completed svc_rqst
@@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
 	struct xdr_buf *xdr = &rqstp->rq_res;
 	rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
 					 (u32)xdr->len);
-	struct msghdr msg = {
-		.msg_flags	= 0,
-	};
 	unsigned int sent;
 	int err;
 
 	svc_tcp_release_rqst(rqstp);
+	xdr_alloc_bvec(xdr);
 
 	mutex_lock(&xprt->xpt_mutex);
 	if (svc_xprt_is_dead(xprt))
 		goto out_notconn;
-	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
+	err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
 	xdr_free_bvec(xdr);
 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
 	if (err < 0 || sent != (xdr->len + sizeof(marker)))



^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again
  2020-12-11 22:33 [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again Chuck Lever
@ 2020-12-12 19:16 ` Chuck Lever
  0 siblings, 0 replies; 2+ messages in thread
From: Chuck Lever @ 2020-12-12 19:16 UTC (permalink / raw)
  To: Linux NFS Mailing List



> On Dec 11, 2020, at 5:33 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
> 
> Daire Byrne reports a ~50% aggregate throughput regression on his
> Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to
> use xprt_sock_sendmsg for socket sends"), which replaced
> kernel_sendpage() calls in NFSD's socket send path with calls to
> sock_sendmsg() using iov_iter.
> 
> Investigation showed that tcp_sendmsg() was not using zero-copy to
> send the xdr_buf's bvec pages, but instead was relying on memcpy.
> This means copying every byte of a large NFS READ payload.
> 
> It looks like TLS sockets do indeed support a ->sendpage method,
> so it's really not necessary to use xprt_sock_sendmsg() to support
> TLS fully on the server. A mechanical reversion of da1661b93bf4 is
> not possible at this point, but we can re-implement the server's
> TCP socket sendmsg path using kernel_sendpage().
> 
> No Fixes: tag. If needed, please backport this fix by hand.
> 
> Reported-by: Daire Byrne <daire@dneg.com>
> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> net/sunrpc/svcsock.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 88 insertions(+), 4 deletions(-)
> 
> 
> This replaces the SVC zero-copy send patch I posted a couple of
> weeks ago.
> 
> 
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index b248f2349437..30332111bd37 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> 	return 0;	/* record not complete */
> }
> 
> +/**
> + * svc_tcp_sendmsg - Send an RPC message on a TCP socket
> + * @sock: socket to write the RPC message onto
> + * @xdr: XDR buffer containing the RPC message
> + * @marker: TCP record marker
> + * @sentp: OUT: number of bytes actually written
> + *
> + * Caller serializes calls on this @sock, and ensures the pages
> + * backing @xdr are unchanging. In addition, it is assumed that
> + * no .bv_len is larger than PAGE_SIZE.
> + *
> + * Returns zero on success or a negative errno value.
> + */
> +static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr,
> +			   rpc_fraghdr marker, unsigned int *sentp)
> +{
> +	struct kvec vec[2] = {
> +		[0] = {
> +			.iov_base	= &marker,
> +			.iov_len	= sizeof(marker),
> +		},
> +		[1] = *xdr->head,
> +	};
> +	size_t len = vec[0].iov_len + vec[1].iov_len;
> +	const struct kvec *tail = xdr->tail;
> +	struct msghdr msg = {
> +		.msg_flags	= 0,
> +	};
> +	int ret;
> +
> +	*sentp = 0;
> +
> +	/*
> +	 * Optimized for the common case where we have just the record
> +	 * marker and xdr->head.
> +	 */
> +	if (xdr->head[0].iov_len < xdr->len)
> +		msg.msg_flags = MSG_MORE;
> +	iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len);
> +	ret = sock_sendmsg(sock, &msg);
> +	if (ret < 0)
> +		return ret;
> +	*sentp += ret;
> +	if (*sentp != len)
> +		goto out;
> +
> +	if (xdr->page_len) {
> +		unsigned int offset, len, remaining;
> +		struct bio_vec *bvec;
> +		int flags, ret;
> +
> +		bvec = xdr->bvec;
> +		offset = xdr->page_base;
> +		remaining = xdr->page_len;
> +		flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +		while (remaining > 0) {
> +			if (remaining <= PAGE_SIZE && tail->iov_len == 0)
> +				flags = 0;
> +			len = min(remaining, bvec->bv_len);
> +			ret = kernel_sendpage(sock, bvec->bv_page,
> +						bvec->bv_offset + offset,
> +						len, flags);
> +			if (ret < 0)
> +				return ret;
> +			*sentp += ret;
> +			if (ret != len)
> +				goto out;
> +			remaining -= len;
> +			offset = 0;
> +			bvec++;
> +		}
> +	}
> +
> +	if (tail->iov_len) {
> +		ret = kernel_sendpage(sock, virt_to_page(tail->iov_base),
> +				      offset_in_page(tail->iov_base),
> +				      tail->iov_len, 0);
> +		if (ret < 0)
> +			return ret;
> +		*sentp += ret;
> +	}
> +
> +out:
> +	return 0;
> +}
> +
> /**
>  * svc_tcp_sendto - Send out a reply on a TCP socket
>  * @rqstp: completed svc_rqst
> @@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
> 	struct xdr_buf *xdr = &rqstp->rq_res;
> 	rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
> 					 (u32)xdr->len);
> -	struct msghdr msg = {
> -		.msg_flags	= 0,
> -	};
> 	unsigned int sent;
> 	int err;
> 
> 	svc_tcp_release_rqst(rqstp);
> +	xdr_alloc_bvec(xdr);

This should be

+	xdr_alloc_bvec(xdr, GFP_KERNEL);


> 	mutex_lock(&xprt->xpt_mutex);
> 	if (svc_xprt_is_dead(xprt))
> 		goto out_notconn;
> -	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
> +	err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
> 	xdr_free_bvec(xdr);
> 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> 	if (err < 0 || sent != (xdr->len + sizeof(marker)))
> 
> 

--
Chuck Lever




^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-12-12 19:18 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-11 22:33 [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again Chuck Lever
2020-12-12 19:16 ` Chuck Lever

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.