* [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again
@ 2020-12-11 22:33 Chuck Lever
2020-12-12 19:16 ` Chuck Lever
0 siblings, 1 reply; 2+ messages in thread
From: Chuck Lever @ 2020-12-11 22:33 UTC (permalink / raw)
To: linux-nfs
Daire Byrne reports a ~50% aggregate throughput regression on his
Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to
use xprt_sock_sendmsg for socket sends"), which replaced
kernel_sendpage() calls in NFSD's socket send path with calls to
sock_sendmsg() using iov_iter.
Investigation showed that tcp_sendmsg() was not using zero-copy to
send the xdr_buf's bvec pages, but instead was relying on memcpy.
This means copying every byte of a large NFS READ payload.
It looks like TLS sockets do indeed support a ->sendpage method,
so it's really not necessary to use xprt_sock_sendmsg() to support
TLS fully on the server. A mechanical reversion of da1661b93bf4 is
not possible at this point, but we can re-implement the server's
TCP socket sendmsg path using kernel_sendpage().
No Fixes: tag. If needed, please backport this fix by hand.
Reported-by: Daire Byrne <daire@dneg.com>
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/svcsock.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 88 insertions(+), 4 deletions(-)
This replaces the SVC zero-copy send patch I posted a couple of
weeks ago.
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b248f2349437..30332111bd37 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
return 0; /* record not complete */
}
+/**
+ * svc_tcp_sendmsg - Send an RPC message on a TCP socket
+ * @sock: socket to write the RPC message onto
+ * @xdr: XDR buffer containing the RPC message
+ * @marker: TCP record marker
+ * @sentp: OUT: number of bytes actually written
+ *
+ * Caller serializes calls on this @sock, and ensures the pages
+ * backing @xdr are unchanging. In addition, it is assumed that
+ * no .bv_len is larger than PAGE_SIZE.
+ *
+ * Returns zero on success or a negative errno value.
+ */
+static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr,
+ rpc_fraghdr marker, unsigned int *sentp)
+{
+ struct kvec vec[2] = {
+ [0] = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker),
+ },
+ [1] = *xdr->head,
+ };
+ size_t len = vec[0].iov_len + vec[1].iov_len;
+ const struct kvec *tail = xdr->tail;
+ struct msghdr msg = {
+ .msg_flags = 0,
+ };
+ int ret;
+
+ *sentp = 0;
+
+ /*
+ * Optimized for the common case where we have just the record
+ * marker and xdr->head.
+ */
+ if (xdr->head[0].iov_len < xdr->len)
+ msg.msg_flags = MSG_MORE;
+ iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len);
+ ret = sock_sendmsg(sock, &msg);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (*sentp != len)
+ goto out;
+
+ if (xdr->page_len) {
+ unsigned int offset, len, remaining;
+ struct bio_vec *bvec;
+ int flags, ret;
+
+ bvec = xdr->bvec;
+ offset = xdr->page_base;
+ remaining = xdr->page_len;
+ flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ while (remaining > 0) {
+ if (remaining <= PAGE_SIZE && tail->iov_len == 0)
+ flags = 0;
+ len = min(remaining, bvec->bv_len);
+ ret = kernel_sendpage(sock, bvec->bv_page,
+ bvec->bv_offset + offset,
+ len, flags);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != len)
+ goto out;
+ remaining -= len;
+ offset = 0;
+ bvec++;
+ }
+ }
+
+ if (tail->iov_len) {
+ ret = kernel_sendpage(sock, virt_to_page(tail->iov_base),
+ offset_in_page(tail->iov_base),
+ tail->iov_len, 0);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ }
+
+out:
+ return 0;
+}
+
/**
* svc_tcp_sendto - Send out a reply on a TCP socket
* @rqstp: completed svc_rqst
@@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
struct xdr_buf *xdr = &rqstp->rq_res;
rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
(u32)xdr->len);
- struct msghdr msg = {
- .msg_flags = 0,
- };
unsigned int sent;
int err;
svc_tcp_release_rqst(rqstp);
+ xdr_alloc_bvec(xdr);
mutex_lock(&xprt->xpt_mutex);
if (svc_xprt_is_dead(xprt))
goto out_notconn;
- err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
+ err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
xdr_free_bvec(xdr);
trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
if (err < 0 || sent != (xdr->len + sizeof(marker)))
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again
2020-12-11 22:33 [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again Chuck Lever
@ 2020-12-12 19:16 ` Chuck Lever
0 siblings, 0 replies; 2+ messages in thread
From: Chuck Lever @ 2020-12-12 19:16 UTC (permalink / raw)
To: Linux NFS Mailing List
> On Dec 11, 2020, at 5:33 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
>
> Daire Byrne reports a ~50% aggregate throughput regression on his
> Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to
> use xprt_sock_sendmsg for socket sends"), which replaced
> kernel_sendpage() calls in NFSD's socket send path with calls to
> sock_sendmsg() using iov_iter.
>
> Investigation showed that tcp_sendmsg() was not using zero-copy to
> send the xdr_buf's bvec pages, but instead was relying on memcpy.
> This means copying every byte of a large NFS READ payload.
>
> It looks like TLS sockets do indeed support a ->sendpage method,
> so it's really not necessary to use xprt_sock_sendmsg() to support
> TLS fully on the server. A mechanical reversion of da1661b93bf4 is
> not possible at this point, but we can re-implement the server's
> TCP socket sendmsg path using kernel_sendpage().
>
> No Fixes: tag. If needed, please backport this fix by hand.
>
> Reported-by: Daire Byrne <daire@dneg.com>
> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> net/sunrpc/svcsock.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 88 insertions(+), 4 deletions(-)
>
>
> This replaces the SVC zero-copy send patch I posted a couple of
> weeks ago.
>
>
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index b248f2349437..30332111bd37 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> return 0; /* record not complete */
> }
>
> +/**
> + * svc_tcp_sendmsg - Send an RPC message on a TCP socket
> + * @sock: socket to write the RPC message onto
> + * @xdr: XDR buffer containing the RPC message
> + * @marker: TCP record marker
> + * @sentp: OUT: number of bytes actually written
> + *
> + * Caller serializes calls on this @sock, and ensures the pages
> + * backing @xdr are unchanging. In addition, it is assumed that
> + * no .bv_len is larger than PAGE_SIZE.
> + *
> + * Returns zero on success or a negative errno value.
> + */
> +static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr,
> + rpc_fraghdr marker, unsigned int *sentp)
> +{
> + struct kvec vec[2] = {
> + [0] = {
> + .iov_base = &marker,
> + .iov_len = sizeof(marker),
> + },
> + [1] = *xdr->head,
> + };
> + size_t len = vec[0].iov_len + vec[1].iov_len;
> + const struct kvec *tail = xdr->tail;
> + struct msghdr msg = {
> + .msg_flags = 0,
> + };
> + int ret;
> +
> + *sentp = 0;
> +
> + /*
> + * Optimized for the common case where we have just the record
> + * marker and xdr->head.
> + */
> + if (xdr->head[0].iov_len < xdr->len)
> + msg.msg_flags = MSG_MORE;
> + iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len);
> + ret = sock_sendmsg(sock, &msg);
> + if (ret < 0)
> + return ret;
> + *sentp += ret;
> + if (*sentp != len)
> + goto out;
> +
> + if (xdr->page_len) {
> + unsigned int offset, len, remaining;
> + struct bio_vec *bvec;
> + int flags, ret;
> +
> + bvec = xdr->bvec;
> + offset = xdr->page_base;
> + remaining = xdr->page_len;
> + flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
> + while (remaining > 0) {
> + if (remaining <= PAGE_SIZE && tail->iov_len == 0)
> + flags = 0;
> + len = min(remaining, bvec->bv_len);
> + ret = kernel_sendpage(sock, bvec->bv_page,
> + bvec->bv_offset + offset,
> + len, flags);
> + if (ret < 0)
> + return ret;
> + *sentp += ret;
> + if (ret != len)
> + goto out;
> + remaining -= len;
> + offset = 0;
> + bvec++;
> + }
> + }
> +
> + if (tail->iov_len) {
> + ret = kernel_sendpage(sock, virt_to_page(tail->iov_base),
> + offset_in_page(tail->iov_base),
> + tail->iov_len, 0);
> + if (ret < 0)
> + return ret;
> + *sentp += ret;
> + }
> +
> +out:
> + return 0;
> +}
> +
> /**
> * svc_tcp_sendto - Send out a reply on a TCP socket
> * @rqstp: completed svc_rqst
> @@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
> struct xdr_buf *xdr = &rqstp->rq_res;
> rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
> (u32)xdr->len);
> - struct msghdr msg = {
> - .msg_flags = 0,
> - };
> unsigned int sent;
> int err;
>
> svc_tcp_release_rqst(rqstp);
> + xdr_alloc_bvec(xdr);
This should be
+ xdr_alloc_bvec(xdr, GFP_KERNEL);
> mutex_lock(&xprt->xpt_mutex);
> if (svc_xprt_is_dead(xprt))
> goto out_notconn;
> - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
> + err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
> xdr_free_bvec(xdr);
> trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> if (err < 0 || sent != (xdr->len + sizeof(marker)))
>
>
--
Chuck Lever
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-12-12 19:18 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-11 22:33 [PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again Chuck Lever
2020-12-12 19:16 ` Chuck Lever
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.