* [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
       [not found] <20230316152618.711970-1-dhowells@redhat.com>
@ 2023-03-16 15:26 ` David Howells
  2023-03-16 16:17   ` Trond Myklebust
  2023-03-16 16:24   ` David Howells
  0 siblings, 2 replies; 14+ messages in thread
From: David Howells @ 2023-03-16 15:26 UTC (permalink / raw)
  To: Matthew Wilcox, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: David Howells, Al Viro, Christoph Hellwig, Jens Axboe,
	Jeff Layton, Christian Brauner, Linus Torvalds, netdev,
	linux-fsdevel, linux-kernel, linux-mm, Trond Myklebust,
	Anna Schumaker, Chuck Lever, linux-nfs

When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather than
performing several sendmsg and sendpage calls to transmit header, data
pages and trailer.

To make this work, the data is assembled in a bio_vec array and attached to
a BVEC-type iterator.  The bio_vec array has two extra slots before the
first for headers and one after the last for a trailer.  The headers and
trailer are copied into memory acquired from zcopy_alloc() which just
breaks a page up into small pieces that can be freed with put_page().

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna@kernel.org>
cc: Chuck Lever <chuck.lever@oracle.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-nfs@vger.kernel.org
cc: netdev@vger.kernel.org
---
 net/sunrpc/svcsock.c | 70 ++++++++++++--------------------------------
 net/sunrpc/xdr.c     | 24 ++++++++++++---
 2 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 03a4f5615086..1fa41ddbc40e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -36,6 +36,7 @@
 #include <linux/skbuff.h>
 #include <linux/file.h>
 #include <linux/freezer.h>
+#include <linux/zcopy_alloc.h>
 #include <net/sock.h>
 #include <net/checksum.h>
 #include <net/ip.h>
@@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	return 0;	/* record not complete */
 }
 
-static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
-			      int flags)
-{
-	return kernel_sendpage(sock, virt_to_page(vec->iov_base),
-			       offset_in_page(vec->iov_base),
-			       vec->iov_len, flags);
-}
-
 /*
- * kernel_sendpage() is used exclusively to reduce the number of
+ * MSG_SPLICE_PAGES is used exclusively to reduce the number of
  * copy operations in this path. Therefore the caller must ensure
  * that the pages backing @xdr are unchanging.
  *
@@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 {
 	const struct kvec *head = xdr->head;
 	const struct kvec *tail = xdr->tail;
-	struct kvec rm = {
-		.iov_base	= &marker,
-		.iov_len	= sizeof(marker),
-	};
 	struct msghdr msg = {
-		.msg_flags	= 0,
+		.msg_flags	= MSG_SPLICE_PAGES,
 	};
-	int ret;
+	int ret, n = xdr_buf_pagecount(xdr), size;
 
 	*sentp = 0;
 	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
 	if (ret < 0)
 		return ret;
 
-	ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len);
+	ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL);
 	if (ret < 0)
 		return ret;
-	*sentp += ret;
-	if (ret != rm.iov_len)
-		return -EAGAIN;
 
-	ret = svc_tcp_send_kvec(sock, head, 0);
+	ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL);
 	if (ret < 0)
 		return ret;
-	*sentp += ret;
-	if (ret != head->iov_len)
-		goto out;
 
-	if (xdr->page_len) {
-		unsigned int offset, len, remaining;
-		struct bio_vec *bvec;
-
-		bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
-		offset = offset_in_page(xdr->page_base);
-		remaining = xdr->page_len;
-		while (remaining > 0) {
-			len = min(remaining, bvec->bv_len - offset);
-			ret = kernel_sendpage(sock, bvec->bv_page,
-					      bvec->bv_offset + offset,
-					      len, 0);
-			if (ret < 0)
-				return ret;
-			*sentp += ret;
-			if (ret != len)
-				goto out;
-			remaining -= len;
-			offset = 0;
-			bvec++;
-		}
-	}
+	ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL);
+	if (ret < 0)
+		return ret;
 
-	if (tail->iov_len) {
-		ret = svc_tcp_send_kvec(sock, tail, 0);
-		if (ret < 0)
-			return ret;
-		*sentp += ret;
-	}
+	size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size);
 
-out:
+	ret = sock_sendmsg(sock, &msg);
+	if (ret < 0)
+		return ret;
+	if (ret > 0)
+		*sentp = ret;
+	if (ret != size)
+		return -EAGAIN;
 	return 0;
 }
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 36835b2f5446..6dff0b4f17b8 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
 {
 	size_t i, n = xdr_buf_pagecount(buf);
 
-	if (n != 0 && buf->bvec == NULL) {
-		buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
+	if (buf->bvec == NULL) {
+		/* Allow for two headers and a trailer to be attached */
+		buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp);
 		if (!buf->bvec)
 			return -ENOMEM;
+		buf->bvec += 2;
+		buf->bvec[-2].bv_page = NULL;
+		buf->bvec[-1].bv_page = NULL;
 		for (i = 0; i < n; i++) {
 			bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE,
 				      0);
 		}
+		buf->bvec[n].bv_page = NULL;
 	}
 	return 0;
 }
@@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
 void
 xdr_free_bvec(struct xdr_buf *buf)
 {
-	kfree(buf->bvec);
-	buf->bvec = NULL;
+	if (buf->bvec) {
+		size_t n = xdr_buf_pagecount(buf);
+
+		if (buf->bvec[-2].bv_page)
+			put_page(buf->bvec[-2].bv_page);
+		if (buf->bvec[-1].bv_page)
+			put_page(buf->bvec[-1].bv_page);
+		if (buf->bvec[n].bv_page)
+			put_page(buf->bvec[n].bv_page);
+		buf->bvec -= 2;
+		kfree(buf->bvec);
+		buf->bvec = NULL;
+	}
 }
 
 /**


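To make the proposed layout concrete, here is a sketch (not part of the
patch itself) of the bvec array as set up by the patched xdr_alloc_bvec()
and consumed by svc_tcp_sendmsg() above:

	/*
	 * buf->bvec points two slots into the allocation so that protocol
	 * bits can use negative indices while xdr->bvec[0..n-1] keeps its
	 * existing meaning:
	 *
	 *   bvec[-2]      TCP record marker (copied via zcopy_memdup())
	 *   bvec[-1]      xdr->head         (copied via zcopy_memdup())
	 *   bvec[0..n-1]  xdr->pages        (referenced, zero-copy)
	 *   bvec[n]       xdr->tail         (copied via zcopy_memdup())
	 *
	 * The whole reply then goes out in a single spliced sendmsg():
	 */
	size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size);
	ret = sock_sendmsg(sock, &msg);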

* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 15:26 ` [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
@ 2023-03-16 16:17   ` Trond Myklebust
  2023-03-16 17:10     ` Chuck Lever III
                       ` (2 more replies)
  2023-03-16 16:24   ` David Howells
  1 sibling, 3 replies; 14+ messages in thread
From: Trond Myklebust @ 2023-03-16 16:17 UTC (permalink / raw)
  To: David Howells
  Cc: Matthew Wilcox, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexander Viro, Christoph Hellwig, Jens Axboe,
	Jeffrey Layton, Christian Brauner, Linus Torvalds, netdev,
	linux-fsdevel, linux-kernel, linux-mm, Trond Myklebust,
	Anna Schumaker, Charles Edward Lever, linux-nfs



> On Mar 16, 2023, at 11:26, David Howells <dhowells@redhat.com> wrote:
> 
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
> 
> [...]
> 
> @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
> {
> size_t i, n = xdr_buf_pagecount(buf);
> 
> - if (n != 0 && buf->bvec == NULL) {
> - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
> + if (buf->bvec == NULL) {
> + /* Allow for two headers and a trailer to be attached */
> + buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp);
> if (!buf->bvec)
> return -ENOMEM;
> + buf->bvec += 2;
> + buf->bvec[-2].bv_page = NULL;
> + buf->bvec[-1].bv_page = NULL;

NACK.



* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 15:26 ` [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
  2023-03-16 16:17   ` Trond Myklebust
@ 2023-03-16 16:24   ` David Howells
  2023-03-16 18:06     ` David Howells
  1 sibling, 1 reply; 14+ messages in thread
From: David Howells @ 2023-03-16 16:24 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: dhowells, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexander Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs

Trond Myklebust <trondmy@hammerspace.com> wrote:

> > + buf->bvec += 2;
> > + buf->bvec[-2].bv_page = NULL;
> > + buf->bvec[-1].bv_page = NULL;
> 
> NACK.

Can you elaborate?

Is it that you dislike allocating extra slots for protocol bits?  Or just that
the bvec[] is offset by 2?  Or some other reason?

David



* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 16:17   ` Trond Myklebust
@ 2023-03-16 17:10     ` Chuck Lever III
  2023-03-16 17:28     ` David Howells
  2023-03-16 21:21     ` David Howells
  2 siblings, 0 replies; 14+ messages in thread
From: Chuck Lever III @ 2023-03-16 17:10 UTC (permalink / raw)
  To: David Howells
  Cc: Trond Myklebust, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Al Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Linux NFS Mailing List


Note: this is the first I've seen of this series -- not sure why
I never received any of these patches.

That means I haven't seen the cover letter and do not have any
context for this proposed change.


> On Mar 16, 2023, at 12:17 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
>> On Mar 16, 2023, at 11:26, David Howells <dhowells@redhat.com> wrote:
>> 
>> When transmitting data, call down into TCP using a single sendmsg with
>> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
>> performing several sendmsg and sendpage calls to transmit header, data
>> pages and trailer.

We've tried combining the sendpage calls in here before. It
results in a significant and measurable performance regression.
See:

da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends")

and its subsequent revert:

4a85a6a3320b ("SUNRPC: Handle TCP socket sends with kernel_sendpage() again")


Therefore, this kind of change needs to be accompanied by both
benchmark results and some field testing to convince me it won't
cause harm.

Also, I'd rather see struct xdr_buf changed to /replace/ the
head/pagevec/tail arrangement with bvecs before we do this
kind of overhaul.

And, we have to make certain that this doesn't break operation
with kTLS sockets... do they support MSG_SPLICE_PAGES ?



--
Chuck Lever




* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 16:17   ` Trond Myklebust
  2023-03-16 17:10     ` Chuck Lever III
@ 2023-03-16 17:28     ` David Howells
  2023-03-16 17:41       ` Chuck Lever III
  2023-03-16 21:21     ` David Howells
  2 siblings, 1 reply; 14+ messages in thread
From: David Howells @ 2023-03-16 17:28 UTC (permalink / raw)
  To: Chuck Lever III
  Cc: dhowells, Trond Myklebust, Matthew Wilcox, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Al Viro,
	Christoph Hellwig, Jens Axboe, Jeffrey Layton, Christian Brauner,
	Linus Torvalds, netdev, linux-fsdevel, linux-kernel, linux-mm,
	Anna Schumaker, Linux NFS Mailing List

Chuck Lever III <chuck.lever@oracle.com> wrote:

> That means I haven't seen the cover letter and do not have any
> context for this proposed change.

https://lore.kernel.org/linux-fsdevel/20230316152618.711970-1-dhowells@redhat.com/

> We've tried combining the sendpage calls in here before. It
> results in a significant and measurable performance regression.
> See:
> 
> da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends")

The commit replaced the use of sendpage with sendmsg, but that took away the
zerocopy aspect of sendpage.  The idea behind MSG_SPLICE_PAGES is that it
allows you to keep that.  I'll have to try reapplying this commit and
adding the MSG_SPLICE_PAGES flag.
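
For reference, a minimal sketch of the idea (MSG_SPLICE_PAGES is
introduced by this series and is not in mainline at this point; the page
is referenced by the socket rather than copied):

	struct bio_vec bv;
	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };
	int ret;

	bvec_set_page(&bv, page, len, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, len);
	ret = sock_sendmsg(sock, &msg);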

> Therefore, this kind of change needs to be accompanied by both
> benchmark results and some field testing to convince me it won't
> cause harm.

Yep.

> And, we have to make certain that this doesn't break operation
> with kTLS sockets... do they support MSG_SPLICE_PAGES ?

I haven't yet tackled AF_TLS, AF_KCM or AF_SMC as they seem significantly more
complex than TCP and UDP.  I thought I'd get some feedback on what I have
before I tried my hand at those.

David



* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 17:28     ` David Howells
@ 2023-03-16 17:41       ` Chuck Lever III
  0 siblings, 0 replies; 14+ messages in thread
From: Chuck Lever III @ 2023-03-16 17:41 UTC (permalink / raw)
  To: David Howells
  Cc: Trond Myklebust, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Al Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Linux NFS Mailing List



> On Mar 16, 2023, at 1:28 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Chuck Lever III <chuck.lever@oracle.com> wrote:
> 
>> That means I haven't seen the cover letter and do not have any
>> context for this proposed change.
> 
> https://lore.kernel.org/linux-fsdevel/20230316152618.711970-1-dhowells@redhat.com/
> 
>> We've tried combining the sendpages calls in here before. It
>> results in a significant and measurable performance regression.
>> See:
>> 
>> da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends")
> 
> The commit replaced the use of sendpage with sendmsg, but that took away the
> zerocopy aspect of sendpage.  The idea behind MSG_SPLICE_PAGES is that it
> allows you to keep that.  I'll have to try reapplying this commit and
> adding the MSG_SPLICE_PAGES flag.

Note that, as Trond pointed out, NFSD can handle an NFS READ
request with either a splice actor or by copying through a
vector, depending on what the underlying filesystem can
support and whether we are using a security flavor that
requires stable pages. Grep for RQ_SPLICE_OK.
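
Roughly, the dispatch looks like this (a sketch based on nfsd_read() in
fs/nfsd/vfs.c of this era; signatures abbreviated):

	/* Splice the page cache directly when the filesystem and
	 * security flavor allow it; otherwise copy through a vector. */
	if (file->f_op->splice_read &&
	    test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
		err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
	else
		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen,
				 count, eof);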

Eventually we want to make use of iomaps to ensure that
reading areas of a file that are not allocated on disk
does not trigger an extent allocation. Anna is working on
that, but I have no idea what it will look like. We can
talk more at LSF, if you'll both be around.

Also... I find I have to put back the use of MSG_MORE and
friends in here, otherwise kTLS will split each of these
kernel_sendsomething() calls into its own TLS record. This
code is likely going to look different after support for
RPC-with-TLS goes in.
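
To illustrate the pattern (a sketch only; send_fragments() is a
hypothetical helper, and short-send handling is omitted): keeping
MSG_MORE set on every fragment but the last lets a kTLS socket coalesce
the pieces into a single TLS record.

	static int send_fragments(struct socket *sock, struct kvec *vec,
				  int n)
	{
		struct msghdr msg = { };
		int i, ret;

		for (i = 0; i < n; i++) {
			/* Hint that more data follows, except at the end */
			msg.msg_flags = (i < n - 1) ? MSG_MORE : 0;
			ret = kernel_sendmsg(sock, &msg, &vec[i], 1,
					     vec[i].iov_len);
			if (ret < 0)
				return ret;
		}
		return 0;
	}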


>> Therefore, this kind of change needs to be accompanied by both
>> benchmark results and some field testing to convince me it won't
>> cause harm.
> 
> Yep.
> 
>> And, we have to make certain that this doesn't break operation
>> with kTLS sockets... do they support MSG_SPLICE_PAGES ?
> 
> I haven't yet tackled AF_TLS, AF_KCM or AF_SMC as they seem significantly more
> complex than TCP and UDP.  I thought I'd get some feedback on what I have
> before I tried my hand at those.

OK, I didn't mean AF_TLS, I meant the stuff under net/tls,
which is AF_INET[6] and TCP, but with a ULP in place. It's
got its own sendpage and sendmsg methods that choke when
an unrecognized MSG_ flag is present.

But OK, you're just asking for feedback, so I'll put my red
pencil down.


--
Chuck Lever




* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 16:24   ` David Howells
@ 2023-03-16 18:06     ` David Howells
  2023-03-16 19:01       ` Trond Myklebust
                         ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: David Howells @ 2023-03-16 18:06 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: dhowells, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexander Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs

Trond Myklebust <trondmy@hammerspace.com> wrote:

> 1) This is code that is common to the client and the server. Why are we
> adding 3 unused bvec slots to every client RPC call?

Fair point, but I'm trying to avoid making four+ sendmsg calls in nfsd rather
than one.

> 2) It obfuscates the existence of these bvec slots.

True, it'd be nice to find a better way to do it.  Question is, can the client
make use of MSG_SPLICE_PAGES also?

> 3) knfsd may use splice_direct_to_actor() in order to avoid copying the page
> cache data into private buffers (it just takes a reference to the
> pages). Using MSG_SPLICE_PAGES will presumably require it to protect those
> pages against further writes while the socket is referencing them.

Upstream sunrpc is using sendpage with TCP.  It already has that issue.
MSG_SPLICE_PAGES is a way of doing sendpage through sendmsg.

David



* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 18:06     ` David Howells
@ 2023-03-16 19:01       ` Trond Myklebust
  2023-03-22 13:10       ` David Howells
  2023-03-22 18:15       ` [RFC PATCH] iov_iter: Add an iterator-of-iterators David Howells
  2 siblings, 0 replies; 14+ messages in thread
From: Trond Myklebust @ 2023-03-16 19:01 UTC (permalink / raw)
  To: David Howells
  Cc: Matthew Wilcox, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexander Viro, Christoph Hellwig, Jens Axboe,
	Jeffrey Layton, Christian Brauner, Linus Torvalds, netdev,
	linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs



> On Mar 16, 2023, at 14:06, David Howells <dhowells@redhat.com> wrote:
> 
> Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
>> 1) This is code that is common to the client and the server. Why are we
>> adding 3 unused bvec slots to every client RPC call?
> 
> Fair point, but I'm trying to avoid making four+ sendmsg calls in nfsd rather
> than one.

Add an enum iter_type for ITER_ITER ? :-)

Otherwise, please just split these functions into one for knfsd and a separate one for the client.

> 
>> 2) It obfuscates the existence of these bvec slots.
> 
> True, it'd be nice to find a better way to do it.  Question is, can the client
> make use of MSG_SPLICE_PAGES also?

The requirement for O_DIRECT support means we get the stable write issues with added extra spicy sauce.

> 
>> 3) knfsd may use splice_direct_to_actor() in order to avoid copying the page
>> cache data into private buffers (it just takes a reference to the
>> pages). Using MSG_SPLICE_PAGES will presumably require it to protect those
>> pages against further writes while the socket is referencing them.
> 
> Upstream sunrpc is using sendpage with TCP.  It already has that issue.
> MSG_SPLICE_PAGES is a way of doing sendpage through sendmsg.

Fair enough. I do seem to remember a schism with the knfsd developers over that issue.

_________________________________
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 16:17   ` Trond Myklebust
  2023-03-16 17:10     ` Chuck Lever III
  2023-03-16 17:28     ` David Howells
@ 2023-03-16 21:21     ` David Howells
  2023-03-17 15:29       ` Chuck Lever III
  2 siblings, 1 reply; 14+ messages in thread
From: David Howells @ 2023-03-16 21:21 UTC (permalink / raw)
  To: Chuck Lever III
  Cc: dhowells, Trond Myklebust, Matthew Wilcox, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Al Viro,
	Christoph Hellwig, Jens Axboe, Jeffrey Layton, Christian Brauner,
	Linus Torvalds, netdev, linux-fsdevel, linux-kernel, linux-mm,
	Anna Schumaker, Linux NFS Mailing List

Chuck Lever III <chuck.lever@oracle.com> wrote:

> Therefore, this kind of change needs to be accompanied by both
> benchmark results and some field testing to convince me it won't
> cause harm.

Btw, what do you use to benchmark NFS performance?

David



* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 21:21     ` David Howells
@ 2023-03-17 15:29       ` Chuck Lever III
  0 siblings, 0 replies; 14+ messages in thread
From: Chuck Lever III @ 2023-03-17 15:29 UTC (permalink / raw)
  To: David Howells
  Cc: Trond Myklebust, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Al Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Linux NFS Mailing List


> On Mar 16, 2023, at 5:21 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Chuck Lever III <chuck.lever@oracle.com> wrote:
> 
>> Therefore, this kind of change needs to be accompanied by both
>> benchmark results and some field testing to convince me it won't
>> cause harm.
> 
> Btw, what do you use to benchmark NFS performance?

It depends on what I'm trying to observe. I have only a small
handful of systems in my lab, which is why I was not able to
immediately detect the effects of the zero-copy change in my
lab. Daire has a large client cohort on a fast network, so is
able to see the impact of that kind of change quite readily.

A perhaps more interesting question is what kind of tooling
would I use to measure the performance of the proposed change.

The bottom line is whether or not applications on clients can
see a change. NFS client implementations can hide server and
network latency improvements from applications, and RPC-on-TCP
adds palpable latency as well that reduces the efficacy of
server performance optimizations.

For that I might use a multi-threaded fio workload with fixed
record sizes (2KB, 8KB, 128KB, 1MB) and then look at the
throughput numbers and latency distribution for each size.

In a single thread qd=1 test, iozone can show changes in
READ latency pretty clearly, though most folks believe qd=1
tests are junk.

I generally run such tests on 100GbE with a tmpfs or NVMe
export to take filesystem latencies out of the equation,
although that might matter more for WRITE latency if you
can keep your READ workload completely in server memory.

To measure server-side behavior without the effects of the
network or client, NFSD has a built-in trace point,
nfsd:svc_stats_latency, that records the latency in
microseconds of each RPC. Run the above workloads and
record this tracepoint (perhaps with a capture filter to
record only the latency of READ operations).

Then you can post-process the raw latencies to get an average
latency and deviation, or even look at latency distribution
to see if the shape of the outlier curve has changed. I use
awk for this.

[ Sidebar: you can use this tracepoint to track latency
outliers too, but that's another topic. ]

Second, I might try a flame graph study to measure changes in
instruction path length, and also capture an average cycles-
per-byte-read value. Looking at CPU cache misses can often be
a rathole, but flame graphs can surface changes there too.

And lastly, you might want to visit lock_stats to see if
there is any significant change in lock contention. An
unexpected increase in lock contention can often rob
positive changes made in other areas.


My guess is that for the RQ_SPLICE_OK case, the difference
would amount to the elimination of the kernel_sendpage
calls, which are indirect, but not terribly expensive.
Those calls amount to a significant cost only on large I/O.
It might not amount to much relative to the other costs
in the READ path.

So the real purpose here would have to be refactoring to
use bvecs instead of the bespoke xdr_buf structure, and I
would like to see support for bvecs in all of our transports
(looking at you, RDMA) to make this truly worthwhile. I had
started this a while back, but lack of a bvec-based RDMA API
made it less interesting to me. It isn't clear to me yet
whether bvecs or folios should be the replacement for
xdr_buf's head/pages/tail, but I'm a paid-in-full member of
the uneducated rabble.

This might sound like a lot of pushback, but actually I am
open to discussing clean-ups in this area, including the
one you proposed. Just getting a little more careful about
this kind of change as time goes on. And it sounds like you
were already aware of the most recent previous attempt at
this kind of improvement.


--
Chuck Lever




* Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-03-16 18:06     ` David Howells
  2023-03-16 19:01       ` Trond Myklebust
@ 2023-03-22 13:10       ` David Howells
  2023-03-22 18:15       ` [RFC PATCH] iov_iter: Add an iterator-of-iterators David Howells
  2 siblings, 0 replies; 14+ messages in thread
From: David Howells @ 2023-03-22 13:10 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: dhowells, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexander Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs

Trond Myklebust <trondmy@hammerspace.com> wrote:

> Add an enum iter_type for ITER_ITER ? :-)

Actually, that might not be such a bad idea, now that I've pondered on it some
more.  Give it an array of iterators and add a flag to each iterator to say if
it can be spliced from or not.

Once ITER_PIPE is killed off, advancing and reverting over it should be pretty
straightforward - though each iterator would also need to keep track of how
big it started off as in order that it can be reverted over.
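
In struct terms, that bookkeeping might look like the following sketch
(the RFC that follows adds exactly this as orig_count):

	struct iov_iter {
		/* ... */
		size_t count;		/* bytes remaining */
		size_t orig_count;	/* size at initialisation, so a
					 * revert can walk back over it */
		/* ... */
	};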

David



* [RFC PATCH] iov_iter: Add an iterator-of-iterators
  2023-03-16 18:06     ` David Howells
  2023-03-16 19:01       ` Trond Myklebust
  2023-03-22 13:10       ` David Howells
@ 2023-03-22 18:15       ` David Howells
  2023-03-22 18:47         ` Trond Myklebust
  2023-03-22 18:49         ` Matthew Wilcox
  2 siblings, 2 replies; 14+ messages in thread
From: David Howells @ 2023-03-22 18:15 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: dhowells, Matthew Wilcox, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexander Viro, Christoph Hellwig,
	Jens Axboe, Jeffrey Layton, Christian Brauner, Linus Torvalds,
	netdev, linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs

Trond Myklebust <trondmy@hammerspace.com> wrote:

> Add an enum iter_type for ITER_ITER ? :-)

Well, you asked for it...  It's actually fairly straightforward once
ITER_PIPE is removed.

---
iov_iter: Add an iterator-of-iterators

Provide an I/O iterator that takes an array of iterators and iterates over
them in turn.  Then make the sunrpc service code (and thus nfsd) use it.

In this particular instance, the svc_tcp_sendmsg() sets up an array of
three iterators: one for the marker+header, one for the body and one
optional one for the tail, then sets msg_iter to be an
iterator-of-iterators across them.

Signed-off-by: David Howells <dhowells@redhat.com>
---    
 include/linux/uio.h  |   19 +++-
 lib/iov_iter.c       |  233 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/sunrpc/svcsock.c |   29 +++---
 3 files changed, 258 insertions(+), 23 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 74598426edb4..321381d3d616 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -27,6 +27,7 @@ enum iter_type {
 	ITER_XARRAY,
 	ITER_DISCARD,
 	ITER_UBUF,
+	ITER_ITERLIST,
 };
 
 #define ITER_SOURCE	1	// == WRITE
@@ -43,17 +44,17 @@ struct iov_iter {
 	bool nofault;
 	bool data_source;
 	bool user_backed;
-	union {
-		size_t iov_offset;
-		int last_offset;
-	};
+	bool spliceable;
+	size_t iov_offset;
 	size_t count;
+	size_t orig_count;
 	union {
 		const struct iovec *iov;
 		const struct kvec *kvec;
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
 		void __user *ubuf;
+		struct iov_iter *iterlist;
 	};
 	union {
 		unsigned long nr_segs;
@@ -104,6 +105,11 @@ static inline bool iov_iter_is_xarray(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_XARRAY;
 }
 
+static inline bool iov_iter_is_iterlist(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_ITERLIST;
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
 	return i->data_source ? WRITE : READ;
@@ -238,6 +244,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
+void iov_iter_iterlist(struct iov_iter *i, unsigned int direction, struct iov_iter *iterlist,
+		       unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 		size_t maxsize, unsigned maxpages, size_t *start,
 		iov_iter_extraction_t extraction_flags);
@@ -345,7 +353,8 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 /* Flags for iov_iter_get/extract_pages*() */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index fad95e4cf372..34ce3b958b6c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -282,7 +282,8 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 		.iov = iov,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 EXPORT_SYMBOL(iov_iter_init);
@@ -364,6 +365,26 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = _copy_from_iter(addr, part, i->iterlist);
+			addr += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
 	if (user_backed_iter(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
@@ -380,6 +401,27 @@ size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = _copy_from_iter_nocache(addr, part,
+							    i->iterlist);
+			addr += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
 	iterate_and_advance(i, bytes, base, len, off,
 		__copy_from_user_inatomic_nocache(addr + off, base, len),
 		memcpy(addr + off, base, len)
@@ -411,6 +453,27 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = _copy_from_iter_flushcache(addr, part,
+							       i->iterlist);
+			addr += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
 	iterate_and_advance(i, bytes, base, len, off,
 		__copy_from_user_flushcache(addr + off, base, len),
 		memcpy_flushcache(addr + off, base, len)
@@ -514,7 +577,31 @@ EXPORT_SYMBOL(iov_iter_zero);
 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
 				  struct iov_iter *i)
 {
-	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+	char *kaddr, *p;
+
+	if (unlikely(iov_iter_is_iterlist(i))) {
+		size_t copied = 0;
+
+		while (bytes && i->count) {
+			size_t part = min(bytes, i->iterlist->count), n = 0;
+
+			if (part > 0)
+				n = copy_page_from_iter_atomic(page, offset, part,
+							       i->iterlist);
+			offset += n;
+			copied += n;
+			bytes -= n;
+			i->count -= n;
+			if (n < part || !bytes)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		return copied;
+	}
+
+	kaddr = kmap_atomic(page);
+	p = kaddr + offset;
 	if (!page_copy_sane(page, offset, bytes)) {
 		kunmap_atomic(kaddr);
 		return 0;
@@ -585,19 +672,49 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		iov_iter_bvec_advance(i, size);
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
+	} else if (iov_iter_is_iterlist(i)) {
+		i->count -= size;
+		for (;;) {
+			size_t part = min(size, i->iterlist->count);
+
+			if (part > 0)
+				iov_iter_advance(i->iterlist, part);
+			size -= part;
+			if (!size)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
+static void iov_iter_revert_iterlist(struct iov_iter *i, size_t unroll)
+{
+	for (;;) {
+		size_t part = min(unroll, i->iterlist->orig_count - i->iterlist->count);
+
+		if (part > 0)
+			iov_iter_revert(i->iterlist, part);
+		unroll -= part;
+		if (!unroll)
+			break;
+		i->iterlist--;
+		i->nr_segs++;
+	}
+}
+
 void iov_iter_revert(struct iov_iter *i, size_t unroll)
 {
 	if (!unroll)
 		return;
-	if (WARN_ON(unroll > MAX_RW_COUNT))
+	if (WARN_ON(unroll > i->orig_count - i->count))
 		return;
 	i->count += unroll;
 	if (unlikely(iov_iter_is_discard(i)))
 		return;
+	if (unlikely(iov_iter_is_iterlist(i)))
+		return iov_iter_revert_iterlist(i, unroll);
 	if (unroll <= i->iov_offset) {
 		i->iov_offset -= unroll;
 		return;
@@ -641,6 +758,8 @@ EXPORT_SYMBOL(iov_iter_revert);
  */
 size_t iov_iter_single_seg_count(const struct iov_iter *i)
 {
+	if (iov_iter_is_iterlist(i))
+		i = i->iterlist;
 	if (i->nr_segs > 1) {
 		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 			return min(i->count, i->iov->iov_len - i->iov_offset);
@@ -662,7 +781,8 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
 		.kvec = kvec,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 EXPORT_SYMBOL(iov_iter_kvec);
@@ -678,7 +798,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 		.bvec = bvec,
 		.nr_segs = nr_segs,
 		.iov_offset = 0,
-		.count = count
+		.count = count,
+		.orig_count = count,
 	};
 }
 EXPORT_SYMBOL(iov_iter_bvec);
@@ -706,6 +827,7 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
 		.xarray = xarray,
 		.xarray_start = start,
 		.count = count,
+		.orig_count = count,
 		.iov_offset = 0
 	};
 }
@@ -727,11 +849,47 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 		.iter_type = ITER_DISCARD,
 		.data_source = false,
 		.count = count,
+		.orig_count = count,
 		.iov_offset = 0
 	};
 }
 EXPORT_SYMBOL(iov_iter_discard);
 
+/**
+ * iov_iter_iterlist - Initialise an I/O iterator that is a list of iterators
+ * @iter: The iterator to initialise.
+ * @direction: The direction of the transfer.
+ * @iterlist: The list of iterators
+ * @nr_segs: The number of elements in the list
+ * @count: The size of the I/O buffer in bytes.
+ *
+ * Set up an I/O iterator that iterates over an array of subordinate
+ * iterators in turn.  It's only available as a source iterator (for WRITE);
+ * the iterators in the list must all be sources and none may be ITER_ITERLIST.
+ */
+void iov_iter_iterlist(struct iov_iter *iter, unsigned int direction,
+		       struct iov_iter *iterlist, unsigned long nr_segs,
+		       size_t count)
+{
+	unsigned long i;
+
+	BUG_ON(direction != WRITE);
+	for (i = 0; i < nr_segs; i++) {
+		BUG_ON(iterlist[i].iter_type == ITER_ITERLIST);
+		BUG_ON(!iterlist[i].data_source);
+	}
+
+	*iter = (struct iov_iter){
+		.iter_type	= ITER_ITERLIST,
+		.data_source	= true,
+		.count		= count,
+		.orig_count	= count,
+		.iterlist	= iterlist,
+		.nr_segs	= nr_segs,
+	};
+}
+EXPORT_SYMBOL(iov_iter_iterlist);
+
 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 				   unsigned len_mask)
 {
@@ -879,6 +1037,15 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 	if (iov_iter_is_xarray(i))
 		return (i->xarray_start + i->iov_offset) | i->count;
 
+	if (iov_iter_is_iterlist(i)) {
+		unsigned long align = 0;
+		unsigned int j;
+
+		for (j = 0; j < i->nr_segs; j++)
+			align |= iov_iter_alignment(&i->iterlist[j]);
+		return align;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_alignment);
@@ -1078,6 +1245,18 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 	}
 	if (iov_iter_is_xarray(i))
 		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
+	if (iov_iter_is_iterlist(i)) {
+		ssize_t size;
+
+		while (!i->iterlist->count) {
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		size = __iov_iter_get_pages_alloc(i->iterlist, pages, maxsize, maxpages,
+						  start, extraction_flags);
+		i->count -= size;
+		return size;
+	}
 	return -EFAULT;
 }
 
@@ -1126,6 +1305,31 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
+static size_t csum_and_copy_from_iterlist(void *addr, size_t bytes, __wsum *csum,
+					  struct iov_iter *i)
+{
+	size_t copied = 0, n;
+
+	while (i->count && i->nr_segs) {
+		struct iov_iter *j = i->iterlist;
+
+		if (j->count == 0) {
+			i->iterlist++;
+			i->nr_segs--;
+			continue;
+		}
+
+		n = csum_and_copy_from_iter(addr, bytes - copied, csum, j);
+		addr += n;
+		copied += n;
+		i->count -= n;
+		if (n == 0)
+			break;
+	}
+
+	return copied;
+}
+
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 			       struct iov_iter *i)
 {
@@ -1133,6 +1337,8 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 	sum = *csum;
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
+	if (iov_iter_is_iterlist(i))
+		return csum_and_copy_from_iterlist(addr, bytes, csum, i);
 
 	iterate_and_advance(i, bytes, base, len, off, ({
 		next = csum_and_copy_from_user(base, addr + off, len);
@@ -1236,6 +1442,21 @@ static int bvec_npages(const struct iov_iter *i, int maxpages)
 	return npages;
 }
 
+static int iterlist_npages(const struct iov_iter *i, int maxpages)
+{
+	ssize_t size = i->count;
+	const struct iov_iter *p;
+	int npages = 0;
+
+	for (p = i->iterlist; size; p++) {
+		size -= p->count;
+		npages += iov_iter_npages(p, maxpages - npages);
+		if (unlikely(npages >= maxpages))
+			return maxpages;
+	}
+	return npages;
+}
+
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	if (unlikely(!i->count))
@@ -1255,6 +1476,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
 		return min(npages, maxpages);
 	}
+	if (iov_iter_is_iterlist(i))
+		return iterlist_npages(i, maxpages);
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_npages);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1d0f0f764e16..030a1fa5171b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1073,11 +1073,13 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 {
 	const struct kvec *head = xdr->head;
 	const struct kvec *tail = xdr->tail;
+	struct iov_iter iters[3];
+	struct bio_vec head_bv, tail_bv;
 	struct msghdr msg = {
-		.msg_flags	= MSG_SPLICE_PAGES,
+		.msg_flags	= 0, //MSG_SPLICE_PAGES,
 	};
-	void *m, *h, *t;
-	int ret, n = xdr_buf_pagecount(xdr), size;
+	void *m, *t;
+	int ret, n = 2, size;
 
 	*sentp = 0;
 	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
@@ -1089,27 +1091,28 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 	if (!m)
 		return -ENOMEM;
 
-	h = m + sizeof(marker);
-	t = h + head->iov_len;
+	memcpy(m, &marker, sizeof(marker));
+	if (head->iov_len)
+		memcpy(m + sizeof(marker), head->iov_base, head->iov_len);
+	bvec_set_virt(&head_bv, m, sizeof(marker) + head->iov_len);
+	iov_iter_bvec(&iters[0], ITER_SOURCE, &head_bv, 1,
+		      sizeof(marker) + head->iov_len);
 
-	bvec_set_virt(&xdr->bvec[-1], m, sizeof(marker) + head->iov_len);
-	n++;
+	iov_iter_bvec(&iters[1], ITER_SOURCE, xdr->bvec,
+		      xdr_buf_pagecount(xdr), xdr->page_len);
 
 	if (tail->iov_len) {
 		t = page_frag_alloc(NULL, tail->iov_len, GFP_KERNEL);
 		if (!t)
 			return -ENOMEM;
-		bvec_set_virt(&xdr->bvec[n],  t, tail->iov_len);
 		memcpy(t, tail->iov_base, tail->iov_len);
+		bvec_set_virt(&tail_bv,  t, tail->iov_len);
+		iov_iter_bvec(&iters[2], ITER_SOURCE, &tail_bv, 1, tail->iov_len);
 		n++;
 	}
 
-	memcpy(m, &marker, sizeof(marker));
-	if (head->iov_len)
-		memcpy(h, head->iov_base, head->iov_len);
-
 	size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
-	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 1, n, size);
+	iov_iter_iterlist(&msg.msg_iter, ITER_SOURCE, iters, n, size);
 
 	ret = sock_sendmsg(sock, &msg);
 	if (ret < 0)



* Re: [RFC PATCH] iov_iter: Add an iterator-of-iterators
  2023-03-22 18:15       ` [RFC PATCH] iov_iter: Add an iterator-of-iterators David Howells
@ 2023-03-22 18:47         ` Trond Myklebust
  2023-03-22 18:49         ` Matthew Wilcox
  1 sibling, 0 replies; 14+ messages in thread
From: Trond Myklebust @ 2023-03-22 18:47 UTC (permalink / raw)
  To: David Howells
  Cc: Matthew Wilcox, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexander Viro, Christoph Hellwig, Jens Axboe,
	Jeffrey Layton, Christian Brauner, Linus Torvalds, netdev,
	linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs



> On Mar 22, 2023, at 14:15, David Howells <dhowells@redhat.com> wrote:
> 
> Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
>> Add an enum iter_type for ITER_ITER ? :-)
> 
> Well, you asked for it...  It's actually fairly straightforward once
> ITER_PIPE is removed.
> 
> ---
> iov_iter: Add an iterator-of-iterators
> 
> Provide an I/O iterator that takes an array of iterators and iterates over
> them in turn.  Then make the sunrpc service code (and thus nfsd) use it.
> 
> In this particular instance, the svc_tcp_sendmsg() sets up an array of
> three iterators: once for the marker+header, one for the body and one
> optional one for the tail, then sets msg_iter to be an
> iterator-of-iterators across them.

Cool! This is something that can be used on the receive side as well, so very useful. I can imagine it might also open up a few more use cases for ITER_XARRAY.

Thanks!
  Trond

_________________________________
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



* Re: [RFC PATCH] iov_iter: Add an iterator-of-iterators
  2023-03-22 18:15       ` [RFC PATCH] iov_iter: Add an iterator-of-iterators David Howells
  2023-03-22 18:47         ` Trond Myklebust
@ 2023-03-22 18:49         ` Matthew Wilcox
  1 sibling, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2023-03-22 18:49 UTC (permalink / raw)
  To: David Howells
  Cc: Trond Myklebust, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexander Viro, Christoph Hellwig, Jens Axboe,
	Jeffrey Layton, Christian Brauner, Linus Torvalds, netdev,
	linux-fsdevel, linux-kernel, linux-mm, Anna Schumaker,
	Charles Edward Lever, linux-nfs

On Wed, Mar 22, 2023 at 06:15:45PM +0000, David Howells wrote:
> @@ -43,17 +44,17 @@ struct iov_iter {
>  	bool nofault;
>  	bool data_source;
>  	bool user_backed;
> -	union {
> -		size_t iov_offset;
> -		int last_offset;
> -	};
> +	bool spliceable;

We're now up to five u8s in a row here (iter_type, nofault, data_source,
user_backed).  Is it time to turn some/all of them into:

	bool nofault:1;
	bool data_source:1;
	bool user_backed:1;
	bool spliceable:1;

You can't take the address of them then, but I don't believe we do that
anywhere.


