All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Ahern <dsahern@kernel.org>
To: Pavel Begunkov <asml.silence@gmail.com>
Cc: io-uring@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	"David S . Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>,
	Jonathan Lemon <jonathan.lemon@gmail.com>,
	Willem de Bruijn <willemb@google.com>,
	Jens Axboe <axboe@kernel.dk>,
	kernel-team@fb.com
Subject: Re: [RFC net-next v3 05/29] net: bvec specific path in zerocopy_sg_from_iter
Date: Tue, 28 Jun 2022 16:52:04 -0600	[thread overview]
Message-ID: <20220628225204.GA27554@u2004-local> (raw)
In-Reply-To: <5143111391e771dc97237e2a5e6a74223ef8f15f.1653992701.git.asml.silence@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 570 bytes --]

On Tue, Jun 28, 2022 at 07:56:27PM +0100, Pavel Begunkov wrote:
> Add an bvec specialised and optimised path in zerocopy_sg_from_iter.
> It'll be used later for {get,put}_page() optimisations.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  net/core/datagram.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
> 

Rather than propagating iter functions, I have been using the attached
patch for a few months now. It leverages your ubuf_info in msghdr to
allow in-kernel users to pass in their own iter handler.

[-- Attachment #2: 0001-net-Allow-custom-iter-handler-in-uarg.patch --]
[-- Type: text/x-diff, Size: 6667 bytes --]

From 1101177acb64832df2bb2b44d9305a8ebc4ca648 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Tue, 19 Apr 2022 10:39:59 -0600
Subject: [PATCH] net: Allow custom iter handler in uarg

Add support for custom iov_iter handling to ubuf. The idea is that
in-kernel subsystems want control over how an SG is split.

The custom iterator shares a union with mmpin to keep the size of
ubuf_info <= sizeof(skb->cb), which is 48B.

Signed-off-by: David Ahern <dsahern@kernel.org>
---
 include/linux/skbuff.h | 21 ++++++++++++++++-----
 net/core/datagram.c    | 11 ++++++++---
 net/core/datagram.h    |  3 ++-
 net/core/skbuff.c      | 19 +++++++++++++++----
 net/ipv4/ip_output.c   |  2 +-
 net/ipv6/ip6_output.c  |  2 +-
 6 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dbf820a50a39..71161f65dedd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -482,11 +482,21 @@ struct ubuf_info {
 	};
 	refcount_t refcnt;
 	u8 flags;
+	u8 has_sg_from_iter;
 
-	struct mmpin {
-		struct user_struct *user;
-		unsigned int num_pg;
-	} mmp;
+	/* sg_from_iter is expected to be used with ubuf in
+	 * msghdr and is only referenced at the transport
+	 * layer segmenting an iov into packets. mmpin is used
+	 * by in-tree ubuf_info {re,}alloc at L3 layer.
+	 */
+	union {
+		int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
+				    struct iov_iter *from, size_t length);
+		struct mmpin {
+			struct user_struct *user;
+			unsigned int num_pg;
+		} mmp;
+	};
 };
 
 #define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
@@ -503,7 +513,8 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 			   bool success);
 
-int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len,
+			    struct ubuf_info *uarg);
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 15ab9ffb27fe..9ca61a0a400d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -617,10 +617,15 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 EXPORT_SYMBOL(skb_copy_datagram_from_iter);
 
 int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
-			    struct iov_iter *from, size_t length)
+			    struct iov_iter *from, size_t length,
+			    struct ubuf_info *uarg)
 {
-	int frag = skb_shinfo(skb)->nr_frags;
+	int frag;
 
+	if (unlikely(uarg && uarg->has_sg_from_iter))
+		return uarg->sg_from_iter(sk, skb, from, length);
+
+	frag = skb_shinfo(skb)->nr_frags;
 	while (length && iov_iter_count(from)) {
 		struct page *pages[MAX_SKB_FRAGS];
 		struct page *last_head = NULL;
@@ -704,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 		return -EFAULT;
 
-	return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+	return __zerocopy_sg_from_iter(NULL, skb, from, ~0U, NULL);
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
diff --git a/net/core/datagram.h b/net/core/datagram.h
index bcfb75bfa3b2..65027fcf3322 100644
--- a/net/core/datagram.h
+++ b/net/core/datagram.h
@@ -10,6 +10,7 @@ struct sk_buff;
 struct iov_iter;
 
 int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
-			    struct iov_iter *from, size_t length);
+			    struct iov_iter *from, size_t length,
+			    struct ubuf_info *uarg);
 
 #endif /* _NET_CORE_DATAGRAM_H_ */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 17b93177a68f..9acb43e5a779 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1158,6 +1158,7 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 
 	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
 	uarg = (void *)skb->cb;
+	uarg->has_sg_from_iter = 0;
 	uarg->mmp.user = NULL;
 
 	if (mm_account_pinned_pages(&uarg->mmp, size)) {
@@ -1206,6 +1207,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 			return NULL;
 		}
 
+		if (WARN_ON(uarg->has_sg_from_iter)) {
+			uarg->has_sg_from_iter = 0;
+			uarg->mmp.user = NULL;
+			uarg->mmp.num_pg = 0;
+		}
+
 		next = (u32)atomic_read(&sk->sk_zckey);
 		if ((u32)(uarg->id + uarg->len) == next) {
 			if (mm_account_pinned_pages(&uarg->mmp, size))
@@ -1258,7 +1265,10 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg)
 	u32 lo, hi;
 	u16 len;
 
-	mm_unaccount_pinned_pages(&uarg->mmp);
+
+	WARN_ON(uarg->has_sg_from_iter);
+	if (!uarg->has_sg_from_iter)
+		mm_unaccount_pinned_pages(&uarg->mmp);
 
 	/* if !len, there was only 1 call, and it was aborted
 	 * so do not queue a completion notification
@@ -1319,9 +1329,10 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 }
 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
 
-int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len,
+			    struct ubuf_info *uarg)
 {
-	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len, uarg);
 }
 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
 
@@ -1339,7 +1350,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 	if (orig_uarg && uarg != orig_uarg)
 		return -EEXIST;
 
-	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len, uarg);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 		struct sock *save_sk = skb->sk;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1b6a64b19c76..1ff403c2dcb0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1238,7 +1238,7 @@ static int __ip_append_data(struct sock *sk,
 			skb->truesize += copy;
 			wmem_alloc_delta += copy;
 		} else {
-			err = skb_zerocopy_iter_dgram(skb, from, copy);
+			err = skb_zerocopy_iter_dgram(skb, from, copy, uarg);
 			if (err < 0)
 				goto error;
 		}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 63a217128f8b..6795144653ac 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1791,7 +1791,7 @@ static int __ip6_append_data(struct sock *sk,
 			skb->truesize += copy;
 			wmem_alloc_delta += copy;
 		} else {
-			err = skb_zerocopy_iter_dgram(skb, from, copy);
+			err = skb_zerocopy_iter_dgram(skb, from, copy, uarg);
 			if (err < 0)
 				goto error;
 		}
-- 
2.25.1


  parent reply	other threads:[~2022-06-28 22:52 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-28 18:56 [RFC net-next v3 00/29] io_uring zerocopy send Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 01/29] ipv4: avoid partial copy for zc Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 02/29] ipv6: " Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 03/29] skbuff: add SKBFL_DONT_ORPHAN flag Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 04/29] skbuff: carry external ubuf_info in msghdr Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 05/29] net: bvec specific path in zerocopy_sg_from_iter Pavel Begunkov
2022-06-28 20:06   ` Al Viro
2022-06-28 21:33     ` Pavel Begunkov
2022-06-28 22:52   ` David Ahern [this message]
2022-07-04 13:31     ` Pavel Begunkov
2022-07-05  2:28       ` David Ahern
2022-07-05 14:03         ` Pavel Begunkov
2022-07-05 22:09           ` Pavel Begunkov
2022-07-06 15:11             ` David Ahern
2022-06-28 18:56 ` [RFC net-next v3 06/29] net: optimise bvec-based zc page referencing Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 07/29] net: don't track pfmemalloc for managed frags Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 08/29] skbuff: don't mix ubuf_info of different types Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 09/29] ipv4/udp: support zc with managed data Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 10/29] ipv6/udp: " Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 11/29] tcp: " Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 12/29] tcp: kill extra io_uring's uarg refcounting Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 13/29] net: let callers provide extra ubuf_info refs Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 14/29] io_uring: opcode independent fixed buf import Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 15/29] io_uring: add zc notification infrastructure Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 16/29] io_uring: cache struct io_notif Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 17/29] io_uring: complete notifiers in tw Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 18/29] io_uring: add notification slot registration Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 19/29] io_uring: rename IORING_OP_FILES_UPDATE Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 20/29] io_uring: add zc notification flush requests Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 21/29] io_uring: wire send zc request type Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 22/29] io_uring: account locked pages for non-fixed zc Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 23/29] io_uring: allow to pass addr into sendzc Pavel Begunkov
2022-06-29  7:42   ` Stefan Metzmacher
2022-06-29  9:53     ` Pavel Begunkov
2022-08-13  8:45       ` Stefan Metzmacher
2022-08-15  9:46         ` Pavel Begunkov
2022-08-15 11:40           ` Stefan Metzmacher
2022-08-15 12:19             ` Pavel Begunkov
2022-08-15 13:30               ` Stefan Metzmacher
2022-08-15 14:09                 ` Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 24/29] io_uring: add rsrc referencing for notifiers Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 25/29] io_uring: sendzc with fixed buffers Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 26/29] io_uring: flush notifiers after sendzc Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 27/29] io_uring: allow to override zc tag on flush Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 28/29] io_uring: batch submission notif referencing Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 29/29] selftests/io_uring: test zerocopy send Pavel Begunkov
2022-06-28 19:03 ` [RFC net-next v3 00/29] io_uring " Pavel Begunkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220628225204.GA27554@u2004-local \
    --to=dsahern@kernel.org \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=davem@davemloft.net \
    --cc=io-uring@vger.kernel.org \
    --cc=jonathan.lemon@gmail.com \
    --cc=kernel-team@fb.com \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.