All of lore.kernel.org
 help / color / mirror / Atom feed
From: Paolo Abeni <pabeni@redhat.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Cc: Network Development <netdev@vger.kernel.org>,
	David Miller <davem@davemloft.net>,
	Willem de Bruijn <willemb@google.com>
Subject: Re: [PATCH net-next v2 1/2] udp: msg_zerocopy
Date: Thu, 29 Nov 2018 09:27:36 +0100	[thread overview]
Message-ID: <4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com> (raw)
In-Reply-To: <CAF=yD-L4bBm_obmsW5m7aTGwFv8uC-hDA7zo9qsAUr0y32Rr0Q@mail.gmail.com>

Hi,

Thank you for the update!

On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
> 
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.

If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):

---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 	refcount_inc(&uarg->refcnt);
 }
 
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+	if (uarg)
+		refcount_set(&uarg->refcnt, cnt);
+}
+
 void sock_zerocopy_put(struct ubuf_info *uarg);
 void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 {
 	if (skb && uarg && !skb_zcopy(skb)) {
-		sock_zerocopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
 		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+		return 1;
 	}
+	return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+	if (__skb_zcopy_set(skb, uarg))
+		sock_zerocopy_get(uarg);
 }
 
 static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
 	return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
-		/* Stream socks hold a ref for the syscall, as skbs can be sent
-		 * and freed inside the loop, dropping refcnt to 0 inbetween.
-		 * Datagrams do not need this, but sock_zerocopy_put expects it.
-		 */
-		if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
-			refcount_set(&uarg->refcnt, 1);
-
 		sock_zerocopy_put(uarg);
 	}
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	int uarg_refs = 0;
 	bool paged;
 
 	skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
 
 	if (flags & MSG_ZEROCOPY && length) {
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		uarg_refs = 1;
 		if (!uarg)
 			return -ENOBUFS;
 		if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 		}
 	}
 
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
 		length -= copy;
 	}
 
+	__sock_zerocopy_init(uarg, uarg_refs);
 	if (wmem_alloc_delta)
 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	__sock_zerocopy_init(uarg, uarg_refs);
 	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is to use the same scheme currently used for wmem
accounting: do the book-keeping inside the loop and set the atomic
reference counter only once at the end of the loop.

WDYT?

Thanks,

Paolo

  reply	other threads:[~2018-11-29 19:32 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-26 15:29 [PATCH net-next v2 0/2] udp msg_zerocopy Willem de Bruijn
2018-11-26 15:29 ` [PATCH net-next v2 1/2] udp: msg_zerocopy Willem de Bruijn
2018-11-26 16:32   ` Paolo Abeni
2018-11-26 17:59     ` Willem de Bruijn
2018-11-26 18:04       ` Paolo Abeni
2018-11-26 18:19         ` Willem de Bruijn
2018-11-26 19:49           ` Willem de Bruijn
2018-11-28 23:50             ` Willem de Bruijn
2018-11-29  8:27               ` Paolo Abeni [this message]
2018-11-29 16:17                 ` Willem de Bruijn
2018-11-29  7:31   ` [udp] a4a142d3d7: WARNING:at_lib/refcount.c:#refcount_inc_checked kernel test robot
2018-11-29  7:31     ` kernel test robot
2018-11-26 15:29 ` [PATCH net-next v2 2/2] selftests: extend zerocopy tests to udp Willem de Bruijn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com \
    --to=pabeni@redhat.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=willemb@google.com \
    --cc=willemdebruijn.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.