All of lore.kernel.org
 help / color / mirror / Atom feed
From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, linux-api@vger.kernel.org,
	Willem de Bruijn <willemb@google.com>
Subject: [PATCH net-next v3 06/13] sock: MSG_ZEROCOPY notification coalescing
Date: Wed, 21 Jun 2017 17:18:09 -0400	[thread overview]
Message-ID: <20170621211816.53837-7-willemdebruijn.kernel@gmail.com> (raw)
In-Reply-To: <20170621211816.53837-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

In the simple case, each sendmsg() call generates data and eventually
a zerocopy ready notification N, where N indicates the Nth successful
invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket.

TCP and corked sockets can cause send() calls to append new data to an
existing sk_buff and, thus, ubuf_info. In that case the notification
must hold a range. odify ubuf_info to store a inclusive range [N..N+m]
and add skb_zerocopy_realloc() to optionally extend an existing range.

Also coalesce notifications in this common case: if a notification
[1, 1] is about to be queued while [0, 0] is the queue tail, just modify
the head of the queue to read [0, 1].

Coalescing is limited to a few TSO frames worth of data to bound
notification latency.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h | 15 ++++++--
 net/core/skbuff.c      | 98 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c20248b26b53..554017a3aa3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -408,14 +408,25 @@ enum {
  */
 struct ubuf_info {
 	void (*callback)(struct ubuf_info *, bool zerocopy_success);
-	void *ctx;
-	unsigned long desc;
+	union {
+		struct {
+			unsigned long desc;
+			void *ctx;
+		};
+		struct {
+			u32 id;
+			u16 len;
+			u32 bytelen;
+		};
+	};
 	atomic_t refcnt;
 };
 
 #define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+					struct ubuf_info *uarg);
 
 static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 89ba5ad024a5..41fedc4e651f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -941,7 +941,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg = (void *)skb->cb;
 
 	uarg->callback = sock_zerocopy_callback;
-	uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
+	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+	uarg->len = 1;
+	uarg->bytelen = size;
 	atomic_set(&uarg->refcnt, 0);
 	sock_hold(sk);
 
@@ -954,24 +956,98 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
 	return container_of((void *)uarg, struct sk_buff, cb);
 }
 
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+					struct ubuf_info *uarg)
+{
+	if (uarg) {
+		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
+		u32 bytelen, next;
+
+		/* realloc only when socket is locked (TCP, UDP cork),
+		 * so uarg->len and sk_zckey access is serialized
+		 */
+		if (!sock_owned_by_user(sk)) {
+			WARN_ON_ONCE(1);
+			return NULL;
+		}
+
+		bytelen = uarg->bytelen + size;
+		if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+			/* TCP can create new skb to attach new uarg */
+			if (sk->sk_type == SOCK_STREAM)
+				goto new_alloc;
+			return NULL;
+		}
+
+		next = (u32)atomic_read(&sk->sk_zckey);
+		if ((u32)(uarg->id + uarg->len) == next) {
+			uarg->len++;
+			uarg->bytelen = bytelen;
+			atomic_set(&sk->sk_zckey, ++next);
+			return uarg;
+		}
+	}
+
+new_alloc:
+	return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	u64 sum_len;
+	u32 old_lo, old_hi;
+
+	old_lo = serr->ee.ee_info;
+	old_hi = serr->ee.ee_data;
+	sum_len = old_hi - old_lo + 1ULL + len;
+
+	if (sum_len >= (1ULL << 32))
+		return false;
+
+	if (lo != old_hi + 1)
+		return false;
+
+	serr->ee.ee_data += len;
+	return true;
+}
+
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
 {
 	struct sock_exterr_skb *serr;
-	struct sk_buff *skb = skb_from_uarg(uarg);
+	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
 	struct sock *sk = skb->sk;
-	u16 id = uarg->desc;
+	struct sk_buff_head *q = &sk->sk_error_queue;
+	unsigned long flags;
+	u32 lo, hi;
+	u16 len;
 
-	if (sock_flag(sk, SOCK_DEAD))
+	/* if !len, there was only 1 call, and it was aborted
+	 * so do not queue a completion notification
+	 */
+	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
 		goto release;
 
+	len = uarg->len;
+	lo = uarg->id;
+	hi = uarg->id + len - 1;
+
 	serr = SKB_EXT_ERR(skb);
 	memset(serr, 0, sizeof(*serr));
 	serr->ee.ee_errno = 0;
 	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
-	serr->ee.ee_data = id;
+	serr->ee.ee_data = hi;
+	serr->ee.ee_info = lo;
 
-	skb_queue_tail(&sk->sk_error_queue, skb);
-	skb = NULL;
+	spin_lock_irqsave(&q->lock, flags);
+	tail = skb_peek_tail(q);
+	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+	    !skb_zerocopy_notify_extend(tail, lo, len)) {
+		__skb_queue_tail(q, skb);
+		skb = NULL;
+	}
+	spin_unlock_irqrestore(&q->lock, flags);
 
 	sk->sk_error_report(sk);
 
@@ -998,6 +1074,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		struct sock *sk = skb_from_uarg(uarg)->sk;
 
 		atomic_dec(&sk->sk_zckey);
+		uarg->len--;
 
 		/* sock_zerocopy_put expects a ref. Most sockets take one per
 		 * skb, which is zero on abort. tcp_sendmsg holds one extra, to
@@ -1045,9 +1122,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg)
 {
+	struct ubuf_info *orig_uarg = skb_zcopy(skb);
 	struct iov_iter orig_iter = msg->msg_iter;
 	int err, orig_len = skb->len;
 
+	/* An skb can only point to one uarg. This edge case happens when
+	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
+	 */
+	if (orig_uarg && uarg != orig_uarg)
+		return -EEXIST;
+
 	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 		/* Streams do not free skb on error. Reset to prev state. */
-- 
2.13.1.611.g7e3b11ae1-goog

  parent reply	other threads:[~2017-06-21 21:18 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-21 21:18 [PATCH net-next v3 00/13] socket sendmsg MSG_ZEROCOPY Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 01/13] sock: allocate skbs from optmem Willem de Bruijn
     [not found] ` <20170621211816.53837-1-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-21 21:18   ` [PATCH net-next v3 02/13] sock: skb_copy_ubufs support for compound pages Willem de Bruijn
     [not found]     ` <20170621211816.53837-3-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-22 17:05       ` David Miller
     [not found]         ` <20170622.130528.1762873686654379973.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2017-06-22 20:57           ` Willem de Bruijn
     [not found]             ` <CAF=yD-+WQ1b9BytG4JuhgHCR8gAKvUVJPUfi4t-u_DqAFnVrkA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-23  1:36               ` David Miller
2017-06-23  3:59                 ` Willem de Bruijn
     [not found]                   ` <CAF=yD-JGekEkBsFfgZa+-TN3-QBPy4sfK0QLPx83Uh3DAoodvg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-27 15:53                     ` Willem de Bruijn
     [not found]                       ` <CAF=yD-LJ-EvWbKtxqrcK1D=nRSzWzpPcFVOX1poZ3+P=aa0QfA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-29 15:54                         ` Willem de Bruijn
2017-06-21 21:18   ` [PATCH net-next v3 13/13] test: add msg_zerocopy test Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 03/13] sock: add MSG_ZEROCOPY Willem de Bruijn
     [not found]   ` <20170621211816.53837-4-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-22 17:06     ` David Miller
2017-06-21 21:18 ` [PATCH net-next v3 04/13] sock: add SOCK_ZEROCOPY sockopt Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 05/13] sock: enable MSG_ZEROCOPY Willem de Bruijn
2017-06-21 21:18 ` Willem de Bruijn [this message]
     [not found]   ` <20170621211816.53837-7-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-22 17:07     ` [PATCH net-next v3 06/13] sock: MSG_ZEROCOPY notification coalescing David Miller
2017-06-21 21:18 ` [PATCH net-next v3 07/13] sock: add ee_code SO_EE_CODE_ZEROCOPY_COPIED Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 08/13] sock: ulimit on MSG_ZEROCOPY pages Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 09/13] tcp: enable MSG_ZEROCOPY Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 10/13] udp: " Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 11/13] raw: enable MSG_ZEROCOPY with IP_HDRINCL Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 12/13] packet: enable MSG_ZEROCOPY Willem de Bruijn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170621211816.53837-7-willemdebruijn.kernel@gmail.com \
    --to=willemdebruijn.kernel@gmail.com \
    --cc=davem@davemloft.net \
    --cc=linux-api@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.