From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, linux-api@vger.kernel.org,
Willem de Bruijn <willemb@google.com>
Subject: [PATCH net-next v3 06/13] sock: MSG_ZEROCOPY notification coalescing
Date: Wed, 21 Jun 2017 17:18:09 -0400 [thread overview]
Message-ID: <20170621211816.53837-7-willemdebruijn.kernel@gmail.com> (raw)
In-Reply-To: <20170621211816.53837-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
In the simple case, each sendmsg() call generates data and eventually
a zerocopy ready notification N, where N indicates the Nth successful
invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket.
TCP and corked sockets can cause send() calls to append new data to an
existing sk_buff and, thus, ubuf_info. In that case the notification
must hold a range. odify ubuf_info to store a inclusive range [N..N+m]
and add skb_zerocopy_realloc() to optionally extend an existing range.
Also coalesce notifications in this common case: if a notification
[1, 1] is about to be queued while [0, 0] is the queue tail, just modify
the head of the queue to read [0, 1].
Coalescing is limited to a few TSO frames worth of data to bound
notification latency.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/linux/skbuff.h | 15 ++++++--
net/core/skbuff.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 104 insertions(+), 9 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c20248b26b53..554017a3aa3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -408,14 +408,25 @@ enum {
*/
struct ubuf_info {
void (*callback)(struct ubuf_info *, bool zerocopy_success);
- void *ctx;
- unsigned long desc;
+ union {
+ struct {
+ unsigned long desc;
+ void *ctx;
+ };
+ struct {
+ u32 id;
+ u16 len;
+ u32 bytelen;
+ };
+ };
atomic_t refcnt;
};
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg);
static inline void sock_zerocopy_get(struct ubuf_info *uarg)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 89ba5ad024a5..41fedc4e651f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -941,7 +941,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->callback = sock_zerocopy_callback;
- uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
+ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+ uarg->len = 1;
+ uarg->bytelen = size;
atomic_set(&uarg->refcnt, 0);
sock_hold(sk);
@@ -954,24 +956,98 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
return container_of((void *)uarg, struct sk_buff, cb);
}
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg)
+{
+ if (uarg) {
+ const u32 byte_limit = 1 << 19; /* limit to a few TSO */
+ u32 bytelen, next;
+
+ /* realloc only when socket is locked (TCP, UDP cork),
+ * so uarg->len and sk_zckey access is serialized
+ */
+ if (!sock_owned_by_user(sk)) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ bytelen = uarg->bytelen + size;
+ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ /* TCP can create new skb to attach new uarg */
+ if (sk->sk_type == SOCK_STREAM)
+ goto new_alloc;
+ return NULL;
+ }
+
+ next = (u32)atomic_read(&sk->sk_zckey);
+ if ((u32)(uarg->id + uarg->len) == next) {
+ uarg->len++;
+ uarg->bytelen = bytelen;
+ atomic_set(&sk->sk_zckey, ++next);
+ return uarg;
+ }
+ }
+
+new_alloc:
+ return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ u64 sum_len;
+ u32 old_lo, old_hi;
+
+ old_lo = serr->ee.ee_info;
+ old_hi = serr->ee.ee_data;
+ sum_len = old_hi - old_lo + 1ULL + len;
+
+ if (sum_len >= (1ULL << 32))
+ return false;
+
+ if (lo != old_hi + 1)
+ return false;
+
+ serr->ee.ee_data += len;
+ return true;
+}
+
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
{
struct sock_exterr_skb *serr;
- struct sk_buff *skb = skb_from_uarg(uarg);
+ struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock *sk = skb->sk;
- u16 id = uarg->desc;
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ unsigned long flags;
+ u32 lo, hi;
+ u16 len;
- if (sock_flag(sk, SOCK_DEAD))
+ /* if !len, there was only 1 call, and it was aborted
+ * so do not queue a completion notification
+ */
+ if (!uarg->len || sock_flag(sk, SOCK_DEAD))
goto release;
+ len = uarg->len;
+ lo = uarg->id;
+ hi = uarg->id + len - 1;
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = 0;
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
- serr->ee.ee_data = id;
+ serr->ee.ee_data = hi;
+ serr->ee.ee_info = lo;
- skb_queue_tail(&sk->sk_error_queue, skb);
- skb = NULL;
+ spin_lock_irqsave(&q->lock, flags);
+ tail = skb_peek_tail(q);
+ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+ !skb_zerocopy_notify_extend(tail, lo, len)) {
+ __skb_queue_tail(q, skb);
+ skb = NULL;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
sk->sk_error_report(sk);
@@ -998,6 +1074,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
struct sock *sk = skb_from_uarg(uarg)->sk;
atomic_dec(&sk->sk_zckey);
+ uarg->len--;
/* sock_zerocopy_put expects a ref. Most sockets take one per
* skb, which is zero on abort. tcp_sendmsg holds one extra, to
@@ -1045,9 +1122,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
{
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
struct iov_iter orig_iter = msg->msg_iter;
int err, orig_len = skb->len;
+ /* An skb can only point to one uarg. This edge case happens when
+ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
/* Streams do not free skb on error. Reset to prev state. */
--
2.13.1.611.g7e3b11ae1-goog
next prev parent reply other threads:[~2017-06-21 21:18 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-06-21 21:18 [PATCH net-next v3 00/13] socket sendmsg MSG_ZEROCOPY Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 01/13] sock: allocate skbs from optmem Willem de Bruijn
[not found] ` <20170621211816.53837-1-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-21 21:18 ` [PATCH net-next v3 02/13] sock: skb_copy_ubufs support for compound pages Willem de Bruijn
[not found] ` <20170621211816.53837-3-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-22 17:05 ` David Miller
[not found] ` <20170622.130528.1762873686654379973.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2017-06-22 20:57 ` Willem de Bruijn
[not found] ` <CAF=yD-+WQ1b9BytG4JuhgHCR8gAKvUVJPUfi4t-u_DqAFnVrkA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-23 1:36 ` David Miller
2017-06-23 3:59 ` Willem de Bruijn
[not found] ` <CAF=yD-JGekEkBsFfgZa+-TN3-QBPy4sfK0QLPx83Uh3DAoodvg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-27 15:53 ` Willem de Bruijn
[not found] ` <CAF=yD-LJ-EvWbKtxqrcK1D=nRSzWzpPcFVOX1poZ3+P=aa0QfA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-29 15:54 ` Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 13/13] test: add msg_zerocopy test Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 03/13] sock: add MSG_ZEROCOPY Willem de Bruijn
[not found] ` <20170621211816.53837-4-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-22 17:06 ` David Miller
2017-06-21 21:18 ` [PATCH net-next v3 04/13] sock: add SOCK_ZEROCOPY sockopt Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 05/13] sock: enable MSG_ZEROCOPY Willem de Bruijn
2017-06-21 21:18 ` Willem de Bruijn [this message]
[not found] ` <20170621211816.53837-7-willemdebruijn.kernel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-06-22 17:07 ` [PATCH net-next v3 06/13] sock: MSG_ZEROCOPY notification coalescing David Miller
2017-06-21 21:18 ` [PATCH net-next v3 07/13] sock: add ee_code SO_EE_CODE_ZEROCOPY_COPIED Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 08/13] sock: ulimit on MSG_ZEROCOPY pages Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 09/13] tcp: enable MSG_ZEROCOPY Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 10/13] udp: " Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 11/13] raw: enable MSG_ZEROCOPY with IP_HDRINCL Willem de Bruijn
2017-06-21 21:18 ` [PATCH net-next v3 12/13] packet: enable MSG_ZEROCOPY Willem de Bruijn
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170621211816.53837-7-willemdebruijn.kernel@gmail.com \
--to=willemdebruijn.kernel@gmail.com \
--cc=davem@davemloft.net \
--cc=linux-api@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=willemb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).