From mboxrd@z Thu Jan 1 00:00:00 1970 From: Eric Dumazet Subject: [PATCH net-next 2/3] tcp: implement coalescing on backlog queue Date: Wed, 21 Nov 2018 09:52:39 -0800 Message-ID: <20181121175240.6075-3-edumazet@google.com> References: <20181121175240.6075-1-edumazet@google.com> Mime-Version: 1.0 Content-Transfer-Encoding: 8bit Cc: netdev , Jean-Louis Dupond , Neal Cardwell , Yuchung Cheng , Eric Dumazet , Eric Dumazet To: "David S . Miller" Return-path: Received: from mail-pl1-f196.google.com ([209.85.214.196]:46768 "EHLO mail-pl1-f196.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1731013AbeKVE2K (ORCPT ); Wed, 21 Nov 2018 23:28:10 -0500 Received: by mail-pl1-f196.google.com with SMTP id t13so6520883ply.13 for ; Wed, 21 Nov 2018 09:52:48 -0800 (PST) In-Reply-To: <20181121175240.6075-1-edumazet@google.com> Sender: netdev-owner@vger.kernel.org List-ID: In case GRO is not as efficient as it should be or disabled, we might have a user thread trapped in __release_sock() while softirq handler flood packets up to the point we have to drop. This patch balances work done from user thread and softirq, to give more chances to __release_sock() to complete its work. This also helps if we receive many ACK packets, since GRO does not aggregate them. Signed-off-by: Eric Dumazet Tested-by: Jean-Louis Dupond Cc: Neal Cardwell Cc: Yuchung Cheng --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_ipv4.c | 75 +++++++++++++++++++++++++++++++++++---- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f80135e5feaa886000009db6dff75b2bc2d637b2..86dc24a96c90ab047d5173d625450facd6c6dd79 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -243,6 +243,7 @@ enum LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */ LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */ LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */ + LINUX_MIB_TCPBACKLOGCOALESCE, /* TCPBacklogCoalesce */ LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */ LINUX_MIB_TCPOFODROP, /* TCPOFODrop */ LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 70289682a6701438aed99a00a9705c39fa4394d3..c3610b37bb4ce665b1976d8cc907b6dd0de42ab9 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 795605a2327504b8a025405826e7e0ca8dc8501d..401e1d1cb904a4c7963d8baa419cfbf178593344 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1619,12 +1619,10 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; - - /* Only socket owner can try to collapse/prune rx queues - * to reduce memory overhead, so add a little headroom here. - * Few sockets backlog are possibly concurrently non empty. - */ - limit += 64*1024; + struct skb_shared_info *shinfo; + const struct tcphdr *th; + struct sk_buff *tail; + unsigned int hdrlen; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. @@ -1636,6 +1634,71 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); + if (unlikely(tcp_checksum_complete(skb))) { + bh_unlock_sock(sk); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + return true; + } + + /* Attempt coalescing to last skb in backlog, even if we are + * above the limits. + * This is okay because skb capacity is limited to MAX_SKB_FRAGS. + */ + th = (const struct tcphdr *)skb->data; + hdrlen = th->doff * 4; + shinfo = skb_shinfo(skb); + + if (!shinfo->gso_size) + shinfo->gso_size = skb->len - hdrlen; + + if (!shinfo->gso_segs) + shinfo->gso_segs = 1; + + tail = sk->sk_backlog.tail; + if (tail && + TCP_SKB_CB(tail)->end_seq == TCP_SKB_CB(skb)->seq && +#ifdef CONFIG_TLS_DEVICE + tail->decrypted == skb->decrypted && +#endif + !memcmp(tail->data + sizeof(*th), skb->data + sizeof(*th), + hdrlen - sizeof(*th))) { + bool fragstolen; + int delta; + + __skb_pull(skb, hdrlen); + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + TCP_SKB_CB(tail)->has_rxtstamp = true; + tail->tstamp = skb->tstamp; + skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + } + + /* Not as strict as GRO. We only need to carry mss max value */ + skb_shinfo(tail)->gso_size = max(shinfo->gso_size, + skb_shinfo(tail)->gso_size); + + skb_shinfo(tail)->gso_segs += shinfo->gso_segs; + + sk->sk_backlog.len += delta; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPBACKLOGCOALESCE); + kfree_skb_partial(skb, fragstolen); + return false; + } + __skb_push(skb, hdrlen); + } + + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); -- 2.19.1.1215.g8438c0b245-goog