* [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
@ 2017-10-06  5:21 Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 1/7] net: add rb_to_skb() and other rb tree helpers Eric Dumazet
                   ` (7 more replies)
  0 siblings, 8 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

This patch series implements an RB-tree based retransmit queue for TCP,
to better match modern BDPs (bandwidth-delay products).

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87  351.48  339.59  338.62  306.72
204.07  304.93  291.88  202.47  176.88
->   2840

After patch:

1700.83 2207.98 2070.17 1544.26 2114.76
2124.89 1693.14 1080.91 2216.82 1299.94
->  18053

Average of 1805 Mbits instead of 284 Mbits.

Eric Dumazet (7):
  net: add rb_to_skb() and other rb tree helpers
  tcp: uninline tcp_write_queue_purge()
  tcp: tcp_tx_timestamp() cleanup
  tcp: tcp_mark_head_lost() optimization
  tcp: reduce tcp_fastretrans_alert() verbosity
  tcp: pass previous skb to tcp_shifted_skb()
  tcp: implement rb-tree based retransmit queue

 include/linux/skbuff.h  |  18 +++++
 include/net/sock.h      |   7 +-
 include/net/tcp.h       | 100 ++++++++++++--------------
 net/ipv4/tcp.c          |  63 ++++++++++++----
 net/ipv4/tcp_fastopen.c |   8 +--
 net/ipv4/tcp_input.c    | 187 ++++++++++++++++++++++++------------------------
 net/ipv4/tcp_ipv4.c     |   2 +-
 net/ipv4/tcp_output.c   | 137 +++++++++++++++++++----------------
 net/ipv4/tcp_timer.c    |  24 ++++---
 net/sched/sch_netem.c   |  14 ++--
 10 files changed, 311 insertions(+), 249 deletions(-)

-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 1/7] net: add rb_to_skb() and other rb tree helpers
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 2/7] tcp: uninline tcp_write_queue_purge() Eric Dumazet
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

Generalize the private netem_rb_to_skb() helper.

The TCP rtx queue will soon be converted to an rb-tree,
so we will need skb_rbtree_walk() helpers.
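
As an illustration only (not part of the patch), the new helpers make
walking an rb-tree of skbs look like the existing list walkers ; the
function below is a made-up example :

static unsigned int skb_rbtree_total_len(struct rb_root *root)
{
	struct sk_buff *skb;
	unsigned int len = 0;

	/* skb_rbtree_walk() expands to a for () loop going from
	 * skb_rb_first(root) to NULL, advancing with skb_rb_next().
	 */
	skb_rbtree_walk(skb, root)
		len += skb->len;
	return len;
}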

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/skbuff.h  | 18 ++++++++++++++++++
 net/ipv4/tcp_fastopen.c |  8 +++-----
 net/ipv4/tcp_input.c    | 33 ++++++++++++---------------------
 net/sched/sch_netem.c   | 14 ++++----------
 4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 01a985937867935dd615d355f4a662a9f8674b83..03634ec2f9181bdf63e5acbd2fadc831086eaa87 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 	return __skb_grow(skb, len);
 }
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
@@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 		for (; skb != (struct sk_buff *)(queue);			\
 		     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)						\
+		for (skb = skb_rb_first(root); skb != NULL;			\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)						\
+		for (; skb != NULL;						\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)					\
+		for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);	\
+		     skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)				\
 		for (tmp = skb->next;						\
 		     skb != (struct sk_buff *)(queue);				\
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 29fff14d5a53db7ba34b0e2f2feaf46dfb55513e..7ee4aadcdd711e211a0a87e315328ba84d9defb3 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node *p;
-	struct sk_buff *skb;
 	struct dst_entry *dst;
+	struct sk_buff *skb;
 
 	if (!tp->syn_fastopen)
 		return;
 
 	if (!tp->data_segs_in) {
-		p = rb_first(&tp->out_of_order_queue);
-		if (p && !rb_next(p)) {
-			skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = skb_rb_first(&tp->out_of_order_queue);
+		if (skb && !skb_rb_next(skb)) {
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
 				tcp_fastopen_active_disable(sk);
 				return;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb0d7ed84b94110ee95b66befcad143505665ed5..90afe41435966fc62ae8cb5799ca4c99995076dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4335,7 +4335,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 	p = rb_first(&tp->out_of_order_queue);
 	while (p) {
-		skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = rb_to_skb(p);
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
@@ -4399,7 +4399,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node **p, *q, *parent;
+	struct rb_node **p, *parent;
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
 	bool fragstolen;
@@ -4458,7 +4458,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	parent = NULL;
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
 			p = &parent->rb_left;
 			continue;
@@ -4503,9 +4503,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 merge_right:
 	/* Remove other segments covered by skb. */
-	while ((q = rb_next(&skb->rbnode)) != NULL) {
-		skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+	while ((skb1 = skb_rb_next(skb)) != NULL) {
 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
 			break;
 		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4520,7 +4518,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_drop(sk, skb1);
 	}
 	/* If there is no skb after us, we are the last_skb ! */
-	if (!q)
+	if (!skb1)
 		tp->ooo_last_skb = skb;
 
 add_sack:
@@ -4706,7 +4704,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
 	if (list)
 		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 
-	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+	return skb_rb_next(skb);
 }
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4735,7 +4733,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
 			p = &parent->rb_left;
 		else
@@ -4854,26 +4852,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb, *head;
-	struct rb_node *p;
 	u32 start, end;
 
-	p = rb_first(&tp->out_of_order_queue);
-	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+	skb = skb_rb_first(&tp->out_of_order_queue);
 new_range:
 	if (!skb) {
-		p = rb_last(&tp->out_of_order_queue);
-		/* Note: This is possible p is NULL here. We do not
-		 * use rb_entry_safe(), as ooo_last_skb is valid only
-		 * if rbtree is not empty.
-		 */
-		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
 		return;
 	}
 	start = TCP_SKB_CB(skb)->seq;
 	end = TCP_SKB_CB(skb)->end_seq;
 
 	for (head = skb;;) {
-		skb = tcp_skb_next(skb, NULL);
+		skb = skb_rb_next(skb);
 
 		/* Range is terminated when we see a gap or when
 		 * we are at the queue end.
@@ -4916,14 +4907,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 	do {
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
-		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+		tcp_drop(sk, rb_to_skb(node));
 		sk_mem_reclaim(sk);
 		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 		    !tcp_under_memory_pressure(sk))
 			break;
 		node = prev;
 	} while (node);
-	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+	tp->ooo_last_skb = rb_to_skb(prev);
 
 	/* Reset SACK state.  A conforming SACK implementation will
 	 * do the same at a timeout based retransmit.  When a connection
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5a4f1008029068372019a965186e7a3c0a18aac3..db0228a65e8c58d770374a9d4324f07a7f8d8e0c 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -148,12 +148,6 @@ struct netem_skb_cb {
 	psched_time_t	time_to_send;
 };
 
-
-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
-{
-	return rb_entry(rb, struct sk_buff, rbnode);
-}
-
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
@@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch)
 	struct rb_node *p = rb_first(&q->t_root);
 
 	while (p) {
-		struct sk_buff *skb = netem_rb_to_skb(p);
+		struct sk_buff *skb = rb_to_skb(p);
 
 		p = rb_next(p);
 		rb_erase(&skb->rbnode, &q->t_root);
@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		struct sk_buff *skb;
 
 		parent = *p;
-		skb = netem_rb_to_skb(parent);
+		skb = rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
 			p = &parent->rb_right;
 		else
@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				struct sk_buff *t_skb;
 				struct netem_skb_cb *t_last;
 
-				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
 				    t_last->time_to_send > last->time_to_send) {
@@ -617,7 +611,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	if (p) {
 		psched_time_t time_to_send;
 
-		skb = netem_rb_to_skb(p);
+		skb = rb_to_skb(p);
 
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 2/7] tcp: uninline tcp_write_queue_purge()
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 1/7] net: add rb_to_skb() and other rb tree helpers Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 3/7] tcp: tcp_tx_timestamp() cleanup Eric Dumazet
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

Since the upcoming rtx rbtree will add some extra code,
it is time to stop inlining this fat function.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/tcp.h | 15 +--------------
 net/ipv4/tcp.c    | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3b16f353b539a563dae0b37328a52d67e6476f31..744559b727847db79b9751e6c6bef57e83c141f8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1606,20 +1606,7 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
 	skb->_skb_refdst = _save;		\
 }
 
-/* write queue abstraction */
-static inline void tcp_write_queue_purge(struct sock *sk)
-{
-	struct sk_buff *skb;
-
-	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
-		tcp_skb_tsorted_anchor_cleanup(skb);
-		sk_wmem_free_skb(sk, skb);
-	}
-	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
-	sk_mem_reclaim(sk);
-	tcp_clear_all_retrans_hints(tcp_sk(sk));
-}
+void tcp_write_queue_purge(struct sock *sk);
 
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8cf742fd4f99d7eb1bd9632afcfb09c36ba1130e..f8ebae62f834ce6059d5bcf7dcb84988187dfbce 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2318,6 +2318,20 @@ static inline bool tcp_need_reset(int state)
 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
+void tcp_write_queue_purge(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		tcp_skb_tsorted_anchor_cleanup(skb);
+		sk_wmem_free_skb(sk, skb);
+	}
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+	sk_mem_reclaim(sk);
+	tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
 int tcp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 3/7] tcp: tcp_tx_timestamp() cleanup
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 1/7] net: add rb_to_skb() and other rb tree helpers Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 2/7] tcp: uninline tcp_write_queue_purge() Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 4/7] tcp: tcp_mark_head_lost() optimization Eric Dumazet
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

The tcp_write_queue_tail() call can be factorized into tcp_tx_timestamp()
itself, instead of being repeated by both of its callers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f8ebae62f834ce6059d5bcf7dcb84988187dfbce..b8d379c8093661ccc4ef7fb82fa92828ab5f6918 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -469,8 +469,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_init_buffer_space(sk);
 }
 
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 {
+	struct sk_buff *skb = tcp_write_queue_tail(sk);
+
 	if (tsflags && skb) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -1041,7 +1043,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 
 out:
 	if (copied) {
-		tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+		tcp_tx_timestamp(sk, sk->sk_tsflags);
 		if (!(flags & MSG_SENDPAGE_NOTLAST))
 			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	}
@@ -1418,7 +1420,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 out:
 	if (copied) {
-		tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+		tcp_tx_timestamp(sk, sockc.tsflags);
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	}
 out_nopush:
-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 4/7] tcp: tcp_mark_head_lost() optimization
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
                   ` (2 preceding siblings ...)
  2017-10-06  5:21 ` [PATCH net-next 3/7] tcp: tcp_tx_timestamp() cleanup Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 5/7] tcp: reduce tcp_fastretrans_alert() verbosity Eric Dumazet
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

It will be a bit more expensive to get the head of the rtx queue
once the rtx queue is converted to an rb-tree.

We can avoid this extra cost when tp->lost_skb_hint is set.
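
For context (not part of the patch) : the first not-yet-acked skb always
starts at tp->snd_una, so "hint is not the head" can be checked on
sequence numbers alone, without fetching the queue head. A rough sketch,
with a made-up helper name :

static bool tcp_lost_hint_past_head(const struct tcp_sock *tp,
				    const struct sk_buff *hint)
{
	/* A hint starting strictly after snd_una cannot be the head
	 * of the retransmit data.
	 */
	return after(TCP_SKB_CB(hint)->seq, tp->snd_una);
}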

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_input.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 90afe41435966fc62ae8cb5799ca4c99995076dc..abc3b1db81f85a7feebda40cebc90c07afcef446 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2207,12 +2207,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 	const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
 
 	WARN_ON(packets > tp->packets_out);
-	if (tp->lost_skb_hint) {
-		skb = tp->lost_skb_hint;
-		cnt = tp->lost_cnt_hint;
+	skb = tp->lost_skb_hint;
+	if (skb) {
 		/* Head already handled? */
-		if (mark_head && skb != tcp_write_queue_head(sk))
+		if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
 			return;
+		cnt = tp->lost_cnt_hint;
 	} else {
 		skb = tcp_write_queue_head(sk);
 		cnt = 0;
-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 5/7] tcp: reduce tcp_fastretrans_alert() verbosity
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
                   ` (3 preceding siblings ...)
  2017-10-06  5:21 ` [PATCH net-next 4/7] tcp: tcp_mark_head_lost() optimization Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 6/7] tcp: pass previous skb to tcp_shifted_skb() Eric Dumazet
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

With the upcoming rb-tree implementation, these checks will trigger
more often, and this is expected.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index abc3b1db81f85a7feebda40cebc90c07afcef446..be7644204cd4493d0574aa065afbd6cb82a6b3bb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2804,9 +2804,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
 
-	if (WARN_ON(!tp->packets_out && tp->sacked_out))
+	if (!tp->packets_out && tp->sacked_out)
 		tp->sacked_out = 0;
-	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+	if (!tp->sacked_out && tp->fackets_out)
 		tp->fackets_out = 0;
 
 	/* Now state machine starts.
-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 6/7] tcp: pass previous skb to tcp_shifted_skb()
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
                   ` (4 preceding siblings ...)
  2017-10-06  5:21 ` [PATCH net-next 5/7] tcp: reduce tcp_fastretrans_alert() verbosity Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06  5:21 ` [PATCH net-next 7/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
  2017-10-06 23:31 ` [PATCH net-next 0/7] " David Miller
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

No need to recompute the previous skb, as that will be a bit more
expensive once the rtx queue is converted to an rb-tree : the caller
(tcp_shift_skb_data()) already has it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_input.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be7644204cd4493d0574aa065afbd6cb82a6b3bb..72c4732ae2da122a878e4673ce56812055de1972 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1288,13 +1288,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
 /* Shift newly-SACKed bytes from this skb to the immediately previous
  * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
  */
-static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			    struct sk_buff *skb,
 			    struct tcp_sacktag_state *state,
 			    unsigned int pcount, int shifted, int mss,
 			    bool dup_sack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
 	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
 	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */
 
@@ -1495,7 +1495,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 
 	if (!skb_shift(prev, skb, len))
 		goto fallback;
-	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
 		goto out;
 
 	/* Hole filled allows collapsing with the next as well, this is very
@@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	len = skb->len;
 	if (skb_shift(prev, skb, len)) {
 		pcount += tcp_skb_pcount(skb);
-		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+		tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+				len, mss, 0);
 	}
 
 out:
-- 
2.14.2.920.gcf0c67979c-goog


* [PATCH net-next 7/7] tcp: implement rb-tree based retransmit queue
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
                   ` (5 preceding siblings ...)
  2017-10-06  5:21 ` [PATCH net-next 6/7] tcp: pass previous skb to tcp_shifted_skb() Eric Dumazet
@ 2017-10-06  5:21 ` Eric Dumazet
  2017-10-06 23:31 ` [PATCH net-next 0/7] " David Miller
  7 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2017-10-06  5:21 UTC (permalink / raw)
  To: David S . Miller, Neal Cardwell, Yuchung Cheng
  Cc: netdev, Eric Dumazet, Eric Dumazet

Using a linear list to store all skbs in the write queue has been okay
for quite a while : O(N) is not too bad when N < 500.

Things get messy when N is on the order of 100,000 : modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

At 40 ns per cache line miss, a full scan of 100,000 skbs can take
about 4 ms, blowing away CPU caches.

SACK processing can often use various hints to avoid parsing the
whole retransmit queue. But with high packet losses and/or high
reordering, the hints no longer work.

The sender then has to process thousands of unfriendly SACKs,
accumulating a huge socket backlog, burning a CPU and massively
dropping packets.

Using an rb-tree for the retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time to be
more resistant and say no to quadratic behavior.

1) The RTX queue is no longer part of the write queue : already-sent
skbs are stored in one rb-tree.

2) Since reaching the head of the write queue no longer needs
sk->sk_send_head, we added a union of sk_send_head and tcp_rtx_queue.
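
A minimal sketch of the resulting split (condensed from the struct sock
and tcp_event_new_data_sent() hunks below ; the helper name is made up) :

/* Not-yet-sent skbs stay in sk->sk_write_queue (a plain list).
 * Once sent, an skb moves to sk->tcp_rtx_queue, an rb-tree keyed by
 * TCP_SKB_CB(skb)->seq, which shares storage with sk_send_head
 * (no longer needed by TCP) in a union.
 */
static void tcp_move_to_rtx_queue(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
}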

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87
351.48
339.59
338.62
306.72
204.07
304.93
291.88
202.47
176.88
   2840

After patch:

1700.83
2207.98
2070.17
1544.26
2114.76
2124.89
1693.14
1080.91
2216.82
1299.94
  18053

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h    |   7 ++-
 include/net/tcp.h     |  89 ++++++++++++++++----------------
 net/ipv4/tcp.c        |  41 +++++++++++----
 net/ipv4/tcp_input.c  | 133 +++++++++++++++++++++++++-----------------------
 net/ipv4/tcp_ipv4.c   |   2 +-
 net/ipv4/tcp_output.c | 137 +++++++++++++++++++++++++++-----------------------
 net/ipv4/tcp_timer.c  |  24 +++++----
 7 files changed, 245 insertions(+), 188 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index a6b9a8d1a6df3f72df8f1aac0f577257fa6452d0..4827094f1db44fce1c044c6fb6a4a98f8dbd4b22 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -60,7 +60,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/cgroup-defs.h>
-
+#include <linux/rbtree.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
 #include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
 	int			sk_wmem_queued;
 	refcount_t		sk_wmem_alloc;
 	unsigned long		sk_tsq_flags;
-	struct sk_buff		*sk_send_head;
+	union {
+		struct sk_buff	*sk_send_head;
+		struct rb_root	tcp_rtx_queue;
+	};
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 744559b727847db79b9751e6c6bef57e83c141f8..5a95e5886b55e03e4a8bfeac3506c657a4f97dde 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+enum tcp_queue {
+	TCP_FRAG_IN_WRITE_QUEUE,
+	TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+		 struct sk_buff *skb, u32 len,
+		 unsigned int mss_now, gfp_t gfp);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
@@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
 
 void tcp_write_queue_purge(struct sock *sk);
 
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
+{
+	return skb_rb_first(&sk->tcp_rtx_queue);
+}
+
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
 {
 	return skb_peek(&sk->sk_write_queue);
@@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
 	return skb_queue_prev(&sk->sk_write_queue, skb);
 }
 
-#define tcp_for_write_queue(skb, sk)					\
-	skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk)				\
-	skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
 	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 static inline struct sk_buff *tcp_send_head(const struct sock *sk)
 {
-	return sk->sk_send_head;
+	return skb_peek(&sk->sk_write_queue);
 }
 
 static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
 	return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-	if (tcp_skb_is_last(sk, skb))
-		sk->sk_send_head = NULL;
-	else
-		sk->sk_send_head = tcp_write_queue_next(sk, skb);
+	return skb_queue_empty(&sk->sk_write_queue);
+}
+
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
+{
+	return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+	return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-	if (sk->sk_send_head == skb_unlinked) {
-		sk->sk_send_head = NULL;
+	if (tcp_write_queue_empty(sk))
 		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	}
+
 	if (tcp_sk(sk)->highest_sack == skb_unlinked)
 		tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-	sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
 	__tcp_add_write_queue_tail(sk, skb);
 
 	/* Queue it, remembering where we must start sending. */
-	if (sk->sk_send_head == NULL) {
-		sk->sk_send_head = skb;
+	if (sk->sk_write_queue.next == skb) {
 		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 		if (tcp_sk(sk)->highest_sack == NULL)
@@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
 	__skb_queue_head(&sk->sk_write_queue, skb);
 }
 
-/* Insert buff after skb on the write queue of sk.  */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
-						struct sk_buff *buff,
-						struct sock *sk)
-{
-	__skb_queue_after(&sk->sk_write_queue, skb, buff);
-}
-
 /* Insert new before skb on the write queue of sk.  */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 						  struct sk_buff *skb,
 						  struct sock *sk)
 {
 	__skb_queue_before(&sk->sk_write_queue, skb, new);
-
-	if (sk->sk_send_head == skb)
-		sk->sk_send_head = new;
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
-	list_del(&skb->tcp_tsorted_anchor);
-	tcp_skb_tsorted_anchor_cleanup(skb);
 	__skb_unlink(skb, &sk->sk_write_queue);
 }
 
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
 {
-	return skb_queue_empty(&sk->sk_write_queue);
+	tcp_skb_tsorted_anchor_cleanup(skb);
+	rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+	list_del(&skb->tcp_tsorted_anchor);
+	tcp_rtx_queue_unlink(skb, sk);
+	sk_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
 
 static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
 {
-	tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
-						tcp_write_queue_next(sk, skb);
+	struct sk_buff *next = skb_rb_next(skb);
+
+	tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
 
 static inline void tcp_highest_sack_reset(struct sock *sk)
 {
-	tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+	struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+	tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
 }
 
 /* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
-	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
 	u32 rto = inet_csk(sk)->icsk_rto;
 	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8d379c8093661ccc4ef7fb82fa92828ab5f6918..3b34850d361fd20e59a42d56d398aea26bb8ecda 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->out_of_order_queue = RB_ROOT;
+	sk->tcp_rtx_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	INIT_LIST_HEAD(&tp->tsq_node);
 	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
-	if (!tcp_send_head(sk))
-		return;
-
 	skb = tcp_write_queue_tail(sk);
+	if (!skb)
+		return;
 	if (!(flags & MSG_MORE) || forced_push(tp))
 		tcp_mark_push(tp, skb);
 
@@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		if (!skb || (copy = size_goal - skb->len) <= 0 ||
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-						  skb_queue_empty(&sk->sk_write_queue));
+					tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
 				goto wait_for_memory;
 
@@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+		skb = tcp_write_queue_tail(sk);
 		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
 		if (!uarg) {
 			err = -ENOBUFS;
@@ -1275,7 +1275,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		int max = size_goal;
 
 		skb = tcp_write_queue_tail(sk);
-		if (tcp_send_head(sk)) {
+		if (skb) {
 			if (skb->ip_summed == CHECKSUM_NONE)
 				max = mss_now;
 			copy = max - skb->len;
@@ -1295,7 +1295,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				process_backlog = false;
 				goto restart;
 			}
-			first_skb = skb_queue_empty(&sk->sk_write_queue);
+			first_skb = tcp_rtx_and_write_queues_empty(sk);
 			skb = sk_stream_alloc_skb(sk,
 						  select_size(sk, sg, first_skb),
 						  sk->sk_allocation,
@@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 
 	/* XXX -- need to support SO_PEEK_OFF */
 
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (err)
+			return err;
+		copied += skb->len;
+	}
+
 	skb_queue_walk(&sk->sk_write_queue, skb) {
 		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
@@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state)
 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		/* Since we are deleting whole queue, no need to
+		 * list_del(&skb->tcp_tsorted_anchor)
+		 */
+		tcp_rtx_queue_unlink(skb, sk);
+		sk_wmem_free_skb(sk, skb);
+	}
+}
+
 void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
@@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk)
 		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
 	}
+	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 	 * issue in __tcp_select_window()
 	 */
 	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-	tcp_init_send_head(sk);
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
 	dst_release(sk->sk_rx_dst);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 72c4732ae2da122a878e4673ce56812055de1972..d0682ce2a5d67852ce52063f12c0506324aab984 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1142,6 +1142,7 @@ struct tcp_sacktag_state {
 	u64	last_sackt;
 	struct rate_sample *rate;
 	int	flag;
+	unsigned int mss_now;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 		if (pkt_len >= skb->len && !in_sack)
 			return 0;
 
-		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+		err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				   pkt_len, mss, GFP_ATOMIC);
 		if (err < 0)
 			return err;
 	}
@@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
 		TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
 
-	tcp_unlink_write_queue(skb, sk);
-	sk_wmem_free_skb(sk, skb);
+	tcp_rtx_queue_unlink_and_free(skb, sk);
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
 
@@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		goto fallback;
 
 	/* Can only happen with delayed DSACK + discard craziness */
-	if (unlikely(skb == tcp_write_queue_head(sk)))
+	prev = skb_rb_prev(skb);
+	if (!prev)
 		goto fallback;
-	prev = tcp_write_queue_prev(sk, skb);
 
 	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
 		goto fallback;
@@ -1501,12 +1502,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	/* Hole filled allows collapsing with the next as well, this is very
 	 * useful when hole on every nth skb pattern happens
 	 */
-	if (prev == tcp_write_queue_tail(sk))
+	skb = skb_rb_next(prev);
+	if (!skb)
 		goto out;
-	skb = tcp_write_queue_next(sk, prev);
 
 	if (!skb_can_shift(skb) ||
-	    (skb == tcp_send_head(sk)) ||
 	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
 	    (mss != tcp_skb_seglen(skb)))
 		goto out;
@@ -1539,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *tmp;
 
-	tcp_for_write_queue_from(skb, sk) {
+	skb_rbtree_walk_from(skb) {
 		int in_sack = 0;
 		bool dup_sack = dup_sack_in;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
 		/* queue is in-order => we can short-circuit the walk early */
 		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
 			break;
@@ -1607,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 	return skb;
 }
 
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+					   struct tcp_sacktag_state *state,
+					   u32 seq)
+{
+	struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+	struct sk_buff *skb;
+	int unack_bytes;
+
+	while (*p) {
+		parent = *p;
+		skb = rb_to_skb(parent);
+		if (before(seq, TCP_SKB_CB(skb)->seq)) {
+			p = &parent->rb_left;
+			continue;
+		}
+		if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+			p = &parent->rb_right;
+			continue;
+		}
+
+		state->fack_count = 0;
+		unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
+		if (state->mss_now && unack_bytes > 0)
+			state->fack_count = unack_bytes / state->mss_now;
+
+		return skb;
+	}
+	return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
 					struct tcp_sacktag_state *state,
 					u32 skip_to_seq)
 {
-	tcp_for_write_queue_from(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-
-		if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
-			break;
+	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+		return skb;
 
-		state->fack_count += tcp_skb_pcount(skb);
-	}
-	return skb;
+	return tcp_sacktag_bsearch(sk, state, skip_to_seq);
 }
 
 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1745,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		}
 	}
 
-	skb = tcp_write_queue_head(sk);
+	state->mss_now = tcp_current_mss(sk);
 	state->fack_count = 0;
+	skb = NULL;
 	i = 0;
 
 	if (!tp->sacked_out) {
@@ -1970,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk)
 	if (tcp_is_reno(tp))
 		tcp_reset_reno_sack(tp);
 
-	skb = tcp_write_queue_head(sk);
+	skb = tcp_rtx_queue_head(sk);
 	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
 	if (is_reneg) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
@@ -1979,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk)
 	}
 	tcp_clear_all_retrans_hints(tp);
 
-	tcp_for_write_queue(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-
+	skb_rbtree_walk_from(skb) {
 		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
 			     is_reneg);
 		if (mark_lost)
@@ -2215,13 +2231,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 			return;
 		cnt = tp->lost_cnt_hint;
 	} else {
-		skb = tcp_write_queue_head(sk);
+		skb = tcp_rtx_queue_head(sk);
 		cnt = 0;
 	}
 
-	tcp_for_write_queue_from(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
+	skb_rbtree_walk_from(skb) {
 		/* TODO: do this better */
 		/* this is not the most efficient way to do this... */
 		tp->lost_skb_hint = skb;
@@ -2245,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 			/* If needed, chop off the prefix to mark as lost. */
 			lost = (packets - oldcnt) * mss;
 			if (lost < skb->len &&
-			    tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+			    tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+					 lost, mss, GFP_ATOMIC) < 0)
 				break;
 			cnt = packets;
 		}
@@ -2329,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk)
 	if (tp->retrans_out)
 		return true;
 
-	skb = tcp_write_queue_head(sk);
+	skb = tcp_rtx_queue_head(sk);
 	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
 		return true;
 
@@ -2370,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 	if (unmark_loss) {
 		struct sk_buff *skb;
 
-		tcp_for_write_queue(skb, sk) {
-			if (skb == tcp_send_head(sk))
-				break;
+		skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 		}
 		tp->lost_out = 0;
@@ -2617,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk)
 	unsigned int mss = tcp_current_mss(sk);
 	u32 prior_lost = tp->lost_out;
 
-	tcp_for_write_queue(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
 		if (tcp_skb_seglen(skb) > mss &&
 		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2713,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 			 * is updated in tcp_ack()). Otherwise fall back to
 			 * the conventional recovery.
 			 */
-			if (tcp_send_head(sk) &&
+			if (!tcp_write_queue_empty(sk) &&
 			    after(tcp_wnd_end(tp), tp->snd_nxt)) {
 				*rexmit = REXMIT_NEW;
 				return;
@@ -3077,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
+	struct sk_buff *skb, *next;
 	bool fully_acked = true;
 	long sack_rtt_us = -1L;
 	long seq_rtt_us = -1L;
 	long ca_rtt_us = -1L;
-	struct sk_buff *skb;
 	u32 pkts_acked = 0;
 	u32 last_in_flight = 0;
 	bool rtt_update;
@@ -3089,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 	first_ackt = 0;
 
-	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
@@ -3107,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				break;
 			fully_acked = false;
 		} else {
-			/* Speedup tcp_unlink_write_queue() and next loop */
-			prefetchw(skb->next);
 			acked_pcount = tcp_skb_pcount(skb);
 		}
 
@@ -3160,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		if (!fully_acked)
 			break;
 
-		tcp_unlink_write_queue(skb, sk);
-		sk_wmem_free_skb(sk, skb);
+		next = skb_rb_next(skb);
 		if (unlikely(skb == tp->retransmit_skb_hint))
 			tp->retransmit_skb_hint = NULL;
 		if (unlikely(skb == tp->lost_skb_hint))
 			tp->lost_skb_hint = NULL;
+		tcp_rtx_queue_unlink_and_free(skb, sk);
 	}
 
 	if (!skb)
@@ -3257,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 static void tcp_ack_probe(struct sock *sk)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *head = tcp_send_head(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Was it a usable window open? */
-
-	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+	if (!head)
+		return;
+	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
 		icsk->icsk_backoff = 0;
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
 		/* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3382,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 			tp->pred_flags = 0;
 			tcp_fast_path_check(sk);
 
-			if (tcp_send_head(sk))
+			if (!tcp_write_queue_empty(sk))
 				tcp_slow_start_after_idle_check(sk);
 
 			if (nwin > tp->max_window) {
@@ -3567,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	sack_state.first_sackt = 0;
 	sack_state.rate = &rs;
 
-	/* We very likely will need to access write queue head. */
-	prefetchw(sk->sk_write_queue.next);
+	/* We very likely will need to access rtx queue. */
+	prefetch(sk->tcp_rtx_queue.rb_node);
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3682,8 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
 	 */
-	if (tcp_send_head(sk))
-		tcp_ack_probe(sk);
+	tcp_ack_probe(sk);
 
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
@@ -4726,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
 }
 
 /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -5530,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 				    struct tcp_fastopen_cookie *cookie)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
 	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
 	bool syn_drop = false;
 
@@ -5565,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
 	if (data) { /* Retransmit unacked data in SYN */
-		tcp_for_write_queue_from(data, sk) {
-			if (data == tcp_send_head(sk) ||
-			    __tcp_retransmit_skb(sk, data, 1))
+		skb_rbtree_walk_from(data) {
+			if (__tcp_retransmit_skb(sk, data, 1))
 				break;
 		}
 		tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c7460fd90884c2f04ae135547836bcff5108fc13..5418ecf03b787a645b1b7ae32207a155f24b87b0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 					       TCP_TIMEOUT_INIT;
 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 
-		skb = tcp_write_queue_head(sk);
+		skb = tcp_rtx_queue_head(sk);
 		BUG_ON(!skb);
 
 		tcp_mstamp_refresh(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8162e288017843fca2694c4e22b2e8981572256b..696b0a168f160a2c7242fcc1f76dfa5e071ef255 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int prior_packets = tp->packets_out;
 
-	tcp_advance_send_head(sk, skb);
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
+	__skb_unlink(skb, &sk->sk_write_queue);
+	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
 	tp->packets_out += tcp_skb_pcount(skb);
 	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
@@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
 	TCP_SKB_CB(skb)->eor = 0;
 }
 
+/* Insert buff after skb on the write or rtx queue of sk.  */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+					 struct sk_buff *buff,
+					 struct sock *sk,
+					 enum tcp_queue tcp_queue)
+{
+	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+		__skb_queue_after(&sk->sk_write_queue, skb, buff);
+	else
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+		 struct sk_buff *skb, u32 len,
 		 unsigned int mss_now, gfp_t gfp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
-	tcp_insert_write_queue_after(skb, buff, sk);
+	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
 	return 0;
@@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 		 * is caused by insufficient sender buffer:
 		 * 1) just sent some data (see tcp_write_xmit)
 		 * 2) not cwnd limited (this else condition)
-		 * 3) no more data to send (null tcp_send_head )
+		 * 3) no more data to send (tcp_write_queue_empty())
 		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
 		 */
-		if (!tcp_send_head(sk) && sk->sk_socket &&
+		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
 		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
 		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+			struct sk_buff *skb, unsigned int len,
 			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
@@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* All of a TSO frame must be composed of paged data.  */
 	if (skb->len != skb->data_len)
-		return tcp_fragment(sk, skb, len, mss_now, gfp);
+		return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
 
 	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
 	if (unlikely(!buff))
@@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
-	tcp_insert_write_queue_after(skb, buff, sk);
+	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 
 	return 0;
 }
@@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 			goto send_now;
 	}
 
-	head = tcp_write_queue_head(sk);
-
+	/* TODO : use tsorted_sent_queue ? */
+	head = tcp_rtx_queue_head(sk);
+	if (!head)
+		goto send_now;
 	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
@@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit <<= factor;
 
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-		/* Always send the 1st or 2nd skb in write queue.
+		/* Always send skb if rtx queue is empty.
 		 * No need to wait for TX completion to call us back,
 		 * after softirq/tasklet schedule.
 		 * This helps when TX completions are delayed too much.
 		 */
-		if (skb == sk->sk_write_queue.next ||
-		    skb->prev == sk->sk_write_queue.next)
+		if (tcp_rtx_queue_empty(sk))
 			return false;
 
 		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
 	 * it's the "most interesting" or current chrono we are
 	 * tracking and starts busy chrono if we have pending data.
 	 */
-	if (tcp_write_queue_empty(sk))
+	if (tcp_rtx_and_write_queues_empty(sk))
 		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
 	else if (type == tp->chrono_type)
 		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    nonagle);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+		    unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+					  skb, limit, mss_now, gfp)))
 			break;
 
 		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2350,7 +2368,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
-	return !tp->packets_out && tcp_send_head(sk);
+	return !tp->packets_out && !tcp_write_queue_empty(sk);
 }
 
 bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 		return false;
 
 	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-	     tcp_send_head(sk))
+	     !tcp_write_queue_empty(sk))
 		return false;
 
 	/* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk)
 	int mss = tcp_current_mss(sk);
 
 	skb = tcp_send_head(sk);
-	if (skb) {
-		if (tcp_snd_wnd_test(tp, skb, mss)) {
-			pcount = tp->packets_out;
-			tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-			if (tp->packets_out > pcount)
-				goto probe_sent;
-			goto rearm_timer;
-		}
-		skb = tcp_write_queue_prev(sk, skb);
-	} else {
-		skb = tcp_write_queue_tail(sk);
+	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+		pcount = tp->packets_out;
+		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+		if (tp->packets_out > pcount)
+			goto probe_sent;
+		goto rearm_timer;
 	}
+	skb = skb_rb_last(&sk->tcp_rtx_queue);
 
 	/* At most one outstanding TLP retransmission. */
 	if (tp->tlp_high_seq)
@@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk)
 		goto rearm_timer;
 
 	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+					  (pcount - 1) * mss, mss,
 					  GFP_ATOMIC)))
 			goto rearm_timer;
-		skb = tcp_write_queue_next(sk, skb);
+		skb = skb_rb_next(skb);
 	}
 
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+	struct sk_buff *next_skb = skb_rb_next(skb);
 	int skb_size, next_skb_size;
 
 	skb_size = skb->len;
@@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	}
 	tcp_highest_sack_combine(sk, next_skb, skb);
 
-	tcp_unlink_write_queue(next_skb, sk);
-
 	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
 		skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
 	tcp_skb_collapse_tstamp(skb, next_skb);
 
-	sk_wmem_free_skb(sk, next_skb);
+	tcp_rtx_queue_unlink_and_free(next_skb, sk);
 	return true;
 }
 
@@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
 		return false;
 	if (skb_cloned(skb))
 		return false;
-	if (skb == tcp_send_head(sk))
-		return false;
 	/* Some heuristics for collapsing over SACK'd could be invented */
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 		return false;
@@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 		return;
 
-	tcp_for_write_queue_from_safe(skb, tmp, sk) {
+	skb_rbtree_walk_from_safe(skb, tmp) {
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
@@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	len = cur_mss * segs;
 	if (skb->len > len) {
-		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+				 cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
 		if (skb_unclone(skb, GFP_ATOMIC))
@@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *rtx_head = NULL, *hole = NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-	struct sk_buff *hole = NULL;
 	u32 max_segs;
 	int mib_idx;
 
 	if (!tp->packets_out)
 		return;
 
-	if (tp->retransmit_skb_hint) {
-		skb = tp->retransmit_skb_hint;
-	} else {
-		skb = tcp_write_queue_head(sk);
+	skb = tp->retransmit_skb_hint;
+	if (!skb) {
+		rtx_head = tcp_rtx_queue_head(sk);
+		skb = rtx_head;
 	}
-
 	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-	tcp_for_write_queue_from(skb, sk) {
+	skb_rbtree_walk_from(skb) {
 		__u8 sacked;
 		int segs;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
 		if (tcp_pacing_check(sk))
 			break;
 
@@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
-		if (skb == tcp_write_queue_head(sk) &&
+		if (skb == rtx_head &&
 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,
@@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
 	 * Note: in the latter case, FIN packet will be sent after a timeout,
 	 * as TCP stack thinks it has already been transmitted.
 	 */
-	if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+	if (!tskb && tcp_under_memory_pressure(sk))
+		tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+	if (tskb) {
 coalesce:
 		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
 		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
-		if (!tcp_send_head(sk)) {
+		if (tcp_write_queue_empty(sk)) {
 			/* This means tskb was already sent.
 			 * Pretend we included the FIN on previous transmit.
 			 * We need to set tp->snd_nxt to the value it would have
@@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk)
 {
 	struct sk_buff *skb;
 
-	skb = tcp_write_queue_head(sk);
+	skb = tcp_rtx_queue_head(sk);
 	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-		pr_debug("%s: wrong queue state\n", __func__);
+		pr_err("%s: wrong queue state\n", __func__);
 		return -EFAULT;
 	}
 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk)
 			if (!nskb)
 				return -ENOMEM;
 			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
-			tcp_unlink_write_queue(skb, sk);
+			tcp_rtx_queue_unlink_and_free(skb, sk);
 			__skb_header_release(nskb);
-			__tcp_add_write_queue_head(sk, nskb);
-			sk_wmem_free_skb(sk, skb);
+			tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
 			sk->sk_wmem_queued += nskb->truesize;
 			sk_mem_charge(sk, nskb->truesize);
 			skb = nskb;
@@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
 
 	tcb->end_seq += skb->len;
 	__skb_header_release(skb);
-	__tcp_add_write_queue_tail(sk, skb);
 	sk->sk_wmem_queued += skb->truesize;
 	sk_mem_charge(sk, skb->truesize);
 	tp->write_seq = tcb->end_seq;
@@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
 	if (!err) {
 		tp->syn_data = (fo->copied > 0);
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
 	}
 
-	/* data was not sent, this is our new send_head */
-	sk->sk_send_head = syn_data;
+	/* data was not sent, put it in write_queue */
+	__skb_queue_tail(&sk->sk_write_queue, syn_data);
 	tp->packets_out -= tcp_skb_pcount(syn_data);
 
 fallback:
@@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk)
 	tp->retrans_stamp = tcp_time_stamp(tp);
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
+	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
 	/* Send off SYN; include data in Fast Open. */
 	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+					 skb, seg_size, mss, GFP_ATOMIC))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(skb, mss);
@@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
 
 	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
-	if (tp->packets_out || !tcp_send_head(sk)) {
+	if (tp->packets_out || tcp_write_queue_empty(sk)) {
 		/* Cancel probe timer, if it is not required. */
 		icsk->icsk_probes_out = 0;
 		icsk->icsk_backoff = 0;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 655dd8d7f064f998f7caa323822dfffe9984ebdf..7014cc00c74c7107b8dd1ffcbdfeef8092792ba2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
 		return false;
 
 	start_ts = tcp_sk(sk)->retrans_stamp;
-	if (unlikely(!start_ts))
-		start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+	if (unlikely(!start_ts)) {
+		struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+		if (!head)
+			return false;
+		start_ts = tcp_skb_timestamp(head);
+	}
 
 	if (likely(timeout == 0)) {
 		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
 static void tcp_probe_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb = tcp_send_head(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
 	u32 start_ts;
 
-	if (tp->packets_out || !tcp_send_head(sk)) {
+	if (tp->packets_out || !skb) {
 		icsk->icsk_probes_out = 0;
 		return;
 	}
@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
 	 * corresponding system limit. We also implement similar policy when
 	 * we use RTO to probe window in tcp_retransmit_timer().
 	 */
-	start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+	start_ts = tcp_skb_timestamp(skb);
 	if (!start_ts)
-		tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+		skb->skb_mstamp = tp->tcp_mstamp;
 	else if (icsk->icsk_user_timeout &&
 		 (s32)(tcp_time_stamp(tp) - start_ts) >
 		 jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
 	if (!tp->packets_out)
 		goto out;
 
-	WARN_ON(tcp_write_queue_empty(sk));
+	WARN_ON(tcp_rtx_queue_empty(sk));
 
 	tp->tlp_high_seq = 0;
 
@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+		tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */
@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = keepalive_time_when(tp);
 
 	/* It is alive without keepalive 8) */
-	if (tp->packets_out || tcp_send_head(sk))
+	if (tp->packets_out || !tcp_write_queue_empty(sk))
 		goto resched;
 
 	elapsed = keepalive_time_elapsed(tp);
-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
                   ` (6 preceding siblings ...)
  2017-10-06  5:21 ` [PATCH net-next 7/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
@ 2017-10-06 23:31 ` David Miller
  2018-01-21 20:52   ` Tal Gilboa
  7 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2017-10-06 23:31 UTC (permalink / raw)
  To: edumazet; +Cc: ncardwell, ycheng, netdev, eric.dumazet

From: Eric Dumazet <edumazet@google.com>
Date: Thu,  5 Oct 2017 22:21:20 -0700

> This patch series implement RB-tree based retransmit queue for TCP,
> to better match modern BDP.

Indeed, there was a lot of resistance to this due to the overhead
for small retransmit queue sizes, but with today's scale this is
long overdue.

So, series applied, nice work!

Maybe we can look into dynamic schemes where, when the queue never
goes over N entries, we elide the rbtree and use a list.  I'm not
so sure how practical that would be.
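
Purely as an illustration of the kind of hybrid scheme floated above
(nothing from this series), the userspace C sketch below keeps a plain
sorted list while the queue stays small and migrates everything into a
tree once it crosses a threshold; POSIX tsearch()/tfind() stand in for
the kernel rbtree, and SMALL_QUEUE_MAX, struct pkt and the rtx_*
helpers are invented names, not kernel APIs.

#include <search.h>
#include <stdio.h>
#include <stdlib.h>

#define SMALL_QUEUE_MAX 8	/* hypothetical "N entries" threshold */

struct pkt {
	unsigned int seq;	/* start sequence of the segment */
	struct pkt *next;	/* only used while in list mode */
};

struct rtx_queue {
	int use_tree;		/* 0: sorted list, 1: tree */
	int count;
	struct pkt *head;	/* list mode */
	void *root;		/* tree mode, tsearch() root */
};

static int pkt_cmp(const void *a, const void *b)
{
	const struct pkt *x = a, *y = b;

	return (x->seq > y->seq) - (x->seq < y->seq);
}

/* Move every list entry into the tree once we cross the threshold. */
static void rtx_migrate_to_tree(struct rtx_queue *q)
{
	struct pkt *p = q->head;

	while (p) {
		struct pkt *next = p->next;

		p->next = NULL;
		tsearch(p, &q->root, pkt_cmp);
		p = next;
	}
	q->head = NULL;
	q->use_tree = 1;
}

static void rtx_insert(struct rtx_queue *q, struct pkt *p)
{
	if (!q->use_tree && q->count >= SMALL_QUEUE_MAX)
		rtx_migrate_to_tree(q);

	if (q->use_tree) {
		tsearch(p, &q->root, pkt_cmp);
	} else {
		struct pkt **link = &q->head;

		/* keep the list sorted by sequence number */
		while (*link && (*link)->seq < p->seq)
			link = &(*link)->next;
		p->next = *link;
		*link = p;
	}
	q->count++;
}

static struct pkt *rtx_lookup(struct rtx_queue *q, unsigned int seq)
{
	struct pkt key = { .seq = seq };
	struct pkt **node, *p;

	if (q->use_tree) {
		node = tfind(&key, &q->root, pkt_cmp);
		return node ? *node : NULL;
	}
	for (p = q->head; p; p = p->next)
		if (p->seq == seq)
			return p;
	return NULL;
}

int main(void)
{
	struct rtx_queue q = { 0 };
	unsigned int i;

	for (i = 0; i < 20; i++) {
		struct pkt *p = calloc(1, sizeof(*p));

		if (!p)
			return 1;
		p->seq = i * 1000;
		rtx_insert(&q, p);
	}
	printf("mode=%s, seq 7000 %s\n",
	       q.use_tree ? "tree" : "list",
	       rtx_lookup(&q, 7000) ? "found" : "missing");
	return 0;
}

The open question, as noted above, is whether the migration cost and
the extra branch pay off for flows that never leave list mode.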

Thanks!

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2017-10-06 23:31 ` [PATCH net-next 0/7] " David Miller
@ 2018-01-21 20:52   ` Tal Gilboa
  2018-01-21 23:47     ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: Tal Gilboa @ 2018-01-21 20:52 UTC (permalink / raw)
  To: David Miller, edumazet
  Cc: ncardwell, ycheng, netdev, eric.dumazet, Saeed Mahameed,
	Tariq Toukan, Amir Ancel

Hi Eric,
We have noticed a degradation on both of our drivers (mlx4 and mlx5) 
when running TCP. Exact scenario is single stream TCP with 1KB packets. 
The degradation is a steady 50% drop.
We tracked the offending commit to be:
75c119a ("tcp: implement rb-tree based retransmit queue")

Since the mlx4 and mlx5 code bases are completely different, and by looking at 
the changes in this commit, we believe the issue is external to the 
mlx4/5 drivers.

I see in the comment below you anticipated some overhead, but this may 
be too common a case to ignore.

Can you please review and consider reverting/fixing it?

Thanks,

Tal G.

On 10/7/2017 2:31 AM, David Miller wrote:
> From: Eric Dumazet <edumazet@google.com>
> Date: Thu,  5 Oct 2017 22:21:20 -0700
> 
>> This patch series implement RB-tree based retransmit queue for TCP,
>> to better match modern BDP.
> 
> Indeed, there was a lot of resistance to this due to the overhead
> for small retransmit queue sizes, but with today's scale this is
> long overdue.
> 
> So, series applied, nice work!
> 
> Maybe we can look into dynamic schemes where when the queue never
> goes over N entries we elide the rbtree and use a list.  I'm not
> so sure how practical that would be.
> 
> Thanks!
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-01-21 20:52   ` Tal Gilboa
@ 2018-01-21 23:47     ` Eric Dumazet
  2018-01-24 14:42       ` Tal Gilboa
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2018-01-21 23:47 UTC (permalink / raw)
  To: Tal Gilboa
  Cc: David Miller, ncardwell, ycheng, netdev, eric.dumazet,
	Saeed Mahameed, Tariq Toukan, Amir Ancel

On Sun, Jan 21, 2018 at 12:52 PM, Tal Gilboa <talgi@mellanox.com> wrote:
> Hi Eric,
> We have noticed a degradation on both of our drivers (mlx4 and mlx5) when
> running TCP. Exact scenario is single stream TCP with 1KB packets. The
> degradation is a steady 50% drop.
> We tracked the offending commit to be:
> 75c119a ("tcp: implement rb-tree based retransmit queue")
>
> Since mlx4 and mlx5 code base is completely different and by looking at the
> changes in this commit, we believe the issue is external to the mlx4/5
> drivers.
>
> I see in the comment below you anticipated some overhead, but this may be a
> too common case to ignore.
>
> Can you please review and consider reverting/fixing it?
>

Hi Tal

You have to provide way more details than a simple mail, asking for a
" revert or a fix " ...

On our GFEs, we got a win, while I was expecting a small overhead,
given the apparent complexity of dealing with RB tree instead of
linear list.

And on the stress scenario described in my patch set, the win was
absolutely enormous.

A " single strean TCP with 1KB packets"  is not something we need to optimize,
unless there is some really strange setup for one of your customers ?

Here we deal with millions of TCP flows, and this is what we need to
take care of.

Thanks.

> Thanks,
>
> Tal G.
>
>
> On 10/7/2017 2:31 AM, David Miller wrote:
>>
>> From: Eric Dumazet <edumazet@google.com>
>> Date: Thu,  5 Oct 2017 22:21:20 -0700
>>
>>> This patch series implement RB-tree based retransmit queue for TCP,
>>> to better match modern BDP.
>>
>>
>> Indeed, there was a lot of resistance to this due to the overhead
>> for small retransmit queue sizes, but with today's scale this is
>> long overdue.
>>
>> So, series applied, nice work!
>>
>> Maybe we can look into dynamic schemes where when the queue never
>> goes over N entries we elide the rbtree and use a list.  I'm not
>> so sure how practical that would be.
>>
>> Thanks!
>>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-01-21 23:47     ` Eric Dumazet
@ 2018-01-24 14:42       ` Tal Gilboa
  2018-01-24 15:09         ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: Tal Gilboa @ 2018-01-24 14:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, ncardwell, ycheng, netdev, eric.dumazet,
	Saeed Mahameed, Tariq Toukan, Amir Ancel

Hi Eric,
My choice of words in my comment was misplaced, and I apologize. It 
completely missed the point. I understand, of course, the importance of 
optimizing real-life scenarios.

We are currently evaluating this patch and if/how it might affect our 
customers. We would also evaluate your suggestion below.

We will contact you if and when we have a real concern.
Thanks.

On 1/22/2018 1:47 AM, Eric Dumazet wrote:
> On Sun, Jan 21, 2018 at 12:52 PM, Tal Gilboa <talgi@mellanox.com> wrote:
>> Hi Eric,
>> We have noticed a degradation on both of our drivers (mlx4 and mlx5) when
>> running TCP. Exact scenario is single stream TCP with 1KB packets. The
>> degradation is a steady 50% drop.
>> We tracked the offending commit to be:
>> 75c119a ("tcp: implement rb-tree based retransmit queue")
>>
>> Since mlx4 and mlx5 code base is completely different and by looking at the
>> changes in this commit, we believe the issue is external to the mlx4/5
>> drivers.
>>
>> I see in the comment below you anticipated some overhead, but this may be a
>> too common case to ignore.
>>
>> Can you please review and consider reverting/fixing it?
>>
> 
> Hi Tal
> 
> You have to provide way more details than a simple mail, asking for a
> " revert or a fix " ...
> 
> On our GFEs, we got a win, while I was expecting a small overhead,
> given the apparent complexity of dealing with RB tree instead of
> linear list.
> 
> And on the stress scenario described in my patch set, the win was
> absolutely enormous.
> 
> A " single strean TCP with 1KB packets"  is not something we need to optimize,
> unless there is some really strange setup for one of your customers ?
> 
> Here we deal with millions of TCP flows, and this is what we need to
> take care of.
> 
> Thanks.
> 
>> Thanks,
>>
>> Tal G.
>>
>>
>> On 10/7/2017 2:31 AM, David Miller wrote:
>>>
>>> From: Eric Dumazet <edumazet@google.com>
>>> Date: Thu,  5 Oct 2017 22:21:20 -0700
>>>
>>>> This patch series implement RB-tree based retransmit queue for TCP,
>>>> to better match modern BDP.
>>>
>>>
>>> Indeed, there was a lot of resistance to this due to the overhead
>>> for small retransmit queue sizes, but with today's scale this is
>>> long overdue.
>>>
>>> So, series applied, nice work!
>>>
>>> Maybe we can look into dynamic schemes where when the queue never
>>> goes over N entries we elide the rbtree and use a list.  I'm not
>>> so sure how practical that would be.
>>>
>>> Thanks!
>>>
>>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-01-24 14:42       ` Tal Gilboa
@ 2018-01-24 15:09         ` Eric Dumazet
  2018-02-06 13:51           ` Tal Gilboa
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2018-01-24 15:09 UTC (permalink / raw)
  To: Tal Gilboa, Eric Dumazet
  Cc: David Miller, ncardwell, ycheng, netdev, Saeed Mahameed,
	Tariq Toukan, Amir Ancel

On Wed, 2018-01-24 at 16:42 +0200, Tal Gilboa wrote:
> Hi Eric,
> My choice of words in my comment was misplaced, and I apologize. It 
> completely missed the point. I understand, of course, the importance of 
> optimizing real-life scenarios.
> 
> We are currently evaluating this patch and if/how it might affect our 
> customers. We would also evaluate your suggestion below.
> 
> We will contact you if and when we have a real concern.
> Thanks.

Sure, I am curious how a 50% regression can be possible as a matter of
fact, so please update even if this is caused by some specific synthetic
test conditions.

Thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-01-24 15:09         ` Eric Dumazet
@ 2018-02-06 13:51           ` Tal Gilboa
  2018-02-06 14:19             ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: Tal Gilboa @ 2018-02-06 13:51 UTC (permalink / raw)
  To: Eric Dumazet, Eric Dumazet
  Cc: David Miller, ncardwell, ycheng, netdev, Saeed Mahameed,
	Tariq Toukan, Amir Ancel

On 1/24/2018 5:09 PM, Eric Dumazet wrote:
> On Wed, 2018-01-24 at 16:42 +0200, Tal Gilboa wrote:
>> Hi Eric,
>> My choice of words in my comment was misplaced, and I apologize. It
>> completely missed the point. I understand, of course, the importance of
>> optimizing real-life scenarios.
>>
>> We are currently evaluating this patch and if/how it might affect our
>> customers. We would also evaluate your suggestion below.
>>
>> We will contact you if and when we have a real concern.
>> Thanks.
> 
> Sure, I am curious how a 50% regression can be possible as a matter of
> fact, so please update even if this is caused by some specific synthetic
> test conditions.
> 
> Thanks.
> 

Sorry for the delay in the response. I ran super_netperf for 64B, 128B, 
256B and 512B and 500, 1000, 2000 and 4000 streams and compared these 
(consecutive) commits:
Base - f331981 tcp: pass previous skb to tcp_shifted_skb()
rb_tree - 75c119a tcp: implement rb-tree based retransmit queue

I got lower BW with rb-tree for all cases.
Example - 2000 streams results (in Gb/s):
size | Base | rb-tree | degradation
  64  | 25.6 | 23.3    | -9%
  128 | 52.8 | 44.43   | -16%
  256 | 89.8 | 66.1    | -26.5%
  512 | 87.7 | 67.8    | -22.7%

I'm currently working on improving our CPU utilization in TX flow (by 
better utilizing payload aggregation mechanisms). It somewhat improves 
the rb-tree results when applied on top of it, but not for all cases and 
not to the "base" results.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-02-06 13:51           ` Tal Gilboa
@ 2018-02-06 14:19             ` Eric Dumazet
  2018-02-06 15:22               ` David Laight
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2018-02-06 14:19 UTC (permalink / raw)
  To: Tal Gilboa
  Cc: Eric Dumazet, David Miller, ncardwell, ycheng, netdev,
	Saeed Mahameed, Tariq Toukan, Amir Ancel

On Tue, Feb 6, 2018 at 5:51 AM, Tal Gilboa <talgi@mellanox.com> wrote:
> On 1/24/2018 5:09 PM, Eric Dumazet wrote:
>>
>> On Wed, 2018-01-24 at 16:42 +0200, Tal Gilboa wrote:
>>>
>>> Hi Eric,
>>> My choice of words in my comment was misplaced, and I apologize. It
>>> completely missed the point. I understand, of course, the importance of
>>> optimizing real-life scenarios.
>>>
>>> We are currently evaluating this patch and if/how it might affect our
>>> customers. We would also evaluate your suggestion below.
>>>
>>> We will contact you if and when we have a real concern.
>>> Thanks.
>>
>>
>> Sure, I am curious how a 50% regression can be possible as a matter of
>> fact, so please update even if this is caused by some specific synthetic
>> test conditions.
>>
>> Thanks.
>>
>
> Sorry for the delay in the response. I ran super_netperf for 64B, 128B, 256B
> and 512B and 500, 1000, 2000 and 4000 streams and compared these
> (consecutive) commits:
> Base - f331981 tcp: pass previous skb to tcp_shifted_skb()
> rb_tree - 75c119a tcp: implement rb-tree based retransmit queue
>
> I got lower BW with rb-tree for all cases.
> Example - 2000 streams results (in Gb/s):
> size | Base | rb-tree | degradation
>  64  | 25.6 | 23.3    | -9%
>  128 | 52.8 | 44.43   | -16%
>  256 | 89.8 | 66.1    | -26.5%
>  512 | 87.7 | 67.8    | -22.7%
>
> I'm currently working on improving our CPU utilization in TX flow (by better
> utilizing payload aggregation mechanisms). It somewhat improves the rb-tree
> results when applied on top of it, but not for all cases and not to the
> "base" results.


Hi

Please give exact details.
Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little sense.
We are not optimizing stack for pathological cases, sorry.

If you are using MSG_EOR to force silly skbs in the rtx queue, then
you should not do that ...
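
For reference, the MSG_EOR case mentioned above looks like the sketch
below (userspace C; send_small_record() is an invented helper, not a
kernel or netperf API): passing MSG_EOR on every small send() marks
each chunk as an end of record, so TCP will not coalesce those skbs in
the retransmit queue, while a plain send() leaves autocorking and TSO
free to aggregate.

#include <sys/types.h>
#include <sys/socket.h>

/* force_eor != 0 reproduces the "silly skbs" case: every small record
 * gets pinned into its own skb and cannot be collapsed later.
 */
static ssize_t send_small_record(int fd, const void *buf, size_t len,
				 int force_eor)
{
	return send(fd, buf, len, force_eor ? MSG_EOR : 0);
}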

Thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-02-06 14:19             ` Eric Dumazet
@ 2018-02-06 15:22               ` David Laight
  2018-02-06 15:52                 ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: David Laight @ 2018-02-06 15:22 UTC (permalink / raw)
  To: 'Eric Dumazet', Tal Gilboa
  Cc: Eric Dumazet, David Miller, ncardwell, ycheng, netdev,
	Saeed Mahameed, Tariq Toukan, Amir Ancel

From: Eric Dumazet
> Sent: 06 February 2018 14:20
...
> Please give exact details.
> Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little sense.
> We are not optimizing stack for pathological cases, sorry.

There are plenty of workloads which are not bulk data and where multiple
small buffers get sent at unknown intervals (which may be back to back).
Such connections have to have Nagle disabled because the Nagle delays
are 'horrid'.
Clearly lost packets can cause delays, but they are rare on local networks.

	David


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-02-06 15:22               ` David Laight
@ 2018-02-06 15:52                 ` Eric Dumazet
  2018-02-06 16:27                   ` Tal Gilboa
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2018-02-06 15:52 UTC (permalink / raw)
  To: David Laight, 'Eric Dumazet', Tal Gilboa
  Cc: David Miller, ncardwell, ycheng, netdev, Saeed Mahameed,
	Tariq Toukan, Amir Ancel

On Tue, 2018-02-06 at 15:22 +0000, David Laight wrote:
> From: Eric Dumazet
> > Sent: 06 February 2018 14:20
> 
> ...
> > Please give exact details.
> > Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little sense.
> > We are not optimizing stack for pathological cases, sorry.
> 
> There are plenty of workloads which are not bulk data and where multiple
> small buffers get sent at unknown intervals (which may be back to back).
> Such connections have to have Nagle disabled because the Nagle delays
> are 'horrid'.
> Clearly lost packets can cause delays, but they are rare on local networks.

Auto corking makes sure aggregation happens, even when Nagle is in
the picture.

netperf -- -m 256    will still cook 64KB TSO packets

netperf is not adding delays between each send(), unless it has been
modified.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-02-06 15:52                 ` Eric Dumazet
@ 2018-02-06 16:27                   ` Tal Gilboa
  2018-02-06 17:05                     ` Eric Dumazet
  0 siblings, 1 reply; 19+ messages in thread
From: Tal Gilboa @ 2018-02-06 16:27 UTC (permalink / raw)
  To: Eric Dumazet, David Laight, 'Eric Dumazet'
  Cc: David Miller, ncardwell, ycheng, netdev, Saeed Mahameed,
	Tariq Toukan, Amir Ancel

On 2/6/2018 5:52 PM, Eric Dumazet wrote:
> On Tue, 2018-02-06 at 15:22 +0000, David Laight wrote:
>> From: Eric Dumazet
>>> Sent: 06 February 2018 14:20
>>
>> ...
>>> Please give exact details.
>>> Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little sense.
>>> We are not optimizing stack for pathological cases, sorry.
>>
>> There are plenty of workloads which are not bulk data and where multiple
>> small buffers get sent at unknown intervals (which may be back to back).
>> Such connections have to have Nagle disabled because the Nagle delays
>> are 'horrid'.
>> Clearly lost packets can cause delays, but they are rare on local networks.
> 
> Auto corking makes sure aggregation happens, even for when Nagle is in
> the picture.

> 
> netperf -- -m 256    will still cook 64KB TSO packets

This is what we would have liked to see, but auto corking isn't forcing 
64KB TSO packets. Under certain conditions, specifically when the TX queue 
is empty, it would send the SKB for transmission even if it isn't full:
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
	       skb != tcp_write_queue_head(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
When skb == tcp_write_queue_head(sk), corking is not done. This is part of 
the optimization for mlx5 driver I've mentioned. If we can better 
utilize auto corking we shouldn't have an issue.

> 
> netperf is not adding delays between each send(), unless it has been
> modified.
> 
> 

I ran this command:
./super_netperf 2000 -H <IP> -l 30 -f g -- -m $size
I didn't change netperf in any way.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue
  2018-02-06 16:27                   ` Tal Gilboa
@ 2018-02-06 17:05                     ` Eric Dumazet
  0 siblings, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2018-02-06 17:05 UTC (permalink / raw)
  To: Tal Gilboa
  Cc: Eric Dumazet, David Laight, David Miller, ncardwell, ycheng,
	netdev, Saeed Mahameed, Tariq Toukan, Amir Ancel

On Tue, Feb 6, 2018 at 8:27 AM, Tal Gilboa <talgi@mellanox.com> wrote:
> On 2/6/2018 5:52 PM, Eric Dumazet wrote:
>>
>> On Tue, 2018-02-06 at 15:22 +0000, David Laight wrote:
>>>
>>> From: Eric Dumazet
>>>>
>>>> Sent: 06 February 2018 14:20
>>>
>>>
>>> ...
>>>>
>>>> Please give exact details.
>>>> Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little
>>>> sense.
>>>> We are not optimizing stack for pathological cases, sorry.
>>>
>>>
>>> There are plenty of workloads which are not bulk data and where multiple
>>> small buffers get sent at unknown intervals (which may be back to back).
>>> Such connections have to have Nagle disabled because the Nagle delays
>>> are 'horrid'.
>>> Clearly lost packets can cause delays, but they are rare on local
>>> networks.
>>
>>
>> Auto corking makes sure aggregation happens, even for when Nagle is in
>> the picture.
>
>
>>
>> netperf -- -m 256    will still cook 64KB TSO packets
>
>
> This is what we would have liked to see, but auto corking isn't forcing 64KB
> TSO packets. Under certain conditions, specifically when TX queue is empty,
> it would send the SKB to transmit even if it isn't full:

Yes.

Auto corking does not predict the future, nor does it arm a high
resolution timer when the application does a send(small_size).

This packet is sent immediately, as instructed by the application and by
normal TCP behavior.

But a second or third packet would detect the condition.

Unless a driver does skb_orphan() too early, breaking back pressure.

> static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
>                                 int size_goal)
> {
>         return skb->len < size_goal &&
>                sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
>                skb != tcp_write_queue_head(sk) &&
>                refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
> }
> When skb == tcp_write_queue_head(sk), corking is not done. This is part of the
> optimization for mlx5 driver I've mentioned. If we can better utilize auto
> corking we shouldn't have an issue.

Or not issue expensive system calls for small payloads. stdio was
invented a while back :)

Meltdown/Spectre mitigations put a high price on system calls nowadays.
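
A minimal userspace sketch of that stdio-style batching (nothing from
this thread; the 4 KB threshold and the batch_* names are arbitrary):
buffer small records in the application and flush them with a single
send(), so many records cost one system call instead of one each.

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>

#define BATCH_SIZE 4096			/* arbitrary flush threshold */

struct batcher {
	int fd;
	size_t len;
	char buf[BATCH_SIZE];
};

static ssize_t batch_flush(struct batcher *b)
{
	ssize_t ret = 0;

	if (b->len) {
		/* one send() for many small records; partial sends
		 * are not handled in this sketch
		 */
		ret = send(b->fd, b->buf, b->len, 0);
		b->len = 0;
	}
	return ret;
}

static ssize_t batch_send(struct batcher *b, const void *data, size_t len)
{
	if (len >= sizeof(b->buf))	/* too big to batch, send as is */
		return send(b->fd, data, len, 0);
	if (b->len + len > sizeof(b->buf))
		batch_flush(b);
	memcpy(b->buf + b->len, data, len);
	b->len += len;
	return (ssize_t)len;
}

Whether the added latency is acceptable is the same trade-off the
autocorking discussion above is about.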

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2018-02-06 17:05 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-06  5:21 [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 1/7] net: add rb_to_skb() and other rb tree helpers Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 2/7] tcp: uninline tcp_write_queue_purge() Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 3/7] tcp: tcp_tx_timestamp() cleanup Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 4/7] tcp: tcp_mark_head_lost() optimization Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 5/7] tcp: reduce tcp_fastretrans_alert() verbosity Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 6/7] tcp: pass previous skb to tcp_shifted_skb() Eric Dumazet
2017-10-06  5:21 ` [PATCH net-next 7/7] tcp: implement rb-tree based retransmit queue Eric Dumazet
2017-10-06 23:31 ` [PATCH net-next 0/7] " David Miller
2018-01-21 20:52   ` Tal Gilboa
2018-01-21 23:47     ` Eric Dumazet
2018-01-24 14:42       ` Tal Gilboa
2018-01-24 15:09         ` Eric Dumazet
2018-02-06 13:51           ` Tal Gilboa
2018-02-06 14:19             ` Eric Dumazet
2018-02-06 15:22               ` David Laight
2018-02-06 15:52                 ` Eric Dumazet
2018-02-06 16:27                   ` Tal Gilboa
2018-02-06 17:05                     ` Eric Dumazet
