netdev.vger.kernel.org archive mirror
* [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers
@ 2019-05-04 23:48 Eric Dumazet
  2019-05-04 23:48 ` [PATCH net-next 1/2] net_sched: sch_fq: do not assume EDT packets are ordered Eric Dumazet
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Eric Dumazet @ 2019-05-04 23:48 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Willem de Bruijn, Eric Dumazet

Willem added GSO support to the UDP stack, greatly improving performance
of QUIC servers.

We also want to enable in-kernel pacing, which is possible thanks to the
EDT (Earliest Departure Time) model, since each sendmsg() can provide a
timestamp for the skbs.
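
For reference, a userspace sender can attach the desired departure time
through the existing SO_TXTIME / SCM_TXTIME socket interface; the sketch
below is illustrative only (connected UDP socket fd assumed, helper names
made up, error handling omitted):

#include <linux/net_tstamp.h>	/* struct sock_txtime */
#include <sys/socket.h>
#include <sys/uio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>

#ifndef SO_TXTIME
#define SO_TXTIME	61		/* asm-generic value; newer libc headers provide it */
#define SCM_TXTIME	SO_TXTIME
#endif

/* One-time setup: sch_fq compares skb->tstamp against CLOCK_MONOTONIC. */
static int enable_txtime(int fd)
{
	struct sock_txtime cfg = { .clockid = CLOCK_MONOTONIC, .flags = 0 };

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}

/* Send one datagram carrying its earliest departure time, in nanoseconds. */
static ssize_t send_at(int fd, const void *buf, size_t len, uint64_t txtime_ns)
{
	char control[CMSG_SPACE(sizeof(txtime_ns))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_TXTIME;
	cm->cmsg_len = CMSG_LEN(sizeof(txtime_ns));
	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

	return sendmsg(fd, &msg, 0);
}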

We have to change sch_fq so that packets can be fed in arbitrary EDT
order, and make sure that packet classification does not trust
unconnected sockets.

Note that this patch series is also a prerequisite for a future TCP change
enabling per-flow delays/reorders/losses to implement high-performance
TCP emulators.

Eric Dumazet (2):
  net_sched: sch_fq: do not assume EDT packets are ordered
  net_sched: sch_fq: handle non connected flows

 net/sched/sch_fq.c | 110 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 96 insertions(+), 14 deletions(-)

-- 
2.21.0.1020.gf2820cf01a-goog



* [PATCH net-next 1/2] net_sched: sch_fq: do not assume EDT packets are ordered
  2019-05-04 23:48 [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers Eric Dumazet
@ 2019-05-04 23:48 ` Eric Dumazet
  2019-05-04 23:48 ` [PATCH net-next 2/2] net_sched: sch_fq: handle non connected flows Eric Dumazet
  2019-05-07 19:09 ` [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers David Miller
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2019-05-04 23:48 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Willem de Bruijn, Eric Dumazet

The TCP stack makes sure packets for a given flow carry monotonically
increasing timestamps, but we want to allow UDP packets to use EDT as
well, so that QUIC servers can use in-kernel pacing.

This patch adds a per-flow rb-tree on which packets might
be stored. We still try to use the linear list for the
typical case where packets are queued with monotonically
increasing skb->tstamp, since queueing/dequeuing packets on
a standard list is O(1).

Note that the ability to store packets in arbitrary EDT
order will later allow us to implement a per-TCP-socket
mechanism adding delays (possibly with jitter) and reorders,
to build convenient network emulators.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_fq.c | 95 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 83 insertions(+), 12 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index d107c74767cd1d3258b7f038c0c3176db589a51f..ee138365ec45ee01cb10f149ae5b1d7635fa1185 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -54,10 +54,23 @@
 #include <net/tcp_states.h>
 #include <net/tcp.h>
 
+struct fq_skb_cb {
+	u64	        time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
 /*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotically increasing time_to_send, they are placed in O(1)
+ * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
  */
 struct fq_flow {
+	struct rb_root	t_root;
 	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
 	union {
 		struct sk_buff *tail;	/* last skb in the list */
@@ -298,6 +311,8 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		q->stat_allocation_errors++;
 		return &q->internal;
 	}
+	/* f->t_root is already zeroed after kmem_cache_zalloc() */
+
 	fq_flow_set_detached(f);
 	f->sk = sk;
 	if (skb->sk)
@@ -312,14 +327,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 	return f;
 }
 
+static struct sk_buff *fq_peek(struct fq_flow *flow)
+{
+	struct sk_buff *skb = skb_rb_first(&flow->t_root);
+	struct sk_buff *head = flow->head;
+
+	if (!skb)
+		return head;
+
+	if (!head)
+		return skb;
+
+	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+		return skb;
+	return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+			  struct sk_buff *skb)
+{
+	if (skb == flow->head) {
+		flow->head = skb->next;
+	} else {
+		rb_erase(&skb->rbnode, &flow->t_root);
+		skb->dev = qdisc_dev(sch);
+	}
+}
 
 /* remove one skb from head of flow queue */
 static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 {
-	struct sk_buff *skb = flow->head;
+	struct sk_buff *skb = fq_peek(flow);
 
 	if (skb) {
-		flow->head = skb->next;
+		fq_erase_head(sch, flow, skb);
 		skb_mark_not_on_list(skb);
 		flow->qlen--;
 		qdisc_qstats_backlog_dec(sch, skb);
@@ -330,15 +371,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 {
-	struct sk_buff *head = flow->head;
+	struct rb_node **p, *parent;
+	struct sk_buff *head, *aux;
 
-	skb->next = NULL;
-	if (!head)
-		flow->head = skb;
-	else
-		flow->tail->next = skb;
+	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
 
-	flow->tail = skb;
+	head = flow->head;
+	if (!head ||
+	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
+		if (!head)
+			flow->head = skb;
+		else
+			flow->tail->next = skb;
+		flow->tail = skb;
+		skb->next = NULL;
+		return;
+	}
+
+	p = &flow->t_root.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		parent = *p;
+		aux = rb_to_skb(parent);
+		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &flow->t_root);
 }
 
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -450,9 +512,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 		goto begin;
 	}
 
-	skb = f->head;
+	skb = fq_peek(f);
 	if (skb) {
-		u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
 					     f->time_next_packet);
 
 		if (now < time_next_packet) {
@@ -533,6 +595,15 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 static void fq_flow_purge(struct fq_flow *flow)
 {
+	struct rb_node *p = rb_first(&flow->t_root);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &flow->t_root);
+		rtnl_kfree_skbs(skb, skb);
+	}
 	rtnl_kfree_skbs(flow->head, flow->tail);
 	flow->head = NULL;
 	flow->qlen = 0;
-- 
2.21.0.1020.gf2820cf01a-goog



* [PATCH net-next 2/2] net_sched: sch_fq: handle non connected flows
  2019-05-04 23:48 [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers Eric Dumazet
  2019-05-04 23:48 ` [PATCH net-next 1/2] net_sched: sch_fq: do not assume EDT packets are ordered Eric Dumazet
@ 2019-05-04 23:48 ` Eric Dumazet
  2019-05-07 19:09 ` [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers David Miller
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2019-05-04 23:48 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Willem de Bruijn, Eric Dumazet

The FQ packet scheduler assumed that packets could be classified
based on their owning socket.

This means that if a UDP server uses one UDP socket to send
packets to different destinations, all packets land
in one FQ flow.

This is unfair, since each TCP flow has a unique bucket, meaning
that under pressure (fully utilised uplink), TCP flows get a
larger share of the bandwidth.

If we instead detect unconnected sockets, we can spread their packets
stochastically, using the packet's 4-tuple flow hash to pick a bucket.

This also means a QUIC server using one UDP socket will properly
spread its outgoing packets across different buckets, and in-kernel
pacing based on the EDT model no longer risks building a big rb-tree
on a single flow.
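
For illustration, this is the kind of sender the change targets: one
unconnected UDP socket spraying datagrams to many peers. A minimal
userspace sketch (the peer array and function name are hypothetical,
error handling omitted):

#include <netinet/in.h>
#include <sys/socket.h>
#include <stddef.h>

/* Each sendto() below hits a different destination; with this patch,
 * fq buckets the packets by their 4-tuple hash instead of piling them
 * all into the single flow owned by the (unconnected) socket.
 */
static void send_to_all_peers(int fd, const struct sockaddr_in *peers,
			      size_t npeers, const void *buf, size_t len)
{
	size_t i;

	for (i = 0; i < npeers; i++)
		sendto(fd, buf, len, 0,
		       (const struct sockaddr *)&peers[i],
		       sizeof(peers[i]));
}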

Note that a UDP application might provide skb->hash in an
ancillary message at sendmsg() time to avoid the cost of a flow
dissection in the fq packet scheduler.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_fq.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index ee138365ec45ee01cb10f149ae5b1d7635fa1185..26a94e5cd5dfae34109649b04a1ebcaafa0f545b 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -270,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		 */
 		sk = (struct sock *)((hash << 1) | 1UL);
 		skb_orphan(skb);
+	} else if (sk->sk_state == TCP_CLOSE) {
+		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+		/*
+		 * Sockets in TCP_CLOSE are non connected.
+		 * Typical use case is UDP sockets, they can send packets
+		 * with sendto() to many different destinations.
+		 * We probably could use a generic bit advertising
+		 * non connected sockets, instead of sk_state == TCP_CLOSE,
+		 * if we care enough.
+		 */
+		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
@@ -290,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 			 * It not, we need to refill credit with
 			 * initial quantum
 			 */
-			if (unlikely(skb->sk &&
+			if (unlikely(skb->sk == sk &&
 				     f->socket_hash != sk->sk_hash)) {
 				f->credit = q->initial_quantum;
 				f->socket_hash = sk->sk_hash;
@@ -315,7 +326,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 
 	fq_flow_set_detached(f);
 	f->sk = sk;
-	if (skb->sk)
+	if (skb->sk == sk)
 		f->socket_hash = sk->sk_hash;
 	f->credit = q->initial_quantum;
 
-- 
2.21.0.1020.gf2820cf01a-goog



* Re: [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers
  2019-05-04 23:48 [PATCH net-next 0/2] net_sched: sch_fq: enable in-kernel pacing for QUIC servers Eric Dumazet
  2019-05-04 23:48 ` [PATCH net-next 1/2] net_sched: sch_fq: do not assume EDT packets are ordered Eric Dumazet
  2019-05-04 23:48 ` [PATCH net-next 2/2] net_sched: sch_fq: handle non connected flows Eric Dumazet
@ 2019-05-07 19:09 ` David Miller
  2 siblings, 0 replies; 4+ messages in thread
From: David Miller @ 2019-05-07 19:09 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, willemb, eric.dumazet

From: Eric Dumazet <edumazet@google.com>
Date: Sat,  4 May 2019 16:48:52 -0700

> Willem added GSO support to the UDP stack, greatly improving performance
> of QUIC servers.
>
> We also want to enable in-kernel pacing, which is possible thanks to the
> EDT (Earliest Departure Time) model, since each sendmsg() can provide a
> timestamp for the skbs.
>
> We have to change sch_fq so that packets can be fed in arbitrary EDT
> order, and make sure that packet classification does not trust
> unconnected sockets.
>
> Note that this patch series is also a prerequisite for a future TCP change
> enabling per-flow delays/reorders/losses to implement high-performance
> TCP emulators.

Looks great, series applied.
