All of lore.kernel.org
 help / color / mirror / Atom feed
From: Neal Cardwell <ncardwell@google.com>
To: David Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org, Eric Dumazet <edumazet@google.com>,
	Van Jacobson <vanj@google.com>,
	Neal Cardwell <ncardwell@google.com>,
	Yuchung Cheng <ycheng@google.com>,
	Nandita Dukkipati <nanditad@google.com>,
	Soheil Hassas Yeganeh <soheil@google.com>
Subject: [PATCH net-next 03/14] net_sched: sch_fq: add low_rate_threshold parameter
Date: Fri, 16 Sep 2016 14:48:52 -0400	[thread overview]
Message-ID: <1474051743-13311-4-git-send-email-ncardwell@google.com> (raw)
In-Reply-To: <1474051743-13311-1-git-send-email-ncardwell@google.com>

From: Eric Dumazet <edumazet@google.com>

This commit adds to the fq module a low_rate_threshold parameter to
insert a delay after all packets if the socket requests a pacing rate
below the threshold.

This helps achieve more precise control of the sending rate with
low-rate paths, especially policers. The basic issue is that if a
congestion control module detects a policer at a certain rate, it may
want fq to be able to shape to that policed rate. That way the sender
can avoid policer drops by having the packets arrive at the policer at
or just under the policed rate.

The default threshold of 550Kbps was chosen analytically so that for
policers or links at 500Kbps or 512Kbps fq would very likely invoke
this mechanism, even if the pacing rate was briefly slightly above the
available bandwidth. This value was then empirically validated with
two years of production testing on YouTube video servers.

Signed-off-by: Van Jacobson <vanj@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_fq.c             | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 2382eed..f8e39db 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -792,6 +792,8 @@ enum {
 
 	TCA_FQ_ORPHAN_MASK,	/* mask applied to orphaned skb hashes */
 
+	TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
+
 	__TCA_FQ_MAX
 };
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index e5458b9..40ad4fc 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -94,6 +94,7 @@ struct fq_sched_data {
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
 	u32		orphan_mask;	/* mask for orphaned skb */
+	u32		low_rate_threshold;
 	struct rb_root	*fq_root;
 	u8		rate_enable;
 	u8		fq_trees_log;
@@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
-	u32 rate;
+	u32 rate, plen;
 
 	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
@@ -482,7 +483,7 @@ begin:
 	prefetch(&skb->end);
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (f->credit > 0 || !q->rate_enable)
+	if (!q->rate_enable)
 		goto out;
 
 	/* Do not pace locally generated ack packets */
@@ -493,8 +494,15 @@ begin:
 	if (skb->sk)
 		rate = min(skb->sk->sk_pacing_rate, rate);
 
+	if (rate <= q->low_rate_threshold) {
+		f->credit = 0;
+		plen = qdisc_pkt_len(skb);
+	} else {
+		plen = max(qdisc_pkt_len(skb), q->quantum);
+		if (f->credit > 0)
+			goto out;
+	}
 	if (rate != ~0U) {
-		u32 plen = max(qdisc_pkt_len(skb), q->quantum);
 		u64 len = (u64)plen * NSEC_PER_SEC;
 
 		if (likely(rate))
@@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
+	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (tb[TCA_FQ_FLOW_MAX_RATE])
 		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
 
+	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+		q->low_rate_threshold =
+			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
 	if (tb[TCA_FQ_RATE_ENABLE]) {
 		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
 
@@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->fq_root		= NULL;
 	q->fq_trees_log		= ilog2(1024);
 	q->orphan_mask		= 1024 - 1;
+	q->low_rate_threshold	= 550000 / 8;
 	qdisc_watchdog_init(&q->watchdog, sch);
 
 	if (opt)
@@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
 			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+			q->low_rate_threshold) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
-- 
2.8.0.rc3.226.g39d4020

  parent reply	other threads:[~2016-09-16 18:49 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-09-16 18:48 [PATCH net-next 00/14] tcp: BBR congestion control algorithm Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 01/14] lib/win_minmax: windowed min or max estimator Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 02/14] tcp: use windowed min filter library for TCP min_rtt estimation Neal Cardwell
2016-09-16 19:21   ` kbuild test robot
2016-09-16 19:25     ` Neal Cardwell
2016-09-16 18:48 ` Neal Cardwell [this message]
2016-09-16 18:48 ` [PATCH net-next 04/14] tcp: count packets marked lost for a TCP connection Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 05/14] tcp: track data delivery rate " Neal Cardwell
2016-09-16 21:38   ` kbuild test robot
2016-09-17 12:09     ` Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 06/14] tcp: track application-limited rate samples Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 07/14] tcp: export data delivery rate Neal Cardwell
2016-09-16 21:38   ` kbuild test robot
2016-09-17  3:56   ` kbuild test robot
2016-09-16 20:03     ` Neal Cardwell
2016-09-16 20:11       ` Eric Dumazet
2016-09-16 20:04     ` Eric Dumazet
2016-09-16 18:48 ` [PATCH net-next 08/14] tcp: allow congestion control module to request TSO skb segment count Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 09/14] tcp: export tcp_tso_autosize() and parameterize minimum number of TSO segments Neal Cardwell
2016-09-16 18:48 ` [PATCH net-next 10/14] tcp: export tcp_mss_to_mtu() for congestion control modules Neal Cardwell
2016-09-16 18:49 ` [PATCH net-next 11/14] tcp: allow congestion control to expand send buffer differently Neal Cardwell
2016-09-16 18:49 ` [PATCH net-next 12/14] tcp: new CC hook to set sending rate with rate_sample in any CA state Neal Cardwell
2016-09-16 18:49 ` [PATCH net-next 13/14] tcp: increase ICSK_CA_PRIV_SIZE from 64 bytes to 88 Neal Cardwell
2016-09-16 18:49 ` [PATCH net-next 14/14] tcp_bbr: add BBR congestion control Neal Cardwell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1474051743-13311-4-git-send-email-ncardwell@google.com \
    --to=ncardwell@google.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=nanditad@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=soheil@google.com \
    --cc=vanj@google.com \
    --cc=ycheng@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.