* [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Eric Dumazet @ 2016-05-01 23:47 UTC
To: David Miller; +Cc: netdev, Dave Taht, Jonathan Morton
From: Eric Dumazet <edumazet@google.com>
In the presence of inelastic flows and stress, we can call
fq_codel_drop() for every packet entering the fq_codel qdisc.
fq_codel_drop() is quite expensive, as it does a linear scan
of 4 KB of memory to find a fat flow.
Once found, it drops the oldest packet of this flow.
Instead of dropping a single packet, try to drop 50% of the backlog
of this fat flow, with a configurable limit of 64 packets per round.
TCA_FQ_CODEL_DROP_BATCH_SIZE is the new attribute to make this
limit configurable.
With this strategy the 4 KB search is amortized to a single cache line
per drop [1], so fq_codel_drop() no longer appears at the top of the
kernel profile in the presence of a few inelastic flows.
[1] Assuming a 64-byte cache line and 1024 buckets
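For reference, the arithmetic behind [1] (a back-of-the-envelope check,
using the 4-byte u32 entries of q->backlogs[]):

    1024 buckets * 4 bytes per q->backlogs[] entry = 4096 bytes = 4 KB
    4096 bytes / 64 bytes per cache line = 64 cache lines per scan
    64 cache lines / up to 64 drops per scan ~= 1 cache line per drop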
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dave Taht <dave.taht@gmail.com>
Cc: Jonathan Morton <chromatix99@gmail.com>
---
include/uapi/linux/pkt_sched.h | 1
net/sched/sch_fq_codel.c | 64 +++++++++++++++++++++----------
2 files changed, 46 insertions(+), 19 deletions(-)
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 1c78c7454c7c..a11afecd4482 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -718,6 +718,7 @@ enum {
TCA_FQ_CODEL_FLOWS,
TCA_FQ_CODEL_QUANTUM,
TCA_FQ_CODEL_CE_THRESHOLD,
+ TCA_FQ_CODEL_DROP_BATCH_SIZE,
__TCA_FQ_CODEL_MAX
};
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5e420b3d4ab..e7b42b0d5145 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -59,6 +59,7 @@ struct fq_codel_sched_data {
u32 flows_cnt; /* number of flows */
u32 perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ u32 drop_batch_size;
struct codel_params cparams;
struct codel_stats cstats;
u32 drop_overlimit;
@@ -135,17 +136,20 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
skb->next = NULL;
}
-static unsigned int fq_codel_drop(struct Qdisc *sch)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
unsigned int maxbacklog = 0, idx = 0, i, len;
struct fq_codel_flow *flow;
+ unsigned int threshold;
- /* Queue is full! Find the fat flow and drop packet from it.
+ /* Queue is full! Find the fat flow and drop packet(s) from it.
* This might sound expensive, but with 1024 flows, we scan
* 4KB of memory, and we don't need to handle a complex tree
* in fast path (packet queue/enqueue) with many cache misses.
+ * In stress mode, we'll try to drop 64 packets from the flow,
+ * amortizing this linear lookup to one cache line per drop.
*/
for (i = 0; i < q->flows_cnt; i++) {
if (q->backlogs[i] > maxbacklog) {
@@ -153,15 +157,24 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
idx = i;
}
}
+
+ /* Our goal is to drop half of this fat flow backlog */
+ threshold = maxbacklog >> 1;
+
flow = &q->flows[idx];
- skb = dequeue_head(flow);
- len = qdisc_pkt_len(skb);
+ len = 0;
+ i = 0;
+ do {
+ skb = dequeue_head(flow);
+ len += qdisc_pkt_len(skb);
+ kfree_skb(skb);
+ } while (++i < max_packets && len < threshold);
+
+ flow->dropped += i;
q->backlogs[idx] -= len;
- sch->q.qlen--;
- qdisc_qstats_drop(sch);
- qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
- flow->dropped++;
+ sch->qstats.drops += i;
+ sch->qstats.backlog -= len;
+ sch->q.qlen -= i;
return idx;
}
@@ -170,14 +183,14 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
unsigned int prev_backlog;
prev_backlog = sch->qstats.backlog;
- fq_codel_drop(sch);
+ fq_codel_drop(sch, 1U);
return prev_backlog - sch->qstats.backlog;
}
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
- unsigned int idx, prev_backlog;
+ unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
@@ -206,16 +219,22 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_SUCCESS;
prev_backlog = sch->qstats.backlog;
- q->drop_overlimit++;
- /* Return Congestion Notification only if we dropped a packet
- * from this flow.
+ prev_qlen = sch->q.qlen;
+
+ /* fq_codel_drop() is quite expensive, as it performs a linear search
+ * in q->backlogs[] to find a fat flow.
+ * So instead of dropping a single packet, drop half of its backlog
+ * with a limit of 64 packets, so as not to add too big a cpu spike here.
*/
- if (fq_codel_drop(sch) == idx)
- return NET_XMIT_CN;
+ ret = fq_codel_drop(sch, q->drop_batch_size);
+
+ q->drop_overlimit += prev_qlen - sch->q.qlen;
- /* As we dropped a packet, better let upper stack know this */
- qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
- return NET_XMIT_SUCCESS;
+ /* As we dropped packet(s), better let upper stack know this */
+ qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
+ prev_backlog - sch->qstats.backlog);
+
+ return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;
}
/* This is the specific function called from codel_dequeue()
@@ -335,6 +354,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
[TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
[TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
};
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -386,6 +406,9 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_CODEL_QUANTUM])
q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+ if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])
+ q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = fq_codel_dequeue(sch);
@@ -431,6 +454,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
sch->limit = 10*1024;
q->flows_cnt = 1024;
+ q->drop_batch_size = 64;
q->quantum = psched_mtu(qdisc_dev(sch));
q->perturbation = prandom_u32();
INIT_LIST_HEAD(&q->new_flows);
@@ -489,6 +513,8 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
q->cparams.ecn) ||
nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
+ q->drop_batch_size) ||
nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
q->flows_cnt))
goto nla_put_failure;
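As a usage sketch (not part of the patch): with an iproute2 build that
understands the new attribute, tuning the batch limit could look like
the following; the "drop_batch" keyword spelling is an assumption here,
pending the matching iproute2 change.

    tc qdisc replace dev eth0 root fq_codel limit 1024 drop_batch 64
    tc -s qdisc show dev eth0   # drop counters now advance in batches under stress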
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Jesper Dangaard Brouer @ 2016-05-02 7:49 UTC
To: Eric Dumazet; +Cc: brouer, David Miller, netdev, Dave Taht, Jonathan Morton
On Sun, 01 May 2016 16:47:26 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> In the presence of inelastic flows and stress, we can call
> fq_codel_drop() for every packet entering the fq_codel qdisc.
>
> fq_codel_drop() is quite expensive, as it does a linear scan
> of 4 KB of memory to find a fat flow.
> Once found, it drops the oldest packet of this flow.
>
> Instead of dropping a single packet, try to drop 50% of the backlog
> of this fat flow, with a configurable limit of 64 packets per round.
>
> TCA_FQ_CODEL_DROP_BATCH_SIZE is the new attribute to make this
> limit configurable.
>
> With this strategy the 4 KB search is amortized to a single cache line
> per drop [1], so fq_codel_drop() no longer appears at the top of the
> kernel profile in the presence of a few inelastic flows.
>
> [1] Assuming a 64-byte cache line and 1024 buckets
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Dave Taht <dave.taht@gmail.com>
> Cc: Jonathan Morton <chromatix99@gmail.com>
> ---
> include/uapi/linux/pkt_sched.h | 1
> net/sched/sch_fq_codel.c | 64 +++++++++++++++++++++----------
> 2 files changed, 46 insertions(+), 19 deletions(-)
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 1c78c7454c7c..a11afecd4482 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -718,6 +718,7 @@ enum {
> TCA_FQ_CODEL_FLOWS,
> TCA_FQ_CODEL_QUANTUM,
> TCA_FQ_CODEL_CE_THRESHOLD,
> + TCA_FQ_CODEL_DROP_BATCH_SIZE,
> __TCA_FQ_CODEL_MAX
> };
>
> diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
> index a5e420b3d4ab..e7b42b0d5145 100644
> --- a/net/sched/sch_fq_codel.c
> +++ b/net/sched/sch_fq_codel.c
> @@ -59,6 +59,7 @@ struct fq_codel_sched_data {
> u32 flows_cnt; /* number of flows */
> u32 perturbation; /* hash perturbation */
> u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
> + u32 drop_batch_size;
> struct codel_params cparams;
> struct codel_stats cstats;
> u32 drop_overlimit;
> @@ -135,17 +136,20 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
> skb->next = NULL;
> }
>
> -static unsigned int fq_codel_drop(struct Qdisc *sch)
> +static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
> {
> struct fq_codel_sched_data *q = qdisc_priv(sch);
> struct sk_buff *skb;
> unsigned int maxbacklog = 0, idx = 0, i, len;
> struct fq_codel_flow *flow;
> + unsigned int threshold;
>
> - /* Queue is full! Find the fat flow and drop packet from it.
> + /* Queue is full! Find the fat flow and drop packet(s) from it.
> * This might sound expensive, but with 1024 flows, we scan
> * 4KB of memory, and we don't need to handle a complex tree
> * in fast path (packet queue/enqueue) with many cache misses.
> + * In stress mode, we'll try to drop 64 packets from the flow,
> + * amortizing this linear lookup to one cache line per drop.
> */
> for (i = 0; i < q->flows_cnt; i++) {
> if (q->backlogs[i] > maxbacklog) {
> @@ -153,15 +157,24 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
> idx = i;
> }
> }
> +
> + /* Our goal is to drop half of this fat flow backlog */
> + threshold = maxbacklog >> 1;
> +
> flow = &q->flows[idx];
> - skb = dequeue_head(flow);
> - len = qdisc_pkt_len(skb);
> + len = 0;
> + i = 0;
> + do {
> + skb = dequeue_head(flow);
> + len += qdisc_pkt_len(skb);
> + kfree_skb(skb);
> + } while (++i < max_packets && len < threshold);
> +
> + flow->dropped += i;
What about using bulk free of SKBs here?
There is a very high probability that we are hitting SLUB slowpath,
which involves an expensive locked cmpxchg_double per packet. Instead
we can amortize this cost via kmem_cache_free_bulk().
Maybe extend kfree_skb_list() to hide the slab/kmem_cache call?
> q->backlogs[idx] -= len;
> - sch->q.qlen--;
> - qdisc_qstats_drop(sch);
> - qdisc_qstats_backlog_dec(sch, skb);
> - kfree_skb(skb);
> - flow->dropped++;
> + sch->qstats.drops += i;
> + sch->qstats.backlog -= len;
> + sch->q.qlen -= i;
> return idx;
> }
>
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Eric Dumazet @ 2016-05-02 14:34 UTC
To: Jesper Dangaard Brouer; +Cc: David Miller, netdev, Dave Taht, Jonathan Morton
On Mon, 2016-05-02 at 09:49 +0200, Jesper Dangaard Brouer wrote:
> What about using bulk free of SKBs here?
>
> There is a very high probability that we are hitting SLUB slowpath,
> which involves an expensive locked cmpxchg_double per packet. Instead
> we can amortize this cost via kmem_cache_free_bulk().
>
> Maybe extend kfree_skb_list() to hide the slab/kmem_cache call?
Sounds tricky, because of skb destructors; skbs are complex objects.
For each skb, we need to free the frags, skb->head, and the skb itself.
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Jesper Dangaard Brouer @ 2016-05-02 16:00 UTC
To: Eric Dumazet; +Cc: David Miller, netdev, Dave Taht, Jonathan Morton, brouer
On Mon, 02 May 2016 07:34:28 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2016-05-02 at 09:49 +0200, Jesper Dangaard Brouer wrote:
>
> > What about using bulk free of SKBs here?
> >
> > There is a very high probability that we are hitting SLUB slowpath,
> > which involves an expensive locked cmpxchg_double per packet. Instead
> > we can amortize this cost via kmem_cache_free_bulk().
> >
> > Maybe extend kfree_skb_list() to hide the slab/kmem_cache call?
>
> Sounds tricky, because of skb destructors. skb are complex objects.
>
> For each skb, need to free the frags, skb->head, and skb.
It is not that complicated: inside kfree_skb_list() we just call
skb_release_all(skb) on each SKB first, and then bulk free the SKBs
themselves at the end. For an example, see _kfree_skb_defer().
The question is where to store the SKB array needed by kmem_cache_free_bulk.
The easy option is just to use the stack of kfree_skb_list(), but we
have to be careful about the stack size: it might not be so good,
because skb_release_all() can go deep and, via skb_release_data(),
invoke kfree_skb_list() a second time.
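For illustration, a rough sketch of the idea (the helper name and the
on-stack batch size of 16 are assumptions; skb_release_all() and
skbuff_head_cache live in net/core/skbuff.c, where such a helper would
sit, and the skb->users/clone checks of a production version are
omitted for brevity):

	/* Release each skb's data (frags, skb->head, destructor) first,
	 * then hand the skb structs back to the slab in bulk.
	 */
	static void kfree_skb_list_bulk(struct sk_buff *segs)
	{
		void *skbs[16];		/* small on-stack batch */
		int n = 0;

		while (segs) {
			struct sk_buff *next = segs->next;

			skb_release_all(segs);
			skbs[n++] = segs;
			if (n == ARRAY_SIZE(skbs)) {
				kmem_cache_free_bulk(skbuff_head_cache, n, skbs);
				n = 0;
			}
			segs = next;
		}
		if (n)
			kmem_cache_free_bulk(skbuff_head_cache, n, skbs);
	}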
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Eric Dumazet @ 2016-05-02 16:12 UTC
To: Jesper Dangaard Brouer; +Cc: David Miller, netdev, Dave Taht, Jonathan Morton
On Mon, 2016-05-02 at 18:00 +0200, Jesper Dangaard Brouer wrote:
> It is not that complicated, inside kfree_skb_list(), we just call
> skb_release_all(skb) on each SKB first, and then bulk free the SKB's
> themselves in the end. Example see, _kfree_skb_defer().
>
> The question is where to store the SKB array needed by kmem_cache_free_bulk.
>
> The easy option is just to use the stack of kfree_skb_list(), but we
> have to be careful about the stack size, it might not be so good
> because skb_release_all() can be deep and via skb_release_data() invoke
> kfree_skb_list() a second time.
>
It sounds like you are reinventing the wheel ;)
If drivers use napi_consume_skb(), qdisc should be able to use it the
same, since BH are disabled in their ->enqueue()/->dequeue() handlers.
This would be a separate patch of course.
This fq_codel fix might need to be backported.
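For illustration, a minimal sketch of that suggestion applied to the
batch-drop loop in fq_codel_drop(), not part of the posted patch; a
nonzero budget tells napi_consume_skb() that BH-disabled bulk freeing
is safe, though it fires the consume tracepoint rather than the drop
one (which is what the napi_kfree_skb() idea in the next reply is
about):

	do {
		skb = dequeue_head(flow);
		len += qdisc_pkt_len(skb);
		napi_consume_skb(skb, 1);	/* instead of kfree_skb(skb) */
	} while (++i < max_packets && len < threshold);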
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Jesper Dangaard Brouer @ 2016-05-02 17:15 UTC
To: Eric Dumazet; +Cc: David Miller, netdev, Dave Taht, Jonathan Morton, brouer
On Mon, 02 May 2016 09:12:51 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2016-05-02 at 18:00 +0200, Jesper Dangaard Brouer wrote:
>
> > It is not that complicated, inside kfree_skb_list(), we just call
> > skb_release_all(skb) on each SKB first, and then bulk free the SKB's
> > themselves in the end. Example see, _kfree_skb_defer().
> >
> > The question is where to store the SKB array needed by kmem_cache_free_bulk.
> >
> > The easy option is just to use the stack of kfree_skb_list(), but we
> > have to be careful about the stack size, it might not be so good
> > because skb_release_all() can be deep and via skb_release_data() invoke
> > kfree_skb_list() a second time.
> >
>
> It sounds you are reinventing the wheel ;)
>
> If drivers use napi_consume_skb(), qdisc should be able to use it the
> same, since BH are disabled in their ->enqueue()/->dequeue() handlers.
Oh, yes. That is true; we can just use napi_consume_skb(). Should we
have a napi_kfree_skb(), to get the trace_kfree_skb() correct?
> This would be a separate patch of course.
>
> This fq_codel fix might need to be backported.
Agreed. I ACK your patch.
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: Dave Taht @ 2016-05-03 1:07 UTC
To: Jesper Dangaard Brouer
Cc: Eric Dumazet, David Miller, netdev, Jonathan Morton
I have duplicated the issue on my own hardware. I would also like to
explore upping the codel count in this scenario at some point, but:
Acked-by: Dave Taht
* Re: [PATCH net-next] fq_codel: add batch ability to fq_codel_drop()
From: David Miller @ 2016-05-03 16:47 UTC
To: eric.dumazet; +Cc: netdev, dave.taht, chromatix99
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 01 May 2016 16:47:26 -0700
> From: Eric Dumazet <edumazet@google.com>
>
> In the presence of inelastic flows and stress, we can call
> fq_codel_drop() for every packet entering the fq_codel qdisc.
>
> fq_codel_drop() is quite expensive, as it does a linear scan
> of 4 KB of memory to find a fat flow.
> Once found, it drops the oldest packet of this flow.
>
> Instead of dropping a single packet, try to drop 50% of the backlog
> of this fat flow, with a configurable limit of 64 packets per round.
>
> TCA_FQ_CODEL_DROP_BATCH_SIZE is the new attribute to make this
> limit configurable.
>
> With this strategy the 4 KB search is amortized to a single cache line
> per drop [1], so fq_codel_drop() no longer appears at the top of the
> kernel profile in the presence of a few inelastic flows.
>
> [1] Assuming a 64-byte cache line and 1024 buckets
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Dave Taht <dave.taht@gmail.com>
> Cc: Jonathan Morton <chromatix99@gmail.com>
Applied, thanks Eric.