* [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
@ 2020-07-06 18:08 YU, Xiangning
2020-07-06 18:32 ` Stephen Hemminger
` (4 more replies)
0 siblings, 5 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-06 18:08 UTC (permalink / raw)
To: netdev
Lockless Token Bucket (LTB) is a qdisc implementation that controls the
use of outbound bandwidth on a shared link. With the help of lockless
qdisc, and by decoupling rate limiting and bandwidth sharing, LTB is
designed to scale in the cloud data centers.
Signed-off-by: Xiangning Yu <xiangning.yu@alibaba-inc.com>
---
include/uapi/linux/pkt_sched.h | 35 ++
net/sched/Kconfig | 12 +
net/sched/Makefile | 1 +
net/sched/sch_ltb.c | 1280 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 1328 insertions(+)
create mode 100644 net/sched/sch_ltb.c
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9e7c2c6..310a627 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -447,6 +447,41 @@ struct tc_htb_xstats {
__s32 ctokens;
};
+/* LTB section */
+
+#define TC_LTB_PROTOVER 3 /* the same as LTB and TC's major */
+#define TC_LTB_NUMPRIO 16
+enum {
+ TCA_LTB_UNSPEC,
+ TCA_LTB_PARMS,
+ TCA_LTB_INIT,
+ TCA_LTB_RATE64,
+ TCA_LTB_CEIL64,
+ TCA_LTB_PAD,
+ __TCA_LTB_MAX,
+};
+#define TCA_LTB_MAX (__TCA_LTB_MAX - 1)
+
+struct tc_ltb_opt {
+ struct tc_ratespec rate;
+ struct tc_ratespec ceil;
+ __u64 measured;
+ __u64 allocated;
+ __u64 high_water;
+ __u32 prio;
+};
+
+struct tc_ltb_glob {
+ __u32 version; /* to match LTB/TC */
+ __u32 defcls; /* default class number */
+};
+
+struct tc_ltb_xstats {
+ __u64 measured;
+ __u64 allocated;
+ __u64 high_water;
+};
+
/* HFSC section */
struct tc_hfsc_qopt {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a3b37d8..9a8adb6 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -76,6 +76,18 @@ config NET_SCH_HTB
To compile this code as a module, choose M here: the
module will be called sch_htb.
+config NET_SCH_LTB
+ tristate "Lockless Token Bucket (LTB)"
+ help
+ Say Y here if you want to use the Lockless Token Bucket (LTB)
+ packet scheduling algorithm.
+
+ LTB is very similar to HTB in its goals; however, it has a
+ different implementation and a different algorithm.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_ltb.
+
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 66bbf9a..6caa34d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_NET_ACT_GATE) += act_gate.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
+obj-$(CONFIG_NET_SCH_LTB) += sch_ltb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
diff --git a/net/sched/sch_ltb.c b/net/sched/sch_ltb.c
new file mode 100644
index 0000000..494b15f
--- /dev/null
+++ b/net/sched/sch_ltb.c
@@ -0,0 +1,1280 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* net/sched/sch_ltb.c Lockless Token Bucket.
+ *
+ * Authors: Xiangning Yu <xiangning.yu@alibaba-inc.com>
+ * Ke Ma <k.ma@alibaba-inc.com>
+ * Jianjun Duan <jianjun.duan@alibaba-inc.com>
+ * Kun Liu <shubo.lk@alibaba-inc.com>
+ */
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/if_vlan.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/atomic.h>
+#include <linux/kfifo.h>
+#include <linux/kallsyms.h>
+#include <linux/irq_work.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/ethtool.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+#define LTB_VERSION 0x30001
+#define LTB_CLASS_CONDEMED 1
+#define HIGH_FREQ_INTERVAL 1000 /* ns */
+#define LOW_FREQ_INTERVAL 50 /* sampling rate, in ms */
+#define SHADOW_CLASSID 0
+
+#define BYTES_PER_JIFF(bps) ((bps) / HZ)
+#define BYTES_PER_INTERVAL(bps) (LOW_FREQ_INTERVAL * BYTES_PER_JIFF(bps))
+#define MINBW (10 * 1000 * 1000L)
+#define HIGH_THRESHOLD 80
+#define SUPPRESS_THRESHOLD 90
+#define MAX_CPU_COUNT 128 /* make it dynamic */
+#define SKB_QLEN 512
+#define NOW() (jiffies / LOW_FREQ_INTERVAL)
+#define BPS2MBPS(x) ((x) * 8 / 1000000) /* Bps to Mbps */
+
+static struct Qdisc_ops ltb_pcpu_qdisc_ops;
+
+static const struct nla_policy ltb_policy[TCA_LTB_MAX + 1] = {
+ [TCA_LTB_PARMS] = { .len = sizeof(struct tc_ltb_opt) },
+ [TCA_LTB_INIT] = { .len = sizeof(struct tc_ltb_glob) },
+ [TCA_LTB_RATE64] = { .type = NLA_U64 },
+ [TCA_LTB_CEIL64] = { .type = NLA_U64 },
+};
+
+struct ltb_class {
+ struct Qdisc_class_common common;
+ struct psched_ratecfg ratecfg;
+ struct psched_ratecfg ceilcfg;
+ u32 prio;
+ struct ltb_class *parent;
+ struct Qdisc *qdisc;
+ struct Qdisc *root_qdisc;
+ u32 classid;
+ struct list_head pnode;
+ unsigned long state; ____cacheline_aligned_in_smp
+
+ /* Aggr/drain context only */
+ s64 next_timestamp; ____cacheline_aligned_in_smp
+ int num_cpus;
+ int last_cpu;
+ s64 bw_used;
+ s64 last_bytes;
+ s64 last_timestamp;
+ s64 stat_bytes;
+ s64 stat_packets;
+ atomic64_t stat_drops;
+
+ /* Balance thread only */
+ s64 rate; ____cacheline_aligned_in_smp
+ s64 ceil;
+ s64 high_water;
+ int drop_delay;
+ s64 bw_allocated;
+ bool want_more;
+
+ /* Shared b/w aggr/drain thread and balancer */
+ unsigned long curr_interval; ____cacheline_aligned_in_smp
+ s64 bw_measured; /* Measured actual bandwidth */
+ s64 maxbw; /* Calculated bandwidth */
+
+ STRUCT_KFIFO(struct sk_buff *, SKB_QLEN) aggr_queues[MAX_CPU_COUNT];
+ ____cacheline_aligned_in_smp
+ STRUCT_KFIFO(struct sk_buff *, SKB_QLEN * MAX_CPU_COUNT) drain_queue;
+ ____cacheline_aligned_in_smp
+ STRUCT_KFIFO(struct sk_buff *, SKB_QLEN) fanout_queues[MAX_CPU_COUNT];
+ ____cacheline_aligned_in_smp
+
+ struct tasklet_struct aggr_tasklet;
+ struct hrtimer aggr_timer;
+};
+
+struct ltb_pcpu_data {
+ struct Qdisc *qdisc; ____cacheline_aligned_in_smp
+ bool active;
+};
+
+/* Root qdisc private data */
+struct ltb_sched {
+ struct task_struct *bwbalancer_task;
+ wait_queue_head_t bwbalancer_wq;
+
+ int num_cpus;
+ s64 link_speed;
+ struct Qdisc *root_qdisc;
+ struct net_device *dev;
+
+ struct ltb_pcpu_data *pcpu_data; ____cacheline_aligned_in_smp
+ struct tasklet_struct fanout_tasklet;
+
+ struct ltb_class *default_cls;
+ struct ltb_class *shadow_cls; /* If there is no class created */
+ u32 default_classid;
+
+ rwlock_t prio_rows_lock;
+ struct list_head prio_rows[TC_LTB_NUMPRIO]; /* Priority list */
+ struct Qdisc_class_hash clhash;
+};
+
+/* Per-cpu qdisc private data */
+struct ltb_pcpu_sched {
+ struct ltb_sched *ltb;
+ struct Qdisc *qdisc;
+ int cpu;
+ struct irq_work fanout_irq_work;
+ s64 last_irq_timestamp;
+};
+
+/* The cpu where skb is from */
+struct ltb_skb_cb {
+ int cpu;
+};
+
+static inline struct ltb_skb_cb *ltb_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct ltb_skb_cb));
+ return (struct ltb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline s64 get_linkspeed(struct net_device *dev)
+{
+ struct ethtool_link_ksettings ecmd;
+
+ ASSERT_RTNL();
+ if (netif_running(dev) && !__ethtool_get_link_ksettings(dev, &ecmd))
+ /* Convert to bytes per second */
+ return ecmd.base.speed * 1000 * 1000L / 8;
+ return 0;
+}
+
+static int ltb_update_linkspeed(struct ltb_sched *ltb)
+{
+ s64 linkspeed;
+
+ /* Avoid race with kthread_stop() */
+ if (!rtnl_trylock())
+ return -1;
+
+ linkspeed = get_linkspeed(ltb->dev);
+ if (ltb->link_speed != linkspeed)
+ ltb->link_speed = linkspeed;
+ rtnl_unlock();
+ return 0;
+}
+
+static inline int ltb_drain(struct ltb_class *cl)
+{
+ typeof(&cl->drain_queue) queue;
+ struct sk_buff *skb;
+ int npkts, bytes;
+ unsigned long now = NOW();
+ int cpu;
+ struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
+ struct ltb_pcpu_sched *pcpu_q;
+ s64 timestamp;
+ bool need_watchdog = false;
+ struct cpumask cpumask;
+
+ npkts = 0;
+ bytes = 0;
+ cpumask_clear(&cpumask);
+ queue = &cl->drain_queue;
+ while (kfifo_peek(queue, &skb) > 0) {
+ int len = qdisc_pkt_len(skb);
+
+ if (cl->curr_interval != now) {
+ cl->curr_interval = now;
+ timestamp = ktime_get_ns();
+ cl->bw_measured = (cl->stat_bytes - cl->last_bytes) *
+ NSEC_PER_SEC / (timestamp - cl->last_timestamp);
+ cl->last_bytes = cl->stat_bytes;
+ cl->last_timestamp = timestamp;
+ cl->bw_used = 0;
+ } else if (len + cl->bw_used > cl->maxbw) {
+ need_watchdog = true;
+ break;
+ }
+ kfifo_skip(queue);
+ cl->bw_used += len;
+
+ /* Fanout */
+ cpu = ltb_skb_cb(skb)->cpu;
+ ltb_skb_cb(skb)->cpu = 0;
+ if (unlikely(kfifo_put(&cl->fanout_queues[cpu], skb) == 0)) {
+ kfree_skb(skb);
+ atomic64_inc(&cl->stat_drops);
+ } else {
+ /* Account for Generic Segmentation Offload(gso). */
+ cl->stat_bytes += len;
+ cl->stat_packets += skb_is_gso(skb) ?
+ skb_shinfo(skb)->gso_segs : 1;
+ cpumask_set_cpu(cpu, &cpumask);
+ }
+ }
+
+ for_each_cpu(cpu, &cpumask) {
+ struct Qdisc *q = per_cpu_ptr(ltb->pcpu_data, cpu)->qdisc;
+
+ pcpu_q = (struct ltb_pcpu_sched *)qdisc_priv(q);
+ if (!(q->state & __QDISC_STATE_SCHED) && !qdisc_is_running(q))
+ irq_work_queue_on(&pcpu_q->fanout_irq_work, cpu);
+ }
+
+ return need_watchdog;
+}
+
/* Pull packets queued by ltb_enqueue() on every cpu into the class's
 * single drain queue, then let ltb_drain() fan them out under the
 * bandwidth budget.  Runs in the per-class aggregation tasklet.
 */
static void ltb_aggregate(struct ltb_class *cl)
{
	s64 timestamp = ktime_get_ns();
	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
	int num_cpus = ltb->num_cpus;
	int i;

	/* The worker might wake up more often than required */
	if (cl->next_timestamp > timestamp)
		/* Try again to keep the pipeline running */
		goto watchdog;

	cl->next_timestamp = timestamp + HIGH_FREQ_INTERVAL;

	/* Aggregate sk_buff from all CPUs. The memory footprint here should
	 * be fine because we don't touch each packet.
	 *
	 * It's possible to see out of order packets here. While within 1us,
	 * there won't be too many packets for a single flow, and the Linux
	 * scheduler is not expected to schedule an application too often
	 * within this tiny time gap, i.e. 1/1000 jiffes.
	 */
	for (i = 0; i < num_cpus; i++) {
		/* Process CPUs in a round-robin fashion */
		typeof(&cl->aggr_queues[0]) queue;
		int queue_len, drain_room;
		int j;

		queue = &cl->aggr_queues[(i + cl->last_cpu) % num_cpus];
		queue_len = kfifo_len(queue);
		drain_room = kfifo_avail(&cl->drain_queue);
		if (drain_room == 0)
			break;

		/* Move at most drain_room packets from this cpu's fifo. */
		queue_len = queue_len < drain_room ? queue_len : drain_room;
		for (j = 0; j < queue_len; j++) {
			struct sk_buff *skb;

			if (kfifo_get(queue, &skb)) {
				if (unlikely(kfifo_put(&cl->drain_queue,
						       skb) == 0)) {
					/* Drain queue full: drop. */
					kfree_skb(skb);
					atomic64_inc(&cl->stat_drops);
				}
			}
		}
	}
	/* Rotate the starting cpu so no cpu is permanently favored. */
	cl->last_cpu++;
	if (cl->last_cpu == num_cpus)
		cl->last_cpu = 0;

	if (ltb_drain(cl) == false)
		return;

watchdog:
	/* Budget exhausted (or woken early): retry in ~1us unless the
	 * class is being torn down.
	 */
	if (!test_bit(LTB_CLASS_CONDEMED, &cl->state))
		hrtimer_start(&cl->aggr_timer,
			      ns_to_ktime(1000 + ktime_get_ns()),
			      HRTIMER_MODE_ABS_PINNED);
}
+
+static enum hrtimer_restart ltb_aggr_watchdog(struct hrtimer *timer)
+{
+ struct ltb_class *cl = container_of(timer,
+ struct ltb_class, aggr_timer);
+
+ if (!test_bit(LTB_CLASS_CONDEMED, &cl->state))
+ tasklet_schedule(&cl->aggr_tasklet);
+
+ return HRTIMER_NORESTART;
+}
+
+static void ltb_aggr_tasklet(unsigned long arg)
+{
+ struct ltb_class *cl = (struct ltb_class *)arg;
+
+ rcu_read_lock_bh();
+ if (!test_bit(LTB_CLASS_CONDEMED, &cl->state))
+ ltb_aggregate(cl);
+ rcu_read_unlock_bh();
+}
+
+static inline void ltb_fanout(struct ltb_sched *ltb)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < ltb->num_cpus; cpu++) {
+ struct Qdisc *q = per_cpu_ptr(ltb->pcpu_data, cpu)->qdisc;
+ struct ltb_pcpu_sched *pcpu_q =
+ (struct ltb_pcpu_sched *)qdisc_priv(q);
+
+ if (q->q.qlen > 0 && !(q->state & __QDISC_STATE_SCHED) &&
+ !qdisc_is_running(q))
+ irq_work_queue_on(&pcpu_q->fanout_irq_work, cpu);
+ }
+}
+
+static void ltb_fanout_tasklet(unsigned long data)
+{
+ struct ltb_sched *ltb = (struct ltb_sched *)data;
+
+ ltb_fanout(ltb);
+}
+
+static void ltb_fanout_irq_tx_func(struct irq_work *work)
+{
+ struct ltb_pcpu_sched *pcpu_q =
+ container_of(work, struct ltb_pcpu_sched, fanout_irq_work);
+
+ __netif_schedule(pcpu_q->qdisc);
+}
+
+/* How many classes within the same group want more bandwidth */
+static inline int bw_class_want_more_count(struct list_head *head)
+{
+ int n = 0;
+ struct ltb_class *cl;
+
+ list_for_each_entry(cl, head, pnode) {
+ if (cl->want_more)
+ n++;
+ }
+ return n;
+}
+
/* Redistribute @bw_available among the @n classes on @lhead that still
 * want more bandwidth, splitting it evenly and capping each class at
 * its ceiling (or near its high water mark).
 *
 * Returns the bandwidth left over; sets *all_reached_ceil when every
 * class on the list ended up exactly at its ceiling.
 */
static int bw_redistribute_prio(struct list_head *lhead, int bw_available,
				int n, bool *all_reached_ceil)
{
	struct ltb_class *cl;
	int avg = 0;
	int orig_bw_allocated;
	int safe_loop = 0;	/* bounds the passes to 3; see loop condition */

	do {
		if (n > 0)
			avg = bw_available / n;
		list_for_each_entry(cl, lhead, pnode) {
			if (!cl->want_more)
				continue;

			/* Try to allocate as much as possible */
			orig_bw_allocated = cl->bw_allocated;
			cl->bw_allocated = min_t(s64, (cl->bw_allocated + avg),
						 cl->ceil);
			/* Significantly larger than high water */
			if (cl->bw_allocated > cl->high_water * 120 / 100)
				cl->bw_allocated = cl->high_water;
			/* Only charge the delta actually handed out. */
			bw_available -= cl->bw_allocated - orig_bw_allocated;
			if (cl->bw_allocated >= cl->high_water ||
			    cl->bw_allocated == cl->ceil) {
				cl->want_more = false;
				n--;
			}
		}
	} while (bw_available > 0 && n > 0 && safe_loop++ < 2);

	*all_reached_ceil = true;
	list_for_each_entry(cl, lhead, pnode) {
		if (cl->bw_allocated != cl->ceil)
			*all_reached_ceil = false;
	}

	return bw_available;
}
+
+static void bw_suppress_lower(struct ltb_sched *ltb, int high)
+{
+ int prio;
+
+ read_lock_bh(<b->prio_rows_lock);
+ for (prio = TC_LTB_NUMPRIO - 1; prio > high; prio--) {
+ struct ltb_class *cl;
+
+ list_for_each_entry(cl, <b->prio_rows[prio], pnode) {
+ if (cl->bw_allocated > cl->rate) {
+ cl->bw_allocated = max_t(s64,
+ cl->bw_measured *
+ 90 / 100, cl->rate);
+ }
+ }
+ }
+ read_unlock_bh(<b->prio_rows_lock);
+}
+
+static int bw_redistribute(struct ltb_sched *ltb, int bw_available)
+{
+ int prio = 0;
+ int n;
+ int highest_non_saturated_prio = TC_LTB_NUMPRIO;
+ bool all_reached_ceil;
+
+ read_lock_bh(<b->prio_rows_lock);
+ for (; prio < TC_LTB_NUMPRIO; prio++) {
+ struct list_head *head = <b->prio_rows[prio];
+
+ all_reached_ceil = true;
+
+ n = bw_class_want_more_count(head);
+ bw_available = bw_redistribute_prio(head, bw_available,
+ n, &all_reached_ceil);
+ if (!all_reached_ceil && highest_non_saturated_prio > prio)
+ highest_non_saturated_prio = prio;
+
+ if (bw_available < 0)
+ break;
+ }
+ read_unlock_bh(<b->prio_rows_lock);
+ return highest_non_saturated_prio;
+}
+
/* Publish the per-class budget (maxbw) computed by the balancer so the
 * aggregation/drain path starts using it.  Under light traffic, first
 * top every class up toward its ceiling with the leftover bandwidth.
 */
static void bw_sync_all(struct ltb_sched *ltb, int bw_available,
			int is_light_traffic)
{
	struct ltb_class *cl;
	int i;

	for (i = 0; i < ltb->clhash.hashsize; i++) {
		hlist_for_each_entry_rcu(cl, &ltb->clhash.hash[i],
					 common.hnode) {
			/* The shadow class is not balanced. */
			if (cl->classid == SHADOW_CLASSID)
				continue;

			if (is_light_traffic)
				cl->bw_allocated = min_t(s64, cl->ceil,
							 cl->bw_allocated +
							 bw_available);
			cl->maxbw = BYTES_PER_INTERVAL((s64)cl->bw_allocated);
			/* Maxbw will be visible eventually. */
			smp_mb();
		}
	}
}
+
/* Recompute every class's allocation from its measured usage.
 *
 * AIMD-style: classes using more than 95% of their high water are grown,
 * classes using less than 85% for 5 consecutive rounds are shrunk, the
 * rest are held stable.  When the link is busy, the remainder is
 * redistributed by priority and lower priorities may be suppressed.
 * Runs only in the balancer thread, under rcu_read_lock_bh().
 */
static void bw_balance(struct ltb_sched *ltb)
{
	struct ltb_class *cl;
	s64 link_speed = ltb->link_speed;
	int bw_available = link_speed;
	s64 total = 0;
	int high = TC_LTB_NUMPRIO;
	int is_light_traffic = 1;
	int i;

	/* Link speed unknown: nothing sensible to balance against. */
	if (unlikely(link_speed <= 0))
		return;

	for (i = 0; i < ltb->clhash.hashsize; i++) {
		hlist_for_each_entry_rcu(cl, &ltb->clhash.hash[i],
					 common.hnode) {
			if (cl->classid == SHADOW_CLASSID)
				continue;

			/* It's been a while the bw measurement has stopped */
			if (NOW() - cl->curr_interval > 2 &&
			    cl->bw_measured != 0)
				cl->bw_measured = 0;

			if (cl->bw_measured > cl->high_water * 95 / 100) {
				/* Increase: double up to rate, then +20% */
				if (cl->high_water < cl->rate)
					cl->high_water = min_t(s64,
							       cl->high_water *
							       2, cl->rate);
				else
					cl->high_water =
						cl->high_water * 120 / 100;
				cl->high_water = min_t(s64, cl->ceil,
						       cl->high_water);
				if (cl->drop_delay != 0)
					cl->drop_delay = 0;
			} else if (cl->bw_measured <
				   cl->high_water * 85 / 100) {
				/* Drop, but only after 5 quiet rounds */
				cl->drop_delay++;
				if (cl->drop_delay == 5) {
					cl->high_water =
						cl->bw_measured * 110 / 100;
					cl->drop_delay = 0;
				}
			} else {
				/* Stable */
				cl->high_water = cl->bw_allocated;
				if (cl->drop_delay != 0)
					cl->drop_delay = 0;
			}

			/* Clamp between MINBW and the guaranteed rate. */
			cl->high_water = max_t(s64, cl->high_water, MINBW);
			cl->bw_allocated = min_t(s64, cl->rate, cl->high_water);
			bw_available -= cl->bw_allocated;
			if (cl->bw_allocated < cl->high_water)
				cl->want_more = true;
			else
				cl->want_more = false;
			total += cl->bw_measured;
		}
	}

	if (total > HIGH_THRESHOLD * ltb->link_speed / 100) {
		is_light_traffic = 0;

		/* Redistribute the remaining bandwidth by priority
		 */
		if (bw_available > 0)
			high = bw_redistribute(ltb, bw_available);

		/* The link is near saturated, we need to suppress
		 * those classes that:
		 * - are not of the highest priority that haven't
		 *   reached all ceiling.
		 * - consume more than rate.
		 *
		 * This will give the higher priority class a better chance
		 * to gain full speed.
		 */
		if (total > SUPPRESS_THRESHOLD * ltb->link_speed / 100)
			bw_suppress_lower(ltb, high);
	}
	bw_sync_all(ltb, bw_available, is_light_traffic);
}
+
/* Balancer thread: every sampling interval, refresh the link speed and
 * rebalance per-class bandwidth, until kthread_stop().
 *
 * NOTE(review): LOW_FREQ_INTERVAL (50) is passed to
 * wait_event_interruptible_timeout(), whose timeout is in jiffies; the
 * macro's "in ms" comment only holds when HZ=1000 — confirm intent.
 */
static int ltb_bw_balancer_kthread(void *arg)
{
	struct ltb_sched *ltb = (struct ltb_sched *)arg;

	for (;;) {
		wait_event_interruptible_timeout(ltb->bwbalancer_wq,
						 kthread_should_stop(),
						 LOW_FREQ_INTERVAL);
		if (kthread_should_stop())
			break;

		/* rtnl contention: skip this round rather than block. */
		if (ltb_update_linkspeed(ltb) != 0)
			continue;

		rcu_read_lock_bh();
		bw_balance(ltb);
		rcu_read_unlock_bh();
	}
	return 0;
}
+
+static int ltb_parse_opts(struct nlattr *opt, u32 *defcls)
+{
+ struct nlattr *tb[TCA_LTB_MAX + 1];
+ struct tc_ltb_glob *gopt;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt,
+ ltb_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_LTB_INIT])
+ return -EINVAL;
+
+ gopt = nla_data(tb[TCA_LTB_INIT]);
+ if (gopt->version != LTB_VERSION >> 16)
+ return -EINVAL;
+
+ if (defcls)
+ *defcls = gopt->defcls;
+ return 0;
+}
+
+static int ltb_pcpu_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ltb_pcpu_sched *pcpu_q =
+ (struct ltb_pcpu_sched *)qdisc_priv(sch);
+
+ memset(pcpu_q, 0, sizeof(*pcpu_q));
+ pcpu_q->qdisc = sch;
+ init_irq_work(&pcpu_q->fanout_irq_work, ltb_fanout_irq_tx_func);
+ return 0;
+}
+
+static struct sk_buff *ltb_pcpu_class_dequeue(struct ltb_pcpu_sched *pcpu_q,
+ struct ltb_class *cl)
+{
+ struct sk_buff *skb;
+ typeof(&cl->fanout_queues[0]) queue;
+
+ queue = &cl->fanout_queues[pcpu_q->cpu];
+ if (kfifo_peek(queue, &skb) > 0) {
+ kfifo_skip(queue);
+ pcpu_q->qdisc->q.qlen--;
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *ltb_pcpu_dequeue(struct Qdisc *sch)
+{
+ struct ltb_sched *ltb;
+ struct ltb_pcpu_sched *pcpu_q;
+ struct ltb_class *cl;
+ struct sk_buff *skb;
+ int i;
+
+ pcpu_q = (struct ltb_pcpu_sched *)qdisc_priv(sch);
+ ltb = pcpu_q->ltb;
+
+ for (i = 0; i < ltb->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, <b->clhash.hash[i], common.hnode) {
+ skb = ltb_pcpu_class_dequeue(pcpu_q, cl);
+ if (skb)
+ return skb;
+ }
+ }
+ return NULL;
+}
+
+static inline struct ltb_class *ltb_find_class(struct Qdisc *sch, u32 handle)
+{
+ struct ltb_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, handle);
+ if (!clc)
+ return NULL;
+
+ return container_of(clc, struct ltb_class, common);
+}
+
+static struct ltb_class *ltb_alloc_class(struct Qdisc *sch,
+ struct ltb_class *parent, u32 classid,
+ struct psched_ratecfg *ratecfg,
+ struct psched_ratecfg *ceilcfg,
+ u32 prio)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct ltb_class *cl;
+ int i;
+
+ if (ratecfg->rate_bytes_ps > ceilcfg->rate_bytes_ps ||
+ prio < 0 || prio >= TC_LTB_NUMPRIO)
+ return NULL;
+
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (!cl)
+ return NULL;
+
+ cl->common.classid = classid;
+ cl->parent = parent;
+ cl->ratecfg = *ratecfg;
+ cl->ceilcfg = *ceilcfg;
+ cl->prio = prio;
+ cl->classid = classid;
+ cl->root_qdisc = sch;
+ cl->num_cpus = ltb->num_cpus;
+ cl->last_cpu = 0;
+ cl->ceil = ceilcfg->rate_bytes_ps;
+ cl->rate = ratecfg->rate_bytes_ps;
+ cl->bw_allocated = ratecfg->rate_bytes_ps;
+ cl->high_water = cl->bw_allocated * 110 / 100;
+ cl->maxbw = BYTES_PER_INTERVAL((s64)ratecfg->rate_bytes_ps);
+
+ INIT_KFIFO(cl->drain_queue);
+ for (i = 0; i < cl->num_cpus; i++) {
+ INIT_KFIFO(cl->aggr_queues[i]);
+ INIT_KFIFO(cl->fanout_queues[i]);
+ }
+ hrtimer_init(&cl->aggr_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+ cl->aggr_timer.function = ltb_aggr_watchdog;
+ tasklet_init(&cl->aggr_tasklet, ltb_aggr_tasklet,
+ (unsigned long)cl);
+
+ if (classid == ltb->default_classid)
+ rcu_assign_pointer(ltb->default_cls, cl);
+ if (classid != SHADOW_CLASSID) {
+ write_lock_bh(<b->prio_rows_lock);
+ list_add(&cl->pnode, <b->prio_rows[prio]);
+ write_unlock_bh(<b->prio_rows_lock);
+ }
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(<b->clhash, &cl->common);
+ sch_tree_unlock(sch);
+
+ return cl;
+}
+
/* Update rate/ceil/priority of an existing class in place and re-home
 * it on the matching priority row.  Always returns 0.
 *
 * NOTE(review): rcu_read_lock_bh() only protects readers; it does not
 * serialize these writes against the balancer thread or the aggregation
 * tasklet reading rate/ceil concurrently — confirm this is acceptable.
 */
static int ltb_modify_class(struct Qdisc *sch, struct ltb_class *cl,
			    struct psched_ratecfg *ratecfg,
			    struct psched_ratecfg *ceilcfg,
			    u32 prio)
{
	struct ltb_sched *ltb = qdisc_priv(sch);

	rcu_read_lock_bh();
	cl->ratecfg = *ratecfg;
	cl->ceilcfg = *ceilcfg;
	cl->prio = prio;
	cl->rate = ratecfg->rate_bytes_ps;
	cl->ceil = ceilcfg->rate_bytes_ps;
	/* Reset the balancer state to the new guaranteed rate. */
	cl->bw_allocated = ratecfg->rate_bytes_ps;
	cl->high_water = cl->bw_allocated * 110 / 100;
	cl->maxbw = BYTES_PER_INTERVAL((s64)ratecfg->rate_bytes_ps);

	/* Move the class to the row for its (possibly new) priority. */
	write_lock_bh(&ltb->prio_rows_lock);
	list_del(&cl->pnode);
	list_add(&cl->pnode, &ltb->prio_rows[prio]);
	write_unlock_bh(&ltb->prio_rows_lock);

	rcu_read_unlock_bh();

	return 0;
}
+
+static void ltb_destroy_class(struct Qdisc *sch, struct ltb_class *cl)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int i;
+
+ if (ltb->default_classid == cl->classid)
+ rcu_assign_pointer(ltb->default_cls, ltb->shadow_cls);
+ cl->state |= LTB_CLASS_CONDEMED;
+ if (cl->classid != SHADOW_CLASSID) {
+ write_lock_bh(<b->prio_rows_lock);
+ list_del(&cl->pnode);
+ write_unlock_bh(<b->prio_rows_lock);
+ }
+
+ hrtimer_cancel(&cl->aggr_timer);
+ tasklet_kill(&cl->aggr_tasklet);
+
+ /* Cleanup pending packets */
+ for (i = 0; i < cl->num_cpus; i++) {
+ while (kfifo_get(&cl->aggr_queues[i], &skb) > 0)
+ kfree_skb(skb);
+
+ while (kfifo_get(&cl->fanout_queues[i], &skb) > 0)
+ kfree_skb(skb);
+ }
+ while (kfifo_get(&cl->drain_queue, &skb) > 0)
+ kfree_skb(skb);
+
+ kfree(cl);
+}
+
+static int ltb_graft_class(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct ltb_class *cl = (struct ltb_class *)arg;
+
+ if (!new)
+ return -EINVAL;
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ return 0;
+}
+
+static struct Qdisc *ltb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct ltb_class *cl = (struct ltb_class *)arg;
+
+ return cl->qdisc;
+}
+
+static void ltb_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static unsigned long ltb_find(struct Qdisc *sch, u32 handle)
+{
+ return (unsigned long)ltb_find_class(sch, handle);
+}
+
+static int ltb_change_class(struct Qdisc *sch, u32 classid,
+ u32 parentid, struct nlattr **tca,
+ unsigned long *arg, struct netlink_ext_ack *extack)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct ltb_class *cl = (struct ltb_class *)*arg, *parent;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_LTB_MAX + 1];
+ struct tc_ltb_opt *lopt;
+ u64 rate64, ceil64;
+ struct psched_ratecfg ratecfg, ceilcfg;
+ u32 prio;
+ int err;
+
+ if (!opt)
+ goto failure;
+
+ err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt, ltb_policy,
+ NULL);
+ if (err < 0)
+ goto failure;
+
+ err = -EINVAL;
+ if (!tb[TCA_LTB_PARMS])
+ goto failure;
+
+ parent = parentid == TC_H_ROOT ? NULL : ltb_find_class(sch, parentid);
+
+ lopt = nla_data(tb[TCA_LTB_PARMS]);
+ if (!lopt->rate.rate || !lopt->ceil.rate)
+ goto failure;
+
+ rate64 = tb[TCA_LTB_RATE64] ? nla_get_u64(tb[TCA_LTB_RATE64]) : 0;
+ ceil64 = tb[TCA_LTB_CEIL64] ? nla_get_u64(tb[TCA_LTB_CEIL64]) : 0;
+ if (rate64 > ceil64)
+ goto failure;
+
+ psched_ratecfg_precompute(&ratecfg, &lopt->rate, rate64);
+ psched_ratecfg_precompute(&ceilcfg, &lopt->ceil, ceil64);
+ prio = lopt->prio;
+ if (prio >= TC_LTB_NUMPRIO)
+ prio = TC_LTB_NUMPRIO - 1;
+
+ if (!cl) {
+ if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
+ ltb_find_class(sch, classid)) {
+ err = -EINVAL;
+ goto failure;
+ }
+ cl = ltb_alloc_class(sch, parent, classid, &ratecfg, &ceilcfg,
+ prio);
+ if (!cl) {
+ err = -ENOBUFS;
+ goto failure;
+ }
+ } else {
+ /* Modify existing class */
+ ltb_modify_class(sch, cl, &ratecfg, &ceilcfg, prio);
+ }
+ qdisc_class_hash_grow(sch, <b->clhash);
+ *arg = (unsigned long)cl;
+ return 0;
+
+failure:
+ return err;
+}
+
+static int ltb_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct ltb_class *cl = (struct ltb_class *)arg;
+
+ sch_tree_lock(sch);
+ if (cl->qdisc)
+ qdisc_purge_queue(cl->qdisc);
+ qdisc_class_hash_remove(<b->clhash, &cl->common);
+ sch_tree_unlock(sch);
+
+ ltb_destroy_class(sch, cl);
+ return 0;
+}
+
+static void ltb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct ltb_sched *q = qdisc_priv(sch);
+ struct ltb_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ /* We don't want to walk the shadow class */
+ if (cl->classid == SHADOW_CLASSID)
+ continue;
+
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static int ltb_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct ltb_class *cl = (struct ltb_class *)arg;
+ struct nlattr *nest;
+ struct tc_ltb_opt opt;
+
+ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ memset(&opt, 0, sizeof(opt));
+ psched_ratecfg_getrate(&opt.rate, &cl->ratecfg);
+ psched_ratecfg_getrate(&opt.ceil, &cl->ceilcfg);
+
+ opt.measured = BPS2MBPS(cl->bw_measured);
+ opt.allocated = BPS2MBPS(cl->bw_allocated);
+ opt.high_water = BPS2MBPS(cl->high_water);
+ opt.prio = cl->prio;
+
+ if (nla_put(skb, TCA_LTB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if ((cl->ratecfg.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_LTB_RATE64, cl->ratecfg.rate_bytes_ps,
+ TCA_LTB_PAD))
+ goto nla_put_failure;
+ if ((cl->ceilcfg.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_LTB_CEIL64, cl->ceilcfg.rate_bytes_ps,
+ TCA_LTB_PAD))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int ltb_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct ltb_class *cl = (struct ltb_class *)arg;
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct tc_ltb_xstats xstats;
+
+ memset(&bstats, 0, sizeof(bstats));
+ bstats.bytes = cl->stat_bytes;
+ bstats.packets = cl->stat_packets;
+ memset(&qstats, 0, sizeof(qstats));
+ qstats.drops = cl->stat_drops.counter;
+ memset(&xstats, 0, sizeof(xstats));
+ xstats.measured = BPS2MBPS(cl->bw_measured);
+ xstats.allocated = BPS2MBPS(cl->bw_allocated);
+ xstats.high_water = BPS2MBPS(cl->high_water);
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &qstats, 0) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static struct ltb_class *ltb_classify(struct Qdisc *sch,
+ struct ltb_sched *ltb,
+ struct sk_buff *skb)
+{
+ struct ltb_class *cl;
+
+ /* Allow to select a class by setting skb->priority */
+ if (likely(skb->priority != 0)) {
+ cl = ltb_find_class(sch, skb->priority);
+ if (cl)
+ return cl;
+ }
+ return rcu_dereference_bh(ltb->default_cls);
+}
+
/* Enqueue entry point for the root (TCQ_F_NOLOCK) qdisc: stamp the skb
 * with the submitting cpu, classify it, and push it onto that class's
 * per-cpu aggregation fifo.  The per-class tasklet moves it onward.
 *
 * NOTE(review): drops bypass the *to_free batch list, and root_lock is
 * unused — presumably deliberate for a lockless qdisc; confirm.
 */
static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
		       struct sk_buff **to_free)
{
	struct ltb_sched *ltb = qdisc_priv(sch);
	struct ltb_pcpu_sched *pcpu_q;
	struct ltb_class *cl;
	struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
	int cpu;

	cpu = smp_processor_id();
	pcpu_q = qdisc_priv(pcpu->qdisc);
	ltb_skb_cb(skb)->cpu = cpu;

	cl = ltb_classify(sch, ltb, skb);
	if (unlikely(!cl)) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	pcpu->active = true;
	if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
		/* Per-cpu aggregation fifo full: drop and count it. */
		kfree_skb(skb);
		atomic64_inc(&cl->stat_drops);
		return NET_XMIT_DROP;
	}

	/* NOTE(review): the root qlen is pinned to 1 rather than
	 * counted; presumably this just keeps the qdisc looking
	 * non-empty to the stack — confirm.
	 */
	sch->q.qlen = 1;
	pcpu_q->qdisc->q.qlen++;
	tasklet_schedule(&cl->aggr_tasklet);
	return NET_XMIT_SUCCESS;
}
+
+static struct sk_buff *ltb_dequeue(struct Qdisc *sch)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct ltb_pcpu_data *pcpu;
+
+ pcpu = this_cpu_ptr(ltb->pcpu_data);
+
+ if (likely(pcpu->active))
+ pcpu->active = false;
+ else
+ tasklet_schedule(<b->fanout_tasklet);
+
+ return NULL;
+}
+
+static void ltb_reset(struct Qdisc *sch)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct ltb_class *cl;
+ int i;
+
+ sch->q.qlen = 0;
+ for (i = 0; i < ltb->num_cpus; i++)
+ qdisc_reset(per_cpu_ptr(ltb->pcpu_data, i)->qdisc);
+
+ for (i = 0; i < ltb->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, <b->clhash.hash[i], common.hnode) {
+ if (cl->qdisc)
+ qdisc_reset(cl->qdisc);
+ }
+ }
+}
+
+static void ltb_destroy(struct Qdisc *sch)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct hlist_node *tmp;
+ struct ltb_class *cl;
+ int i;
+
+ sch->q.qlen = 0;
+ ltb->default_cls = NULL;
+ ltb->shadow_cls = NULL;
+ tasklet_kill(<b->fanout_tasklet);
+ if (ltb->bwbalancer_task) {
+ kthread_stop(ltb->bwbalancer_task);
+ ltb->bwbalancer_task = NULL;
+ }
+
+ for (i = 0; i < ltb->num_cpus; i++)
+ qdisc_put(per_cpu_ptr(ltb->pcpu_data, i)->qdisc);
+
+ for (i = 0; i < ltb->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, tmp, <b->clhash.hash[i],
+ common.hnode)
+ ltb_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(<b->clhash);
+ free_percpu(ltb->pcpu_data);
+}
+
+static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ltb_sched *ltb = (struct ltb_sched *)qdisc_priv(sch);
+ struct Qdisc *q;
+ int err, i;
+ struct ltb_pcpu_sched *pcpu_q;
+ struct net_device *dev = qdisc_dev(sch);
+ u32 default_classid = 0;
+ struct psched_ratecfg ratecfg;
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (opt) {
+ err = ltb_parse_opts(opt, &default_classid);
+ if (err != 0)
+ return err;
+ }
+
+ memset(ltb, 0, sizeof(*ltb));
+ rwlock_init(<b->prio_rows_lock);
+ for (i = 0; i < TC_LTB_NUMPRIO; i++)
+ INIT_LIST_HEAD(<b->prio_rows[i]);
+
+ ltb->root_qdisc = sch;
+ ltb->dev = dev;
+ ltb->num_cpus = num_online_cpus();
+ if (ltb->num_cpus > MAX_CPU_COUNT)
+ return -EOPNOTSUPP;
+
+ ltb->link_speed = get_linkspeed(ltb->dev);
+ if (ltb->link_speed <= 0)
+ pr_warn("Failed to obtain link speed\n");
+
+ err = qdisc_class_hash_init(<b->clhash);
+ if (err < 0)
+ return err;
+
+ ltb->pcpu_data = alloc_percpu_gfp(struct ltb_pcpu_data,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!ltb->pcpu_data) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ for (i = 0; i < ltb->num_cpus; i++) {
+ q = qdisc_create_dflt(sch->dev_queue,
+ <b_pcpu_qdisc_ops, 0, NULL);
+ if (!q) {
+ err = -ENODEV;
+ goto error;
+ }
+ /* These cannot be initialized in qdisc_init() */
+ pcpu_q = (struct ltb_pcpu_sched *)qdisc_priv(q);
+ pcpu_q->cpu = i;
+ pcpu_q->ltb = ltb;
+
+ per_cpu_ptr(ltb->pcpu_data, i)->qdisc = q;
+ per_cpu_ptr(ltb->pcpu_data, i)->active = false;
+ }
+
+ ltb->default_classid = TC_H_MAKE(TC_H_MAJ(sch->handle),
+ default_classid);
+ ratecfg.rate_bytes_ps = ltb->link_speed;
+ ltb->shadow_cls = ltb_alloc_class(sch, NULL, SHADOW_CLASSID,
+ &ratecfg, &ratecfg, 0);
+ if (!ltb->shadow_cls) {
+ err = -EINVAL;
+ goto error;
+ }
+ ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
+ tasklet_init(<b->fanout_tasklet, ltb_fanout_tasklet,
+ (unsigned long)ltb);
+
+ /* Bandwidth balancer, this logic can be implemented in user-land. */
+ init_waitqueue_head(<b->bwbalancer_wq);
+ ltb->bwbalancer_task =
+ kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
+ wake_up_process(ltb->bwbalancer_task);
+
+ sch->flags |= TCQ_F_NOLOCK;
+ return 0;
+
+error:
+ for (i = 0; i < ltb->num_cpus; i++) {
+ struct ltb_pcpu_data *pcpu = per_cpu_ptr(ltb->pcpu_data, i);
+
+ if (pcpu->qdisc) {
+ qdisc_put(pcpu->qdisc);
+ pcpu->qdisc = NULL;
+ }
+ }
+ if (ltb->pcpu_data) {
+ free_percpu(ltb->pcpu_data);
+ ltb->pcpu_data = NULL;
+ }
+ if (ltb->bwbalancer_task) {
+ kthread_stop(ltb->bwbalancer_task);
+ ltb->bwbalancer_task = NULL;
+ }
+ qdisc_class_hash_destroy(<b->clhash);
+ return err;
+}
+
+static int ltb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct ltb_sched *ltb = qdisc_priv(sch);
+ struct nlattr *nest;
+ struct tc_ltb_glob gopt;
+
+ gopt.version = LTB_VERSION;
+ gopt.defcls = ltb->default_classid;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_LTB_INIT, sizeof(gopt), &gopt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct Qdisc_ops ltb_pcpu_qdisc_ops __read_mostly = {
+ .cl_ops = NULL,
+ .id = "ltb_percore",
+ .priv_size = sizeof(struct ltb_sched),
+ .enqueue = NULL,
+ .dequeue = ltb_pcpu_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = ltb_pcpu_init,
+ .dump = NULL,
+ .owner = THIS_MODULE,
+};
+
+static const struct Qdisc_class_ops ltb_class_ops = {
+ .graft = ltb_graft_class,
+ .leaf = ltb_leaf,
+ .qlen_notify = ltb_qlen_notify,
+ .find = ltb_find,
+ .change = ltb_change_class,
+ .delete = ltb_delete_class,
+ .walk = ltb_walk,
+ .dump = ltb_dump_class,
+ .dump_stats = ltb_dump_class_stats,
+};
+
+static struct Qdisc_ops ltb_qdisc_ops __read_mostly = {
+ .cl_ops = <b_class_ops,
+ .id = "ltb",
+ .priv_size = sizeof(struct ltb_sched),
+ .enqueue = ltb_enqueue,
+ .dequeue = ltb_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = ltb_init,
+ .reset = ltb_reset,
+ .destroy = ltb_destroy,
+ .dump = ltb_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init ltb_module_init(void)
+{
+ return register_qdisc(<b_qdisc_ops);
+}
+
+static void __exit ltb_module_exit(void)
+{
+ unregister_qdisc(<b_qdisc_ops);
+}
+
+module_init(ltb_module_init)
+module_exit(ltb_module_exit)
+MODULE_LICENSE("GPL");
--
1.8.3.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
@ 2020-07-06 18:32 ` Stephen Hemminger
2020-07-06 18:37 ` Stephen Hemminger
` (3 subsequent siblings)
4 siblings, 0 replies; 22+ messages in thread
From: Stephen Hemminger @ 2020-07-06 18:32 UTC (permalink / raw)
To: YU, Xiangning; +Cc: netdev
On Tue, 07 Jul 2020 02:08:13 +0800
"YU, Xiangning" <xiangning.yu@alibaba-inc.com> wrote:
> +static inline s64 get_linkspeed(struct net_device *dev)
This is not performance sensitive and should not be marked inline.
> +{
> + struct ethtool_link_ksettings ecmd;
> +
> + ASSERT_RTNL();
> + if (netif_running(dev) && !__ethtool_get_link_ksettings(dev, &ecmd))
> + /* Convert to bytes per second */
> + return ecmd.base.speed * 1000 * 1000L / 8;
You need to be more careful in handling of devices that return unknown for speed.
> + return 0;
> +}
> +
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
2020-07-06 18:32 ` Stephen Hemminger
@ 2020-07-06 18:37 ` Stephen Hemminger
2020-07-06 19:56 ` YU, Xiangning
2020-07-06 20:10 ` Cong Wang
` (2 subsequent siblings)
4 siblings, 1 reply; 22+ messages in thread
From: Stephen Hemminger @ 2020-07-06 18:37 UTC (permalink / raw)
To: YU, Xiangning; +Cc: netdev
On Tue, 07 Jul 2020 02:08:13 +0800
"YU, Xiangning" <xiangning.yu@alibaba-inc.com> wrote:
> +static inline int ltb_drain(struct ltb_class *cl)
> +{
> + typeof(&cl->drain_queue) queue;
> + struct sk_buff *skb;
> + int npkts, bytes;
> + unsigned long now = NOW();
> + int cpu;
> + struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
> + struct ltb_pcpu_sched *pcpu_q;
> + s64 timestamp;
> + bool need_watchdog = false;
> + struct cpumask cpumask;
> +
> + npkts = 0;
> + bytes = 0;
It would be safer to use unsigned int for npkts and bytes.
These should never be negative.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 18:37 ` Stephen Hemminger
@ 2020-07-06 19:56 ` YU, Xiangning
0 siblings, 0 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-06 19:56 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
On 7/6/20 11:37 AM, Stephen Hemminger wrote:
> On Tue, 07 Jul 2020 02:08:13 +0800
> "YU, Xiangning" <xiangning.yu@alibaba-inc.com> wrote:
>
>> +static inline int ltb_drain(struct ltb_class *cl)
>> +{
>> + typeof(&cl->drain_queue) queue;
>> + struct sk_buff *skb;
>> + int npkts, bytes;
>> + unsigned long now = NOW();
>> + int cpu;
>> + struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
>> + struct ltb_pcpu_sched *pcpu_q;
>> + s64 timestamp;
>> + bool need_watchdog = false;
>> + struct cpumask cpumask;
>> +
>> + npkts = 0;
>> + bytes = 0;
>
> It would be safer to use unsigned int for npkts and bytes.
> These should never be negative.
>
Thank you Stephen. I will make these changes, including those mentioned in your previous email.
- Xiangning
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
2020-07-06 18:32 ` Stephen Hemminger
2020-07-06 18:37 ` Stephen Hemminger
@ 2020-07-06 20:10 ` Cong Wang
2020-07-06 20:34 ` YU, Xiangning
2020-07-06 20:29 ` David Miller
2020-07-07 15:42 ` kernel test robot
4 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-06 20:10 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
> + struct sk_buff **to_free)
> +{
> + struct ltb_sched *ltb = qdisc_priv(sch);
> + struct ltb_pcpu_sched *pcpu_q;
> + struct ltb_class *cl;
> + struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
> + int cpu;
> +
> + cpu = smp_processor_id();
> + pcpu_q = qdisc_priv(pcpu->qdisc);
> + ltb_skb_cb(skb)->cpu = cpu;
> +
> + cl = ltb_classify(sch, ltb, skb);
> + if (unlikely(!cl)) {
> + kfree_skb(skb);
> + return NET_XMIT_DROP;
> + }
> +
> + pcpu->active = true;
> + if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
> + kfree_skb(skb);
> + atomic64_inc(&cl->stat_drops);
> + return NET_XMIT_DROP;
> + }
How do you prevent out-of-order packets?
> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
...
> + ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
> + tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
> + (unsigned long)ltb);
> +
> + /* Bandwidth balancer, this logic can be implemented in user-land. */
> + init_waitqueue_head(&ltb->bwbalancer_wq);
> + ltb->bwbalancer_task =
> + kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
> + wake_up_process(ltb->bwbalancer_task);
Creating a kthread for each qdisc doesn't look good. Why do you
need a per-qdisc kthread or even a kernel thread at all?
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
` (2 preceding siblings ...)
2020-07-06 20:10 ` Cong Wang
@ 2020-07-06 20:29 ` David Miller
2020-07-06 22:08 ` YU, Xiangning
2020-07-07 15:42 ` kernel test robot
4 siblings, 1 reply; 22+ messages in thread
From: David Miller @ 2020-07-06 20:29 UTC (permalink / raw)
To: xiangning.yu; +Cc: netdev
From: "YU, Xiangning" <xiangning.yu@alibaba-inc.com>
Date: Tue, 07 Jul 2020 02:08:13 +0800
> Lockless Token Bucket (LTB) is a qdisc implementation that controls the
> use of outbound bandwidth on a shared link. With the help of lockless
> qdisc, and by decoupling rate limiting and bandwidth sharing, LTB is
> designed to scale in the cloud data centers.
>
> Signed-off-by: Xiangning Yu <xiangning.yu@alibaba-inc.com>
I'm very skeptical about having a kthread for each qdisc, that doesn't
sound like a good idea at all.
Also:
> +static inline struct ltb_skb_cb *ltb_skb_cb(const struct sk_buff *skb)
No inline functions in foo.c files please.
> +static inline s64 get_linkspeed(struct net_device *dev)
Likewise.
> +static inline int ltb_drain(struct ltb_class *cl)
> +{
> + typeof(&cl->drain_queue) queue;
Use the actual type not typeof().
> + struct sk_buff *skb;
> + int npkts, bytes;
> + unsigned long now = NOW();
> + int cpu;
> + struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
> + struct ltb_pcpu_sched *pcpu_q;
> + s64 timestamp;
> + bool need_watchdog = false;
> + struct cpumask cpumask;
Please order local variables in reverse christmas tree order.
> +static void ltb_aggregate(struct ltb_class *cl)
> +{
> + s64 timestamp = ktime_get_ns();
> + struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
> + int num_cpus = ltb->num_cpus;
> + int i;
Likewise.
> +static inline void ltb_fanout(struct ltb_sched *ltb)
> +{
No inline please.
> +/* How many classes within the same group want more bandwidth */
> +static inline int bw_class_want_more_count(struct list_head *head)
> +{
> + int n = 0;
> + struct ltb_class *cl;
No inline, and reverse christmas tree ordering for local variables.
> +/* Redistribute bandwidth among classes with the same priority */
> +static int bw_redistribute_prio(struct list_head *lhead, int bw_available,
> + int n, bool *all_reached_ceil)
> +{
> + struct ltb_class *cl;
> + int avg = 0;
> + int orig_bw_allocated;
> + int safe_loop = 0;
> +
Likewise.
> +static int bw_redistribute(struct ltb_sched *ltb, int bw_available)
> +{
> + int prio = 0;
> + int n;
> + int highest_non_saturated_prio = TC_LTB_NUMPRIO;
> + bool all_reached_ceil;
Likewise.
> +static void bw_balance(struct ltb_sched *ltb)
> +{
> + struct ltb_class *cl;
> + s64 link_speed = ltb->link_speed;
> + int bw_available = link_speed;
> + s64 total = 0;
> + int high = TC_LTB_NUMPRIO;
> + int is_light_traffic = 1;
> + int i;
Likewise.
And so on and so forth. This code needs a lot of style fixes
and removal of the per-qdisc kthread design.
Thank you.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 20:10 ` Cong Wang
@ 2020-07-06 20:34 ` YU, Xiangning
2020-07-07 20:06 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-06 20:34 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/6/20 1:10 PM, Cong Wang wrote:
> On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
>> + struct sk_buff **to_free)
>> +{
>> + struct ltb_sched *ltb = qdisc_priv(sch);
>> + struct ltb_pcpu_sched *pcpu_q;
>> + struct ltb_class *cl;
>> + struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
>> + int cpu;
>> +
>> + cpu = smp_processor_id();
>> + pcpu_q = qdisc_priv(pcpu->qdisc);
>> + ltb_skb_cb(skb)->cpu = cpu;
>> +
>> + cl = ltb_classify(sch, ltb, skb);
>> + if (unlikely(!cl)) {
>> + kfree_skb(skb);
>> + return NET_XMIT_DROP;
>> + }
>> +
>> + pcpu->active = true;
>> + if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
>> + kfree_skb(skb);
>> + atomic64_inc(&cl->stat_drops);
>> + return NET_XMIT_DROP;
>> + }
>
>
> How do you prevent out-of-order packets?
>
Hi Cong,
That's a good question. In theory there will be out of order packets during aggregation. While keep in mind this is per-class aggregation, and it runs at a high frequency, that the chance to have any left over skbs from the same TCP flow on many CPUs is low.
Also, based on real deployment experience, we haven't observed an increased out of order events even under heavy work load.
>
>> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
> ...
>> + ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
>> + tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
>> + (unsigned long)ltb);
>> +
>> + /* Bandwidth balancer, this logic can be implemented in user-land. */
>> + init_waitqueue_head(&ltb->bwbalancer_wq);
>> + ltb->bwbalancer_task =
>> + kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
>> + wake_up_process(ltb->bwbalancer_task);
>
> Creating a kthread for each qdisc doesn't look good. Why do you
> need a per-qdisc kthread or even a kernel thread at all?
>
We moved the bandwidth sharing out of the critical data path, that's why we use a kernel thread to balance the current maximum bandwidth used by each class periodically.
This part could be implemented at as timer. What's your suggestion?
Thanks,
- Xiangning
> Thanks.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 20:29 ` David Miller
@ 2020-07-06 22:08 ` YU, Xiangning
0 siblings, 0 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-06 22:08 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On 7/6/20 1:29 PM, David Miller wrote:
> From: "YU, Xiangning" <xiangning.yu@alibaba-inc.com>
> Date: Tue, 07 Jul 2020 02:08:13 +0800
>
>> Lockless Token Bucket (LTB) is a qdisc implementation that controls the
>> use of outbound bandwidth on a shared link. With the help of lockless
>> qdisc, and by decoupling rate limiting and bandwidth sharing, LTB is
>> designed to scale in the cloud data centers.
>>
>> Signed-off-by: Xiangning Yu <xiangning.yu@alibaba-inc.com>
>
> I'm very skeptical about having a kthread for each qdisc, that doesn't
> sound like a good idea at all.
>
> Also:
>
Hi David,
I can change the kthread to a timer, like what sfq does for perturbation. Hope that would be OK.
Will take care of the other comments too.
Thanks,
- Xiangning
>> +static inline struct ltb_skb_cb *ltb_skb_cb(const struct sk_buff *skb)
>
> No inline functions in foo.c files please.
>
>> +static inline s64 get_linkspeed(struct net_device *dev)
>
> Likewise.
>
>> +static inline int ltb_drain(struct ltb_class *cl)
>> +{
>> + typeof(&cl->drain_queue) queue;
>
> Use the actual type not typeof().
>
>> + struct sk_buff *skb;
>> + int npkts, bytes;
>> + unsigned long now = NOW();
>> + int cpu;
>> + struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
>> + struct ltb_pcpu_sched *pcpu_q;
>> + s64 timestamp;
>> + bool need_watchdog = false;
>> + struct cpumask cpumask;
>
> Please order local variables in reverse christmas tree order.
>
>> +static void ltb_aggregate(struct ltb_class *cl)
>> +{
>> + s64 timestamp = ktime_get_ns();
>> + struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
>> + int num_cpus = ltb->num_cpus;
>> + int i;
>
> Likewise.
>
>> +static inline void ltb_fanout(struct ltb_sched *ltb)
>> +{
>
> No inline please.
>
>> +/* How many classes within the same group want more bandwidth */
>> +static inline int bw_class_want_more_count(struct list_head *head)
>> +{
>> + int n = 0;
>> + struct ltb_class *cl;
>
> No inline, and reverse christmas tree ordering for local variables.
>
>> +/* Redistribute bandwidth among classes with the same priority */
>> +static int bw_redistribute_prio(struct list_head *lhead, int bw_available,
>> + int n, bool *all_reached_ceil)
>> +{
>> + struct ltb_class *cl;
>> + int avg = 0;
>> + int orig_bw_allocated;
>> + int safe_loop = 0;
>> +
>
> Likewise.
>
>> +static int bw_redistribute(struct ltb_sched *ltb, int bw_available)
>> +{
>> + int prio = 0;
>> + int n;
>> + int highest_non_saturated_prio = TC_LTB_NUMPRIO;
>> + bool all_reached_ceil;
>
> Likewise.
>
>> +static void bw_balance(struct ltb_sched *ltb)
>> +{
>> + struct ltb_class *cl;
>> + s64 link_speed = ltb->link_speed;
>> + int bw_available = link_speed;
>> + s64 total = 0;
>> + int high = TC_LTB_NUMPRIO;
>> + int is_light_traffic = 1;
>> + int i;
>
> Likewise.
>
> And so on and so forth. This code needs a lot of style fixes
> and removal of the per-qdisc kthread design.
>
> Thank you.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
@ 2020-07-07 15:42 ` kernel test robot
2020-07-06 18:37 ` Stephen Hemminger
` (3 subsequent siblings)
4 siblings, 0 replies; 22+ messages in thread
From: kernel test robot @ 2020-07-07 15:42 UTC (permalink / raw)
To: YU, Xiangning, netdev; +Cc: kbuild-all, clang-built-linux
[-- Attachment #1: Type: text/plain, Size: 3969 bytes --]
Hi Xiangning,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/YU-Xiangning/Lockless-Token-Bucket-LTB-Qdisc/20200707-020918
base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 5bd6ff0c6fe6c138aebe8d3bf7163420386c7ca7
config: x86_64-allyesconfig (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 02946de3802d3bc65bc9f2eb9b8d4969b5a7add8)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
>> net/sched/sch_ltb.c:832:6: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
if (!opt)
^~~~
net/sched/sch_ltb.c:882:9: note: uninitialized use occurs here
return err;
^~~
net/sched/sch_ltb.c:832:2: note: remove the 'if' if its condition is always false
if (!opt)
^~~~~~~~~
net/sched/sch_ltb.c:830:9: note: initialize the variable 'err' to silence this warning
int err;
^
= 0
1 warning generated.
vim +832 net/sched/sch_ltb.c
817
818 static int ltb_change_class(struct Qdisc *sch, u32 classid,
819 u32 parentid, struct nlattr **tca,
820 unsigned long *arg, struct netlink_ext_ack *extack)
821 {
822 struct ltb_sched *ltb = qdisc_priv(sch);
823 struct ltb_class *cl = (struct ltb_class *)*arg, *parent;
824 struct nlattr *opt = tca[TCA_OPTIONS];
825 struct nlattr *tb[TCA_LTB_MAX + 1];
826 struct tc_ltb_opt *lopt;
827 u64 rate64, ceil64;
828 struct psched_ratecfg ratecfg, ceilcfg;
829 u32 prio;
830 int err;
831
> 832 if (!opt)
833 goto failure;
834
835 err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt, ltb_policy,
836 NULL);
837 if (err < 0)
838 goto failure;
839
840 err = -EINVAL;
841 if (!tb[TCA_LTB_PARMS])
842 goto failure;
843
844 parent = parentid == TC_H_ROOT ? NULL : ltb_find_class(sch, parentid);
845
846 lopt = nla_data(tb[TCA_LTB_PARMS]);
847 if (!lopt->rate.rate || !lopt->ceil.rate)
848 goto failure;
849
850 rate64 = tb[TCA_LTB_RATE64] ? nla_get_u64(tb[TCA_LTB_RATE64]) : 0;
851 ceil64 = tb[TCA_LTB_CEIL64] ? nla_get_u64(tb[TCA_LTB_CEIL64]) : 0;
852 if (rate64 > ceil64)
853 goto failure;
854
855 psched_ratecfg_precompute(&ratecfg, &lopt->rate, rate64);
856 psched_ratecfg_precompute(&ceilcfg, &lopt->ceil, ceil64);
857 prio = lopt->prio;
858 if (prio >= TC_LTB_NUMPRIO)
859 prio = TC_LTB_NUMPRIO - 1;
860
861 if (!cl) {
862 if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
863 ltb_find_class(sch, classid)) {
864 err = -EINVAL;
865 goto failure;
866 }
867 cl = ltb_alloc_class(sch, parent, classid, &ratecfg, &ceilcfg,
868 prio);
869 if (!cl) {
870 err = -ENOBUFS;
871 goto failure;
872 }
873 } else {
874 /* Modify existing class */
875 ltb_modify_class(sch, cl, &ratecfg, &ceilcfg, prio);
876 }
877 qdisc_class_hash_grow(sch, &ltb->clhash);
878 *arg = (unsigned long)cl;
879 return 0;
880
881 failure:
882 return err;
883 }
884
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 75320 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
@ 2020-07-07 15:42 ` kernel test robot
0 siblings, 0 replies; 22+ messages in thread
From: kernel test robot @ 2020-07-07 15:42 UTC (permalink / raw)
To: kbuild-all
[-- Attachment #1: Type: text/plain, Size: 4083 bytes --]
Hi Xiangning,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/YU-Xiangning/Lockless-Token-Bucket-LTB-Qdisc/20200707-020918
base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 5bd6ff0c6fe6c138aebe8d3bf7163420386c7ca7
config: x86_64-allyesconfig (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 02946de3802d3bc65bc9f2eb9b8d4969b5a7add8)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
>> net/sched/sch_ltb.c:832:6: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
if (!opt)
^~~~
net/sched/sch_ltb.c:882:9: note: uninitialized use occurs here
return err;
^~~
net/sched/sch_ltb.c:832:2: note: remove the 'if' if its condition is always false
if (!opt)
^~~~~~~~~
net/sched/sch_ltb.c:830:9: note: initialize the variable 'err' to silence this warning
int err;
^
= 0
1 warning generated.
vim +832 net/sched/sch_ltb.c
817
818 static int ltb_change_class(struct Qdisc *sch, u32 classid,
819 u32 parentid, struct nlattr **tca,
820 unsigned long *arg, struct netlink_ext_ack *extack)
821 {
822 struct ltb_sched *ltb = qdisc_priv(sch);
823 struct ltb_class *cl = (struct ltb_class *)*arg, *parent;
824 struct nlattr *opt = tca[TCA_OPTIONS];
825 struct nlattr *tb[TCA_LTB_MAX + 1];
826 struct tc_ltb_opt *lopt;
827 u64 rate64, ceil64;
828 struct psched_ratecfg ratecfg, ceilcfg;
829 u32 prio;
830 int err;
831
> 832 if (!opt)
833 goto failure;
834
835 err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt, ltb_policy,
836 NULL);
837 if (err < 0)
838 goto failure;
839
840 err = -EINVAL;
841 if (!tb[TCA_LTB_PARMS])
842 goto failure;
843
844 parent = parentid == TC_H_ROOT ? NULL : ltb_find_class(sch, parentid);
845
846 lopt = nla_data(tb[TCA_LTB_PARMS]);
847 if (!lopt->rate.rate || !lopt->ceil.rate)
848 goto failure;
849
850 rate64 = tb[TCA_LTB_RATE64] ? nla_get_u64(tb[TCA_LTB_RATE64]) : 0;
851 ceil64 = tb[TCA_LTB_CEIL64] ? nla_get_u64(tb[TCA_LTB_CEIL64]) : 0;
852 if (rate64 > ceil64)
853 goto failure;
854
855 psched_ratecfg_precompute(&ratecfg, &lopt->rate, rate64);
856 psched_ratecfg_precompute(&ceilcfg, &lopt->ceil, ceil64);
857 prio = lopt->prio;
858 if (prio >= TC_LTB_NUMPRIO)
859 prio = TC_LTB_NUMPRIO - 1;
860
861 if (!cl) {
862 if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
863 ltb_find_class(sch, classid)) {
864 err = -EINVAL;
865 goto failure;
866 }
867 cl = ltb_alloc_class(sch, parent, classid, &ratecfg, &ceilcfg,
868 prio);
869 if (!cl) {
870 err = -ENOBUFS;
871 goto failure;
872 }
873 } else {
874 /* Modify existing class */
875 ltb_modify_class(sch, cl, &ratecfg, &ceilcfg, prio);
876 }
877 qdisc_class_hash_grow(sch, &ltb->clhash);
878 *arg = (unsigned long)cl;
879 return 0;
880
881 failure:
882 return err;
883 }
884
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 75320 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-06 20:34 ` YU, Xiangning
@ 2020-07-07 20:06 ` Cong Wang
2020-07-07 21:24 ` YU, Xiangning
0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-07 20:06 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Mon, Jul 6, 2020 at 1:34 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
>
>
> On 7/6/20 1:10 PM, Cong Wang wrote:
> > On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
> > <xiangning.yu@alibaba-inc.com> wrote:
> >> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
> >> + struct sk_buff **to_free)
> >> +{
> >> + struct ltb_sched *ltb = qdisc_priv(sch);
> >> + struct ltb_pcpu_sched *pcpu_q;
> >> + struct ltb_class *cl;
> >> + struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
> >> + int cpu;
> >> +
> >> + cpu = smp_processor_id();
> >> + pcpu_q = qdisc_priv(pcpu->qdisc);
> >> + ltb_skb_cb(skb)->cpu = cpu;
> >> +
> >> + cl = ltb_classify(sch, ltb, skb);
> >> + if (unlikely(!cl)) {
> >> + kfree_skb(skb);
> >> + return NET_XMIT_DROP;
> >> + }
> >> +
> >> + pcpu->active = true;
> >> + if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
> >> + kfree_skb(skb);
> >> + atomic64_inc(&cl->stat_drops);
> >> + return NET_XMIT_DROP;
> >> + }
> >
> >
> > How do you prevent out-of-order packets?
> >
>
> Hi Cong,
>
> That's a good question. In theory there will be out of order packets during aggregation. While keep in mind this is per-class aggregation, and it runs at a high frequency, that the chance to have any left over skbs from the same TCP flow on many CPUs is low.
>
> Also, based on real deployment experience, we haven't observed an increased out of order events even under heavy work load.
>
Yeah, but unless you always classify packets into proper flows, there
is always a chance to generate out-of-order packets here, which
means the default configuration is flawed.
> >
> >> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
> > ...
> >> + ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
> >> + tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
> >> + (unsigned long)ltb);
> >> +
> >> + /* Bandwidth balancer, this logic can be implemented in user-land. */
> >> + init_waitqueue_head(&ltb->bwbalancer_wq);
> >> + ltb->bwbalancer_task =
> >> + kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
> >> + wake_up_process(ltb->bwbalancer_task);
> >
> > Creating a kthread for each qdisc doesn't look good. Why do you
> > need a per-qdisc kthread or even a kernel thread at all?
> >
>
> We moved the bandwidth sharing out of the critical data path, that's why we use a kernel thread to balance the current maximum bandwidth used by each class periodically.
>
> This part could be implemented at as timer. What's your suggestion?
I doubt you can use a timer, as you call rtnl_trylock() there.
Why not use a delayed work?
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-07 20:06 ` Cong Wang
@ 2020-07-07 21:24 ` YU, Xiangning
2020-07-08 20:24 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-07 21:24 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/7/20 1:06 PM, Cong Wang wrote:
> On Mon, Jul 6, 2020 at 1:34 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>>
>>
>> On 7/6/20 1:10 PM, Cong Wang wrote:
>>> On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
>>> <xiangning.yu@alibaba-inc.com> wrote:
>>>> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
>>>> + struct sk_buff **to_free)
>>>> +{
>>>> + struct ltb_sched *ltb = qdisc_priv(sch);
>>>> + struct ltb_pcpu_sched *pcpu_q;
>>>> + struct ltb_class *cl;
>>>> + struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
>>>> + int cpu;
>>>> +
>>>> + cpu = smp_processor_id();
>>>> + pcpu_q = qdisc_priv(pcpu->qdisc);
>>>> + ltb_skb_cb(skb)->cpu = cpu;
>>>> +
>>>> + cl = ltb_classify(sch, ltb, skb);
>>>> + if (unlikely(!cl)) {
>>>> + kfree_skb(skb);
>>>> + return NET_XMIT_DROP;
>>>> + }
>>>> +
>>>> + pcpu->active = true;
>>>> + if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
>>>> + kfree_skb(skb);
>>>> + atomic64_inc(&cl->stat_drops);
>>>> + return NET_XMIT_DROP;
>>>> + }
>>>
>>>
>>> How do you prevent out-of-order packets?
>>>
>>
>> Hi Cong,
>>
>> That's a good question. In theory there will be out of order packets during aggregation. While keep in mind this is per-class aggregation, and it runs at a high frequency, that the chance to have any left over skbs from the same TCP flow on many CPUs is low.
>>
>> Also, based on real deployment experience, we haven't observed an increased out of order events even under heavy work load.
>>
>
> Yeah, but unless you always classify packets into proper flows, there
> is always a chance to generate out-of-order packets here, which
> means the default configuration is flawed.
>
The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
>
>>>
>>>> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
>>> ...
>>>> + ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
>>>> + tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
>>>> + (unsigned long)ltb);
>>>> +
>>>> + /* Bandwidth balancer, this logic can be implemented in user-land. */
>>>> + init_waitqueue_head(&ltb->bwbalancer_wq);
>>>> + ltb->bwbalancer_task =
>>>> + kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
>>>> + wake_up_process(ltb->bwbalancer_task);
>>>
>>> Creating a kthread for each qdisc doesn't look good. Why do you
>>> need a per-qdisc kthread or even a kernel thread at all?
>>>
>>
>> We moved the bandwidth sharing out of the critical data path, that's why we use a kernel thread to balance the current maximum bandwidth used by each class periodically.
>>
>> This part could be implemented at as timer. What's your suggestion?
>
> I doubt you can use a timer, as you call rtnl_trylock() there.
> Why not use a delayed work?
>
Thank you for the suggestion, I will replace it with a delayed work and send out the new patch.
- Xiangning
> Thanks.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-07 21:24 ` YU, Xiangning
@ 2020-07-08 20:24 ` Cong Wang
2020-07-08 21:07 ` YU, Xiangning
0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-08 20:24 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
>
> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
By "default configuration" I mean no additional configuration on top
of qdisc creation. If you have to rely on additional TC filters to
do the classification, it could be problematic. Same for setting
skb priority, right?
Also, you use a default class, this means all unclassified packets
share the same class, and a flow falls into this class could be still
out-of-order, right?
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-08 20:24 ` Cong Wang
@ 2020-07-08 21:07 ` YU, Xiangning
2020-07-10 5:04 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-08 21:07 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/8/20 1:24 PM, Cong Wang wrote:
> On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
>>
>> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
>
> By "default configuration" I mean no additional configuration on top
> of qdisc creation. If you have to rely on additional TC filters to
> do the classification, it could be problematic. Same for setting
> skb priority, right?
>
In this patch we don't rely on other TC filters. In our use case, socket priority is set on a per-flow basis, not per-skb basis.
> Also, you use a default class, this means all unclassified packets
> share the same class, and a flow falls into this class could be still
> out-of-order, right?
>
A flow will fall and only fall to this class. If we can keep the order within a flow, I'm not sure why we still have this issue?
Thanks,
- Xiangning
> Thanks.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-08 21:07 ` YU, Xiangning
@ 2020-07-10 5:04 ` Cong Wang
2020-07-10 5:20 ` Cong Wang
2020-07-10 5:48 ` YU, Xiangning
0 siblings, 2 replies; 22+ messages in thread
From: Cong Wang @ 2020-07-10 5:04 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Wed, Jul 8, 2020 at 2:07 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
>
>
> On 7/8/20 1:24 PM, Cong Wang wrote:
> > On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
> > <xiangning.yu@alibaba-inc.com> wrote:
> >>
> >> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
> >>
> >> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
> >
> > By "default configuration" I mean no additional configuration on top
> > of qdisc creation. If you have to rely on additional TC filters to
> > do the classification, it could be problematic. Same for setting
> > skb priority, right?
> >
>
> In this patch we don't rely on other TC filters. In our use case, socket priority is set on a per-flow basis, not per-skb basis.
Your use case is not the default configuration I mentioned.
>
> > Also, you use a default class, this means all unclassified packets
> > share the same class, and a flow falls into this class could be still
> > out-of-order, right?
> >
>
> A flow will fall and only fall to this class. If we can keep the order within a flow, I'm not sure why we still have this issue?
The issue here is obvious: you have to rely on either TC filters or
whatever sets skb priority to make packets in a flow in-order.
IOW, without these *additional* efforts, it is broken in terms of
out-of-order.
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 5:04 ` Cong Wang
@ 2020-07-10 5:20 ` Cong Wang
2020-07-10 6:06 ` YU, Xiangning
2020-07-10 5:48 ` YU, Xiangning
1 sibling, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-10 5:20 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> IOW, without these *additional* efforts, it is broken in terms of
> out-of-order.
>
Take a look at fq_codel, it provides a hash function for flow classification,
fq_codel_hash(), as default, thus its default configuration does not
have such issues. So, you probably want to provide such a hash
function too instead of a default class.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 5:04 ` Cong Wang
2020-07-10 5:20 ` Cong Wang
@ 2020-07-10 5:48 ` YU, Xiangning
2020-07-10 6:09 ` Cong Wang
1 sibling, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10 5:48 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/9/20 10:04 PM, Cong Wang wrote:
> On Wed, Jul 8, 2020 at 2:07 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>>
>>
>> On 7/8/20 1:24 PM, Cong Wang wrote:
>>> On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
>>> <xiangning.yu@alibaba-inc.com> wrote:
>>>>
>>>> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
>>>>
>>>> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
>>>
>>> By "default configuration" I mean no additional configuration on top
>>> of qdisc creation. If you have to rely on additional TC filters to
>>> do the classification, it could be problematic. Same for setting
>>> skb priority, right?
>>>
>>
>> In this patch we don't rely on other TC filters. In our use case, socket priority is set on a per-flow basis, not per-skb basis.
>
> Your use case is not the default configuration I mentioned.
>
>>
>>> Also, you use a default class, this means all unclassified packets
>>> share the same class, and a flow falls into this class could be still
>>> out-of-order, right?
>>>
>>
>> A flow will fall and only fall to this class. If we can keep the order within a flow, I'm not sure why we still have this issue?
>
> The issue here is obvious: you have to rely on either TC filters or
> whatever sets skb priority to make packets in a flow in-order.
> IOW, without these *additional* efforts, it is broken in terms of
> out-of-order.
>
Well, we do ask packets from a flow to be classified to a single class, not multiple ones. It doesn't have to be socket priority, it could be five tuple hash, or even container classid.
I think it's ok to have this requirement, even if we use htb, I would suggest the same. Why do you think this is a problem?
Thanks,
- Xiangning
> Thanks.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 5:20 ` Cong Wang
@ 2020-07-10 6:06 ` YU, Xiangning
2020-07-10 6:21 ` Cong Wang
0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10 6:06 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/9/20 10:20 PM, Cong Wang wrote:
> On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> IOW, without these *additional* efforts, it is broken in terms of
>> out-of-order.
>>
>
> Take a look at fq_codel, it provides a hash function for flow classification,
> fq_codel_hash(), as default, thus its default configuration does not
> have such issues. So, you probably want to provide such a hash
> function too instead of a default class.
>
If I understand this code correctly, this socket hash value identifies a flow. Essentially it serves the same purpose as socket priority. In this patch, we use a similar classification method like htb, but without filters.
We could provide a hash function, but I'm a bit confused about the problem we are trying to solve.
Thanks,
- Xiangning
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 5:48 ` YU, Xiangning
@ 2020-07-10 6:09 ` Cong Wang
2020-07-10 17:03 ` YU, Xiangning
0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-10 6:09 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Thu, Jul 9, 2020 at 10:49 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
> Well, we do ask packets from a flow to be classified to a single class, not multiple ones. It doesn't have to be socket priority, it could be five tuple hash, or even container classid.
I don't see how it is so in your code, without skb priority your code
simply falls back to default class:
+ /* Allow to select a class by setting skb->priority */
+ if (likely(skb->priority != 0)) {
+ cl = ltb_find_class(sch, skb->priority);
+ if (cl)
+ return cl;
+ }
+ return rcu_dereference_bh(ltb->default_cls);
Mind to be more specific here?
BTW, your qdisc does not even support TC filters, does it?
At least I don't see that tcf_classify() is called.
>
> I think it's ok to have this requirement, even if we use htb, I would suggest the same. Why do you think this is a problem?
Because HTB does not have a per-cpu queue for each class,
yours does, cl->aggr_queues[cpu], if your point here is why we
don't blame HTB.
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 6:06 ` YU, Xiangning
@ 2020-07-10 6:21 ` Cong Wang
2020-07-10 18:01 ` YU, Xiangning
0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-10 6:21 UTC (permalink / raw)
To: YU, Xiangning; +Cc: Linux Kernel Network Developers
On Thu, Jul 9, 2020 at 11:07 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
>
> On 7/9/20 10:20 PM, Cong Wang wrote:
> > On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >> IOW, without these *additional* efforts, it is broken in terms of
> >> out-of-order.
> >>
> >
> > Take a look at fq_codel, it provides a hash function for flow classification,
> > fq_codel_hash(), as default, thus its default configuration does not
> > have such issues. So, you probably want to provide such a hash
> > function too instead of a default class.
> >
> If I understand this code correctly, this socket hash value identifies a flow. Essentially it serves the same purpose as socket priority. In this patch, we use a similar classification method like htb, but without filters.
How is it any similar to HTB? HTB does not have a per-cpu queue
for each class. This is a huge difference.
>
> We could provide a hash function, but I'm a bit confused about the problem we are trying to solve.
Probably more than that, you need to ensure the packets in a same flow
are queued on the same queue.
Let say you have two packets P1 and P2 from the same flow (P1 is before P2),
you can classify them into the same class of course, but with per-cpu queues
they can be sent out in a wrong order too:
send(P1) on CPU1 -> classify() returns default class -> P1 is queued on
the CPU1 queue of default class
(Now process is migrated to CPU2)
send(P2) on CPU2 -> classify() returns default class -> P2 is queued on
the CPU2 queue of default class
P2 is dequeued on CPU2 before P1 dequeued on CPU1.
Now, out of order. :)
Hope it is clear now.
Thanks.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 6:09 ` Cong Wang
@ 2020-07-10 17:03 ` YU, Xiangning
0 siblings, 0 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10 17:03 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/9/20 11:09 PM, Cong Wang wrote:
> On Thu, Jul 9, 2020 at 10:49 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>> Well, we do ask packets from a flow to be classified to a single class, not multiple ones. It doesn't have to be socket priority, it could be five tuple hash, or even container classid.
>
> I don't see how it is so in your code, without skb priority your code
> simply falls back to default class:
>
> + /* Allow to select a class by setting skb->priority */
> + if (likely(skb->priority != 0)) {
> + cl = ltb_find_class(sch, skb->priority);
> + if (cl)
> + return cl;
> + }
> + return rcu_dereference_bh(ltb->default_cls);
>
> Mind to be more specific here?
>
The application will call setsockopt() to set priority. If no match is found, it will use the default class.
In real deployment we have another classifier. Please feel free to suggest what is best choice for it.
> BTW, your qdisc does not even support TC filters, does it?
> At least I don't see that tcf_classify() is called.
>
You are correct we don't use generic tc filters. I leave it unsupported in this patch too.
>
>>
>> I think it's ok to have this requirement, even if we use htb, I would suggest the same. Why do you think this is a problem?
>
> Because HTB does not have a per-cpu queue for each class,
> yours does, cl->aggr_queues[cpu], if your point here is why we
> don't blame HTB.
>
My point is: it's OK to ask a flow to be classified to a single class.
Thanks,
- Xiangning
> Thanks.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
2020-07-10 6:21 ` Cong Wang
@ 2020-07-10 18:01 ` YU, Xiangning
0 siblings, 0 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10 18:01 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers
On 7/9/20 11:21 PM, Cong Wang wrote:
> On Thu, Jul 9, 2020 at 11:07 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>>
>> On 7/9/20 10:20 PM, Cong Wang wrote:
>>> On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>> IOW, without these *additional* efforts, it is broken in terms of
>>>> out-of-order.
>>>>
>>>
>>> Take a look at fq_codel, it provides a hash function for flow classification,
>>> fq_codel_hash(), as default, thus its default configuration does not
>>> have such issues. So, you probably want to provide such a hash
>>> function too instead of a default class.
>>>
>> If I understand this code correctly, this socket hash value identifies a flow. Essentially it serves the same purpose as socket priority. In this patch, we use a similar classification method like htb, but without filters.
>
> How is it any similar to HTB? HTB does not have a per-cpu queue
> for each class. This is a huge difference.
I said 'similar classification method like htb'. Not similar to HTB in overall design. :)
>
>>
>> We could provide a hash function, but I'm a bit confused about the problem we are trying to solve.
>
> Probably more than that, you need to ensure the packets in a same flow
> are queued on the same queue.
>
> Let say you have two packets P1 and P2 from the same flow (P1 is before P2),
> you can classify them into the same class of course, but with per-cpu queues
> they can be sent out in a wrong order too:
>
> send(P1) on CPU1 -> classify() returns default class -> P1 is queued on
> the CPU1 queue of default class
>
> (Now process is migrated to CPU2)
>
> send(P2) on CPU2 -> classify() returns default class -> P2 is queued on
> the CPU2 queue of default class
>
> P2 is dequeued on CPU2 before P1 dequeued on CPU1.
>
> Now, out of order. :)
>
> Hope it is clear now.
The assumption is that the packet scheduler is faster than thread migration. If we constantly take that long to send one packet, we need to fix it.
Under light load, CPU1 is free enough to immediately trigger aggregation. Under heavy load, some other CPU could also help trigger aggregation in between.
As I responded in my first email, this is possible in theory. We are running a much lower kernel version, and the normal case is to have multiple flows/threads running in a large class, so far haven't seen big trouble with this approach.
But it's been years, if the above assumption changes, we do need to rethink about it.
Thanks,
- Xiangning
>
> Thanks.
>
^ permalink raw reply [flat|nested] 22+ messages in thread
end of thread, other threads:[~2020-07-10 18:01 UTC | newest]
Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
2020-07-06 18:32 ` Stephen Hemminger
2020-07-06 18:37 ` Stephen Hemminger
2020-07-06 19:56 ` YU, Xiangning
2020-07-06 20:10 ` Cong Wang
2020-07-06 20:34 ` YU, Xiangning
2020-07-07 20:06 ` Cong Wang
2020-07-07 21:24 ` YU, Xiangning
2020-07-08 20:24 ` Cong Wang
2020-07-08 21:07 ` YU, Xiangning
2020-07-10 5:04 ` Cong Wang
2020-07-10 5:20 ` Cong Wang
2020-07-10 6:06 ` YU, Xiangning
2020-07-10 6:21 ` Cong Wang
2020-07-10 18:01 ` YU, Xiangning
2020-07-10 5:48 ` YU, Xiangning
2020-07-10 6:09 ` Cong Wang
2020-07-10 17:03 ` YU, Xiangning
2020-07-06 20:29 ` David Miller
2020-07-06 22:08 ` YU, Xiangning
2020-07-07 15:42 ` kernel test robot
2020-07-07 15:42 ` kernel test robot
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.