* [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: YU, Xiangning @ 2020-07-06 18:08 UTC (permalink / raw)
  To: netdev

Lockless Token Bucket (LTB) is a qdisc implementation that controls the
use of outbound bandwidth on a shared link. Building on the lockless
qdisc infrastructure, and by decoupling rate limiting from bandwidth
sharing, LTB is designed to scale in cloud data centers.

Signed-off-by: Xiangning Yu <xiangning.yu@alibaba-inc.com>
---
 include/uapi/linux/pkt_sched.h |   35 ++
 net/sched/Kconfig              |   12 +
 net/sched/Makefile             |    1 +
 net/sched/sch_ltb.c            | 1280 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1328 insertions(+)
 create mode 100644 net/sched/sch_ltb.c

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9e7c2c6..310a627 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -447,6 +447,41 @@ struct tc_htb_xstats {
 	__s32 ctokens;
 };
 
+/* LTB section */
+
+#define TC_LTB_PROTOVER	3 /* the same as LTB and TC's major */
+#define TC_LTB_NUMPRIO	16
+enum {
+	TCA_LTB_UNSPEC,
+	TCA_LTB_PARMS,
+	TCA_LTB_INIT,
+	TCA_LTB_RATE64,
+	TCA_LTB_CEIL64,
+	TCA_LTB_PAD,
+	__TCA_LTB_MAX,
+};
+#define TCA_LTB_MAX (__TCA_LTB_MAX - 1)
+
+struct tc_ltb_opt {
+	struct tc_ratespec rate;
+	struct tc_ratespec ceil;
+	__u64 measured;
+	__u64 allocated;
+	__u64 high_water;
+	__u32 prio;
+};
+
+struct tc_ltb_glob {
+	__u32 version;          /* to match LTB/TC */
+	__u32 defcls;           /* default class number */
+};
+
+struct tc_ltb_xstats {
+	__u64 measured;
+	__u64 allocated;
+	__u64 high_water;
+};
+
 /* HFSC section */
 
 struct tc_hfsc_qopt {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a3b37d8..9a8adb6 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -76,6 +76,18 @@ config NET_SCH_HTB
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_htb.
 
+config NET_SCH_LTB
+	tristate "Lockless Token Bucket (LTB)"
+	help
+	  Say Y here if you want to use the Lockless Token Bucket (LTB)
+	  packet scheduling algorithm.
+
+	  LTB is very similar to HTB in its goals, but it has a different
+	  implementation and a different algorithm.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_ltb.
+
 config NET_SCH_HFSC
 	tristate "Hierarchical Fair Service Curve (HFSC)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 66bbf9a..6caa34d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_NET_ACT_GATE)	+= act_gate.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
+obj-$(CONFIG_NET_SCH_LTB)	+= sch_ltb.o
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
diff --git a/net/sched/sch_ltb.c b/net/sched/sch_ltb.c
new file mode 100644
index 0000000..494b15f
--- /dev/null
+++ b/net/sched/sch_ltb.c
@@ -0,0 +1,1280 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* net/sched/sch_ltb.c Lockless Token Bucket.
+ *
+ * Authors:	Xiangning Yu <xiangning.yu@alibaba-inc.com>
+ *		Ke Ma <k.ma@alibaba-inc.com>
+ *		Jianjun Duan <jianjun.duan@alibaba-inc.com>
+ *		Kun Liu <shubo.lk@alibaba-inc.com>
+ */
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/if_vlan.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/atomic.h>
+#include <linux/kfifo.h>
+#include <linux/kallsyms.h>
+#include <linux/irq_work.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/ethtool.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+#define	LTB_VERSION		0x30001
+#define	LTB_CLASS_CONDEMED	1
+#define	HIGH_FREQ_INTERVAL	1000	/* ns */
+#define	LOW_FREQ_INTERVAL	50	/* sampling rate, in ms */
+#define	SHADOW_CLASSID		0
+
+#define	BYTES_PER_JIFF(bps)	((bps) / HZ)
+#define	BYTES_PER_INTERVAL(bps)	(LOW_FREQ_INTERVAL * BYTES_PER_JIFF(bps))
+#define	MINBW			(10 * 1000 * 1000L)
+#define	HIGH_THRESHOLD		80
+#define	SUPPRESS_THRESHOLD	90
+#define	MAX_CPU_COUNT		128	/* make it dynamic */
+#define	SKB_QLEN		512
+#define	NOW()			(jiffies / LOW_FREQ_INTERVAL)
+#define	BPS2MBPS(x)		((x) * 8 / 1000000) /* Bps to Mbps */
+
+static struct Qdisc_ops ltb_pcpu_qdisc_ops;
+
+static const struct nla_policy ltb_policy[TCA_LTB_MAX + 1] = {
+	[TCA_LTB_PARMS]	= { .len = sizeof(struct tc_ltb_opt) },
+	[TCA_LTB_INIT] = { .len = sizeof(struct tc_ltb_glob) },
+	[TCA_LTB_RATE64] = { .type = NLA_U64 },
+	[TCA_LTB_CEIL64] = { .type = NLA_U64 },
+};
+
+struct ltb_class {
+	struct Qdisc_class_common common;
+	struct psched_ratecfg ratecfg;
+	struct psched_ratecfg ceilcfg;
+	u32 prio;
+	struct ltb_class *parent;
+	struct Qdisc *qdisc;
+	struct Qdisc *root_qdisc;
+	u32 classid;
+	struct list_head pnode;
+	unsigned long state; ____cacheline_aligned_in_smp
+
+	/* Aggr/drain context only */
+	s64 next_timestamp; ____cacheline_aligned_in_smp
+	int num_cpus;
+	int last_cpu;
+	s64 bw_used;
+	s64 last_bytes;
+	s64 last_timestamp;
+	s64 stat_bytes;
+	s64 stat_packets;
+	atomic64_t stat_drops;
+
+	/* Balance thread only */
+	s64 rate; ____cacheline_aligned_in_smp
+	s64 ceil;
+	s64 high_water;
+	int drop_delay;
+	s64 bw_allocated;
+	bool want_more;
+
+	/* Shared b/w aggr/drain thread and balancer */
+	unsigned long curr_interval; ____cacheline_aligned_in_smp
+	s64 bw_measured;	/* Measured actual bandwidth */
+	s64 maxbw;	/* Calculated bandwidth */
+
+	STRUCT_KFIFO(struct sk_buff *, SKB_QLEN) aggr_queues[MAX_CPU_COUNT];
+	____cacheline_aligned_in_smp
+	STRUCT_KFIFO(struct sk_buff *, SKB_QLEN * MAX_CPU_COUNT) drain_queue;
+	____cacheline_aligned_in_smp
+	STRUCT_KFIFO(struct sk_buff *, SKB_QLEN) fanout_queues[MAX_CPU_COUNT];
+	____cacheline_aligned_in_smp
+
+	struct tasklet_struct aggr_tasklet;
+	struct hrtimer aggr_timer;
+};
+
+struct ltb_pcpu_data {
+	struct Qdisc *qdisc; ____cacheline_aligned_in_smp
+	bool active;
+};
+
+/* Root qdisc private data */
+struct ltb_sched {
+	struct task_struct *bwbalancer_task;
+	wait_queue_head_t bwbalancer_wq;
+
+	int num_cpus;
+	s64 link_speed;
+	struct Qdisc *root_qdisc;
+	struct net_device *dev;
+
+	struct ltb_pcpu_data *pcpu_data; ____cacheline_aligned_in_smp
+	struct tasklet_struct fanout_tasklet;
+
+	struct ltb_class *default_cls;
+	struct ltb_class *shadow_cls; /* If there is no class created */
+	u32 default_classid;
+
+	rwlock_t prio_rows_lock;
+	struct list_head prio_rows[TC_LTB_NUMPRIO]; /* Priority list */
+	struct Qdisc_class_hash clhash;
+};
+
+/* Per-cpu qdisc private data */
+struct ltb_pcpu_sched {
+	struct ltb_sched *ltb;
+	struct Qdisc *qdisc;
+	int cpu;
+	struct irq_work fanout_irq_work;
+	s64 last_irq_timestamp;
+};
+
+/* The cpu where skb is from */
+struct ltb_skb_cb {
+	int cpu;
+};
+
+static inline struct ltb_skb_cb *ltb_skb_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct ltb_skb_cb));
+	return (struct ltb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline s64 get_linkspeed(struct net_device *dev)
+{
+	struct ethtool_link_ksettings ecmd;
+
+	ASSERT_RTNL();
+	if (netif_running(dev) && !__ethtool_get_link_ksettings(dev, &ecmd))
+		/* Convert to bytes per second */
+		return ecmd.base.speed * 1000 * 1000L / 8;
+	return 0;
+}
+
+static int ltb_update_linkspeed(struct ltb_sched *ltb)
+{
+	s64 linkspeed;
+
+	/* Avoid race with kthread_stop() */
+	if (!rtnl_trylock())
+		return -1;
+
+	linkspeed = get_linkspeed(ltb->dev);
+	if (ltb->link_speed != linkspeed)
+		ltb->link_speed = linkspeed;
+	rtnl_unlock();
+	return 0;
+}
+
+static inline int ltb_drain(struct ltb_class *cl)
+{
+	typeof(&cl->drain_queue) queue;
+	struct sk_buff *skb;
+	int npkts, bytes;
+	unsigned long now = NOW();
+	int cpu;
+	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
+	struct ltb_pcpu_sched *pcpu_q;
+	s64 timestamp;
+	bool need_watchdog = false;
+	struct cpumask cpumask;
+
+	npkts = 0;
+	bytes = 0;
+	cpumask_clear(&cpumask);
+	queue = &cl->drain_queue;
+	while (kfifo_peek(queue, &skb) > 0) {
+		int len = qdisc_pkt_len(skb);
+
+		if (cl->curr_interval != now) {
+			cl->curr_interval = now;
+			timestamp = ktime_get_ns();
+			cl->bw_measured = (cl->stat_bytes - cl->last_bytes) *
+				NSEC_PER_SEC / (timestamp - cl->last_timestamp);
+			cl->last_bytes = cl->stat_bytes;
+			cl->last_timestamp = timestamp;
+			cl->bw_used = 0;
+		} else if (len + cl->bw_used > cl->maxbw) {
+			need_watchdog = true;
+			break;
+		}
+		kfifo_skip(queue);
+		cl->bw_used += len;
+
+		/* Fanout */
+		cpu = ltb_skb_cb(skb)->cpu;
+		ltb_skb_cb(skb)->cpu = 0;
+		if (unlikely(kfifo_put(&cl->fanout_queues[cpu], skb) == 0)) {
+			kfree_skb(skb);
+			atomic64_inc(&cl->stat_drops);
+		} else {
+			/* Account for Generic Segmentation Offload (GSO). */
+			cl->stat_bytes += len;
+			cl->stat_packets += skb_is_gso(skb) ?
+			    skb_shinfo(skb)->gso_segs : 1;
+			cpumask_set_cpu(cpu, &cpumask);
+		}
+	}
+
+	for_each_cpu(cpu, &cpumask) {
+		struct Qdisc *q = per_cpu_ptr(ltb->pcpu_data, cpu)->qdisc;
+
+		pcpu_q = (struct ltb_pcpu_sched *)qdisc_priv(q);
+		if (!(q->state & __QDISC_STATE_SCHED) && !qdisc_is_running(q))
+			irq_work_queue_on(&pcpu_q->fanout_irq_work, cpu);
+	}
+
+	return need_watchdog;
+}
+
+static void ltb_aggregate(struct ltb_class *cl)
+{
+	s64 timestamp = ktime_get_ns();
+	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
+	int num_cpus = ltb->num_cpus;
+	int i;
+
+	/* The worker might wake up more often than required */
+	if (cl->next_timestamp > timestamp)
+		/* Try again to keep the pipeline running */
+		goto watchdog;
+
+	cl->next_timestamp = timestamp + HIGH_FREQ_INTERVAL;
+
+	/* Aggregate sk_buff from all CPUs. The memory footprint here should
+	 * be fine because we don't touch each packet.
+	 *
+	 * It's possible to see out-of-order packets here. But within 1us
+	 * there won't be many packets from a single flow, and the Linux
+	 * scheduler is not expected to reschedule an application that often
+	 * within this tiny time gap, i.e. 1/1000 jiffies.
+	 */
+	for (i = 0; i < num_cpus; i++) {
+		/* Process CPUs in a round-robin fashion */
+		typeof(&cl->aggr_queues[0]) queue;
+		int queue_len, drain_room;
+		int j;
+
+		queue = &cl->aggr_queues[(i + cl->last_cpu) % num_cpus];
+		queue_len = kfifo_len(queue);
+		drain_room = kfifo_avail(&cl->drain_queue);
+		if (drain_room == 0)
+			break;
+
+		queue_len = queue_len < drain_room ? queue_len : drain_room;
+		for (j = 0; j < queue_len; j++) {
+			struct sk_buff *skb;
+
+			if (kfifo_get(queue, &skb)) {
+				if (unlikely(kfifo_put(&cl->drain_queue,
+						       skb) == 0)) {
+					kfree_skb(skb);
+					atomic64_inc(&cl->stat_drops);
+				}
+			}
+		}
+	}
+	cl->last_cpu++;
+	if (cl->last_cpu == num_cpus)
+		cl->last_cpu = 0;
+
+	if (ltb_drain(cl) == false)
+		return;
+
+watchdog:
+	if (!test_bit(LTB_CLASS_CONDEMED, &cl->state))
+		hrtimer_start(&cl->aggr_timer,
+			      ns_to_ktime(1000 + ktime_get_ns()),
+			      HRTIMER_MODE_ABS_PINNED);
+}
+
+static enum hrtimer_restart ltb_aggr_watchdog(struct hrtimer *timer)
+{
+	struct ltb_class *cl = container_of(timer,
+			     struct ltb_class, aggr_timer);
+
+	if (!test_bit(LTB_CLASS_CONDEMED, &cl->state))
+		tasklet_schedule(&cl->aggr_tasklet);
+
+	return HRTIMER_NORESTART;
+}
+
+static void ltb_aggr_tasklet(unsigned long arg)
+{
+	struct ltb_class *cl = (struct ltb_class *)arg;
+
+	rcu_read_lock_bh();
+	if (!test_bit(LTB_CLASS_CONDEMED, &cl->state))
+		ltb_aggregate(cl);
+	rcu_read_unlock_bh();
+}
+
+static inline void ltb_fanout(struct ltb_sched *ltb)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < ltb->num_cpus; cpu++) {
+		struct Qdisc *q = per_cpu_ptr(ltb->pcpu_data, cpu)->qdisc;
+		struct ltb_pcpu_sched *pcpu_q =
+			(struct ltb_pcpu_sched *)qdisc_priv(q);
+
+		if (q->q.qlen > 0 && !(q->state & __QDISC_STATE_SCHED) &&
+		    !qdisc_is_running(q))
+			irq_work_queue_on(&pcpu_q->fanout_irq_work, cpu);
+	}
+}
+
+static void ltb_fanout_tasklet(unsigned long data)
+{
+	struct ltb_sched *ltb = (struct ltb_sched *)data;
+
+	ltb_fanout(ltb);
+}
+
+static void ltb_fanout_irq_tx_func(struct irq_work *work)
+{
+	struct ltb_pcpu_sched *pcpu_q =
+	    container_of(work, struct ltb_pcpu_sched, fanout_irq_work);
+
+	__netif_schedule(pcpu_q->qdisc);
+}
+
+/* How many classes within the same group want more bandwidth */
+static inline int bw_class_want_more_count(struct list_head *head)
+{
+	int n = 0;
+	struct ltb_class *cl;
+
+	list_for_each_entry(cl, head, pnode) {
+		if (cl->want_more)
+			n++;
+	}
+	return n;
+}
+
+/* Redistribute bandwidth among classes with the same priority */
+static int bw_redistribute_prio(struct list_head *lhead, int bw_available,
+				int n, bool *all_reached_ceil)
+{
+	struct ltb_class *cl;
+	int avg = 0;
+	int orig_bw_allocated;
+	int safe_loop = 0;
+
+	do {
+		if (n > 0)
+			avg = bw_available / n;
+		list_for_each_entry(cl, lhead, pnode) {
+			if (!cl->want_more)
+				continue;
+
+			/* Try to allocate as much as possible */
+			orig_bw_allocated = cl->bw_allocated;
+			cl->bw_allocated = min_t(s64, (cl->bw_allocated + avg),
+						 cl->ceil);
+			/* Significantly larger than high water */
+			if (cl->bw_allocated > cl->high_water * 120 / 100)
+				cl->bw_allocated = cl->high_water;
+			bw_available -= cl->bw_allocated - orig_bw_allocated;
+			if (cl->bw_allocated >= cl->high_water ||
+			    cl->bw_allocated == cl->ceil) {
+				cl->want_more = false;
+				n--;
+			}
+		}
+	} while (bw_available > 0 && n > 0 && safe_loop++ < 2);
+
+	*all_reached_ceil = true;
+	list_for_each_entry(cl, lhead, pnode) {
+		if (cl->bw_allocated != cl->ceil)
+			*all_reached_ceil = false;
+	}
+
+	return bw_available;
+}
+
+static void bw_suppress_lower(struct ltb_sched *ltb, int high)
+{
+	int prio;
+
+	read_lock_bh(&ltb->prio_rows_lock);
+	for (prio = TC_LTB_NUMPRIO - 1; prio > high; prio--) {
+		struct ltb_class *cl;
+
+		list_for_each_entry(cl, &ltb->prio_rows[prio], pnode) {
+			if (cl->bw_allocated > cl->rate) {
+				cl->bw_allocated = max_t(s64,
+							 cl->bw_measured *
+							 90 / 100, cl->rate);
+			}
+		}
+	}
+	read_unlock_bh(&ltb->prio_rows_lock);
+}
+
+static int bw_redistribute(struct ltb_sched *ltb, int bw_available)
+{
+	int prio = 0;
+	int n;
+	int highest_non_saturated_prio = TC_LTB_NUMPRIO;
+	bool all_reached_ceil;
+
+	read_lock_bh(&ltb->prio_rows_lock);
+	for (; prio < TC_LTB_NUMPRIO; prio++) {
+		struct list_head *head = &ltb->prio_rows[prio];
+
+		all_reached_ceil = true;
+
+		n = bw_class_want_more_count(head);
+		bw_available = bw_redistribute_prio(head, bw_available,
+						    n, &all_reached_ceil);
+		if (!all_reached_ceil && highest_non_saturated_prio > prio)
+			highest_non_saturated_prio = prio;
+
+		if (bw_available < 0)
+			break;
+	}
+	read_unlock_bh(&ltb->prio_rows_lock);
+	return highest_non_saturated_prio;
+}
+
+static void bw_sync_all(struct ltb_sched *ltb, int bw_available,
+			int is_light_traffic)
+{
+	struct ltb_class *cl;
+	int i;
+
+	for (i = 0; i < ltb->clhash.hashsize; i++) {
+		hlist_for_each_entry_rcu(cl, &ltb->clhash.hash[i],
+					 common.hnode) {
+			if (cl->classid == SHADOW_CLASSID)
+				continue;
+
+			if (is_light_traffic)
+				cl->bw_allocated = min_t(s64, cl->ceil,
+							 cl->bw_allocated +
+							 bw_available);
+			cl->maxbw = BYTES_PER_INTERVAL((s64)cl->bw_allocated);
+			/* maxbw will become visible eventually. */
+			smp_mb();
+		}
+	}
+}
+
+static void bw_balance(struct ltb_sched *ltb)
+{
+	struct ltb_class *cl;
+	s64 link_speed = ltb->link_speed;
+	int bw_available = link_speed;
+	s64 total = 0;
+	int high = TC_LTB_NUMPRIO;
+	int is_light_traffic = 1;
+	int i;
+
+	if (unlikely(link_speed <= 0))
+		return;
+
+	for (i = 0; i < ltb->clhash.hashsize; i++) {
+		hlist_for_each_entry_rcu(cl, &ltb->clhash.hash[i],
+					 common.hnode) {
+			if (cl->classid == SHADOW_CLASSID)
+				continue;
+
+			/* It's been a while since the bw measurement stopped */
+			if (NOW() - cl->curr_interval > 2 &&
+			    cl->bw_measured != 0)
+				cl->bw_measured = 0;
+
+			if (cl->bw_measured > cl->high_water * 95 / 100) {
+				/* Increase */
+				if (cl->high_water < cl->rate)
+					cl->high_water = min_t(s64,
+							       cl->high_water *
+							       2, cl->rate);
+				else
+					cl->high_water =
+					    cl->high_water * 120 / 100;
+				cl->high_water = min_t(s64, cl->ceil,
+						       cl->high_water);
+				if (cl->drop_delay != 0)
+					cl->drop_delay = 0;
+			} else if (cl->bw_measured <
+			    cl->high_water * 85 / 100) {
+				/* Drop */
+				cl->drop_delay++;
+				if (cl->drop_delay == 5) {
+					cl->high_water =
+					    cl->bw_measured * 110 / 100;
+					cl->drop_delay = 0;
+				}
+			} else {
+				/* Stable */
+				cl->high_water = cl->bw_allocated;
+				if (cl->drop_delay != 0)
+					cl->drop_delay = 0;
+			}
+
+			cl->high_water = max_t(s64, cl->high_water, MINBW);
+			cl->bw_allocated = min_t(s64, cl->rate, cl->high_water);
+			bw_available -= cl->bw_allocated;
+			if (cl->bw_allocated < cl->high_water)
+				cl->want_more = true;
+			else
+				cl->want_more = false;
+			total += cl->bw_measured;
+		}
+	}
+
+	if (total > HIGH_THRESHOLD * ltb->link_speed / 100) {
+		is_light_traffic  = 0;
+
+		/* Redistribute the remaining bandwidth by priority
+		 */
+		if (bw_available > 0)
+			high = bw_redistribute(ltb, bw_available);
+
+		/* The link is nearly saturated, so we need to suppress
+		 * those classes that:
+		 *	- are of lower priority than the highest priority
+		 *	whose classes haven't all reached their ceiling, and
+		 *	- consume more than their rate.
+		 *
+		 * This will give the higher priority class a better chance
+		 * to gain full speed.
+		 */
+		if (total > SUPPRESS_THRESHOLD * ltb->link_speed / 100)
+			bw_suppress_lower(ltb, high);
+	}
+	bw_sync_all(ltb, bw_available, is_light_traffic);
+}
+
+static int ltb_bw_balancer_kthread(void *arg)
+{
+	struct ltb_sched *ltb = (struct ltb_sched *)arg;
+
+	for (;;) {
+		wait_event_interruptible_timeout(ltb->bwbalancer_wq,
+						 kthread_should_stop(),
+						 LOW_FREQ_INTERVAL);
+		if (kthread_should_stop())
+			break;
+
+		if (ltb_update_linkspeed(ltb) != 0)
+			continue;
+
+		rcu_read_lock_bh();
+		bw_balance(ltb);
+		rcu_read_unlock_bh();
+	}
+	return 0;
+}
+
+static int ltb_parse_opts(struct nlattr *opt, u32 *defcls)
+{
+	struct nlattr *tb[TCA_LTB_MAX + 1];
+	struct tc_ltb_glob *gopt;
+	int err;
+
+	err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt,
+					  ltb_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_LTB_INIT])
+		return -EINVAL;
+
+	gopt = nla_data(tb[TCA_LTB_INIT]);
+	if (gopt->version != LTB_VERSION >> 16)
+		return -EINVAL;
+
+	if (defcls)
+		*defcls = gopt->defcls;
+	return 0;
+}
+
+static int ltb_pcpu_init(struct Qdisc *sch, struct nlattr *opt,
+			 struct netlink_ext_ack *extack)
+{
+	struct ltb_pcpu_sched *pcpu_q =
+		(struct ltb_pcpu_sched *)qdisc_priv(sch);
+
+	memset(pcpu_q, 0, sizeof(*pcpu_q));
+	pcpu_q->qdisc = sch;
+	init_irq_work(&pcpu_q->fanout_irq_work, ltb_fanout_irq_tx_func);
+	return 0;
+}
+
+static struct sk_buff *ltb_pcpu_class_dequeue(struct ltb_pcpu_sched *pcpu_q,
+					      struct ltb_class *cl)
+{
+	struct sk_buff *skb;
+	typeof(&cl->fanout_queues[0]) queue;
+
+	queue = &cl->fanout_queues[pcpu_q->cpu];
+	if (kfifo_peek(queue, &skb) > 0) {
+		kfifo_skip(queue);
+		pcpu_q->qdisc->q.qlen--;
+		return skb;
+	}
+
+	return NULL;
+}
+
+static struct sk_buff *ltb_pcpu_dequeue(struct Qdisc *sch)
+{
+	struct ltb_sched *ltb;
+	struct ltb_pcpu_sched *pcpu_q;
+	struct ltb_class *cl;
+	struct sk_buff *skb;
+	int i;
+
+	pcpu_q = (struct ltb_pcpu_sched *)qdisc_priv(sch);
+	ltb = pcpu_q->ltb;
+
+	for (i = 0; i < ltb->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &ltb->clhash.hash[i], common.hnode) {
+			skb = ltb_pcpu_class_dequeue(pcpu_q, cl);
+			if (skb)
+				return skb;
+		}
+	}
+	return NULL;
+}
+
+static inline struct ltb_class *ltb_find_class(struct Qdisc *sch, u32 handle)
+{
+	struct ltb_sched *q = qdisc_priv(sch);
+	struct Qdisc_class_common *clc;
+
+	clc = qdisc_class_find(&q->clhash, handle);
+	if (!clc)
+		return NULL;
+
+	return container_of(clc, struct ltb_class, common);
+}
+
+static struct ltb_class *ltb_alloc_class(struct Qdisc *sch,
+					 struct ltb_class *parent, u32 classid,
+					 struct psched_ratecfg *ratecfg,
+					 struct psched_ratecfg *ceilcfg,
+					 u32 prio)
+{
+	struct ltb_sched *ltb  = qdisc_priv(sch);
+	struct ltb_class *cl;
+	int i;
+
+	if (ratecfg->rate_bytes_ps > ceilcfg->rate_bytes_ps ||
+	    prio < 0 || prio >= TC_LTB_NUMPRIO)
+		return NULL;
+
+	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+	if (!cl)
+		return NULL;
+
+	cl->common.classid = classid;
+	cl->parent = parent;
+	cl->ratecfg = *ratecfg;
+	cl->ceilcfg = *ceilcfg;
+	cl->prio = prio;
+	cl->classid = classid;
+	cl->root_qdisc = sch;
+	cl->num_cpus = ltb->num_cpus;
+	cl->last_cpu = 0;
+	cl->ceil = ceilcfg->rate_bytes_ps;
+	cl->rate = ratecfg->rate_bytes_ps;
+	cl->bw_allocated = ratecfg->rate_bytes_ps;
+	cl->high_water = cl->bw_allocated * 110 / 100;
+	cl->maxbw = BYTES_PER_INTERVAL((s64)ratecfg->rate_bytes_ps);
+
+	INIT_KFIFO(cl->drain_queue);
+	for (i = 0; i < cl->num_cpus; i++) {
+		INIT_KFIFO(cl->aggr_queues[i]);
+		INIT_KFIFO(cl->fanout_queues[i]);
+	}
+	hrtimer_init(&cl->aggr_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_ABS_PINNED);
+	cl->aggr_timer.function = ltb_aggr_watchdog;
+	tasklet_init(&cl->aggr_tasklet, ltb_aggr_tasklet,
+		     (unsigned long)cl);
+
+	if (classid == ltb->default_classid)
+		rcu_assign_pointer(ltb->default_cls, cl);
+	if (classid != SHADOW_CLASSID) {
+		write_lock_bh(&ltb->prio_rows_lock);
+		list_add(&cl->pnode, &ltb->prio_rows[prio]);
+		write_unlock_bh(&ltb->prio_rows_lock);
+	}
+
+	sch_tree_lock(sch);
+	qdisc_class_hash_insert(&ltb->clhash, &cl->common);
+	sch_tree_unlock(sch);
+
+	return cl;
+}
+
+static int ltb_modify_class(struct Qdisc *sch, struct ltb_class *cl,
+			    struct psched_ratecfg *ratecfg,
+			    struct psched_ratecfg *ceilcfg,
+			    u32 prio)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+
+	rcu_read_lock_bh();
+	cl->ratecfg = *ratecfg;
+	cl->ceilcfg = *ceilcfg;
+	cl->prio = prio;
+	cl->rate = ratecfg->rate_bytes_ps;
+	cl->ceil = ceilcfg->rate_bytes_ps;
+	cl->bw_allocated = ratecfg->rate_bytes_ps;
+	cl->high_water = cl->bw_allocated * 110 / 100;
+	cl->maxbw = BYTES_PER_INTERVAL((s64)ratecfg->rate_bytes_ps);
+
+	write_lock_bh(&ltb->prio_rows_lock);
+	list_del(&cl->pnode);
+	list_add(&cl->pnode, &ltb->prio_rows[prio]);
+	write_unlock_bh(&ltb->prio_rows_lock);
+
+	rcu_read_unlock_bh();
+
+	return 0;
+}
+
+static void ltb_destroy_class(struct Qdisc *sch, struct ltb_class *cl)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+	struct sk_buff *skb;
+	int i;
+
+	if (ltb->default_classid == cl->classid)
+		rcu_assign_pointer(ltb->default_cls, ltb->shadow_cls);
+	cl->state |= LTB_CLASS_CONDEMED;
+	if (cl->classid != SHADOW_CLASSID) {
+		write_lock_bh(&ltb->prio_rows_lock);
+		list_del(&cl->pnode);
+		write_unlock_bh(&ltb->prio_rows_lock);
+	}
+
+	hrtimer_cancel(&cl->aggr_timer);
+	tasklet_kill(&cl->aggr_tasklet);
+
+	/* Cleanup pending packets */
+	for (i = 0; i < cl->num_cpus; i++) {
+		while (kfifo_get(&cl->aggr_queues[i], &skb) > 0)
+			kfree_skb(skb);
+
+		while (kfifo_get(&cl->fanout_queues[i], &skb) > 0)
+			kfree_skb(skb);
+	}
+	while (kfifo_get(&cl->drain_queue, &skb) > 0)
+		kfree_skb(skb);
+
+	kfree(cl);
+}
+
+static int ltb_graft_class(struct Qdisc *sch, unsigned long arg,
+			   struct Qdisc *new, struct Qdisc **old,
+			   struct netlink_ext_ack *extack)
+{
+	struct ltb_class *cl = (struct ltb_class *)arg;
+
+	if (!new)
+		return -EINVAL;
+
+	*old = qdisc_replace(sch, new, &cl->qdisc);
+	return 0;
+}
+
+static struct Qdisc *ltb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct ltb_class *cl = (struct ltb_class *)arg;
+
+	return cl->qdisc;
+}
+
+static void ltb_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static unsigned long ltb_find(struct Qdisc *sch, u32 handle)
+{
+	return (unsigned long)ltb_find_class(sch, handle);
+}
+
+static int ltb_change_class(struct Qdisc *sch, u32 classid,
+			    u32 parentid, struct nlattr **tca,
+			    unsigned long *arg, struct netlink_ext_ack *extack)
+{
+	struct ltb_sched *ltb  = qdisc_priv(sch);
+	struct ltb_class *cl = (struct ltb_class *)*arg, *parent;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_LTB_MAX + 1];
+	struct tc_ltb_opt *lopt;
+	u64 rate64, ceil64;
+	struct psched_ratecfg ratecfg, ceilcfg;
+	u32 prio;
+	int err;
+
+	if (!opt)
+		goto failure;
+
+	err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt, ltb_policy,
+					  NULL);
+	if (err < 0)
+		goto failure;
+
+	err = -EINVAL;
+	if (!tb[TCA_LTB_PARMS])
+		goto failure;
+
+	parent = parentid == TC_H_ROOT ? NULL : ltb_find_class(sch, parentid);
+
+	lopt = nla_data(tb[TCA_LTB_PARMS]);
+	if (!lopt->rate.rate || !lopt->ceil.rate)
+		goto failure;
+
+	rate64 = tb[TCA_LTB_RATE64] ? nla_get_u64(tb[TCA_LTB_RATE64]) : 0;
+	ceil64 = tb[TCA_LTB_CEIL64] ? nla_get_u64(tb[TCA_LTB_CEIL64]) : 0;
+	if (rate64 > ceil64)
+		goto failure;
+
+	psched_ratecfg_precompute(&ratecfg, &lopt->rate, rate64);
+	psched_ratecfg_precompute(&ceilcfg, &lopt->ceil, ceil64);
+	prio = lopt->prio;
+	if (prio >= TC_LTB_NUMPRIO)
+		prio = TC_LTB_NUMPRIO - 1;
+
+	if (!cl) {
+		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
+		    ltb_find_class(sch, classid)) {
+			err = -EINVAL;
+			goto failure;
+		}
+		cl = ltb_alloc_class(sch, parent, classid, &ratecfg, &ceilcfg,
+				     prio);
+		if (!cl) {
+			err = -ENOBUFS;
+			goto failure;
+		}
+	} else {
+		/* Modify existing class */
+		ltb_modify_class(sch, cl, &ratecfg, &ceilcfg, prio);
+	}
+	qdisc_class_hash_grow(sch, &ltb->clhash);
+	*arg = (unsigned long)cl;
+	return 0;
+
+failure:
+	return err;
+}
+
+static int ltb_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+	struct ltb_class *cl = (struct ltb_class *)arg;
+
+	sch_tree_lock(sch);
+	if (cl->qdisc)
+		qdisc_purge_queue(cl->qdisc);
+	qdisc_class_hash_remove(&ltb->clhash, &cl->common);
+	sch_tree_unlock(sch);
+
+	ltb_destroy_class(sch, cl);
+	return 0;
+}
+
+static void ltb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct ltb_sched *q = qdisc_priv(sch);
+	struct ltb_class *cl;
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+			/* We don't want to walk the shadow class */
+			if (cl->classid == SHADOW_CLASSID)
+				continue;
+
+			if (arg->count < arg->skip) {
+				arg->count++;
+				continue;
+			}
+			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+static int ltb_dump_class(struct Qdisc *sch, unsigned long arg,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct ltb_class *cl = (struct ltb_class *)arg;
+	struct nlattr *nest;
+	struct tc_ltb_opt opt;
+
+	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
+	tcm->tcm_handle = cl->common.classid;
+
+	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	memset(&opt, 0, sizeof(opt));
+	psched_ratecfg_getrate(&opt.rate, &cl->ratecfg);
+	psched_ratecfg_getrate(&opt.ceil, &cl->ceilcfg);
+
+	opt.measured = BPS2MBPS(cl->bw_measured);
+	opt.allocated = BPS2MBPS(cl->bw_allocated);
+	opt.high_water = BPS2MBPS(cl->high_water);
+	opt.prio = cl->prio;
+
+	if (nla_put(skb, TCA_LTB_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if ((cl->ratecfg.rate_bytes_ps >= (1ULL << 32)) &&
+	    nla_put_u64_64bit(skb, TCA_LTB_RATE64, cl->ratecfg.rate_bytes_ps,
+			      TCA_LTB_PAD))
+		goto nla_put_failure;
+	if ((cl->ceilcfg.rate_bytes_ps >= (1ULL << 32)) &&
+	    nla_put_u64_64bit(skb, TCA_LTB_CEIL64, cl->ceilcfg.rate_bytes_ps,
+			      TCA_LTB_PAD))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static int ltb_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+				struct gnet_dump *d)
+{
+	struct ltb_class *cl = (struct ltb_class *)arg;
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue qstats;
+	struct tc_ltb_xstats xstats;
+
+	memset(&bstats, 0, sizeof(bstats));
+	bstats.bytes = cl->stat_bytes;
+	bstats.packets = cl->stat_packets;
+	memset(&qstats, 0, sizeof(qstats));
+	qstats.drops = cl->stat_drops.counter;
+	memset(&xstats, 0, sizeof(xstats));
+	xstats.measured = BPS2MBPS(cl->bw_measured);
+	xstats.allocated = BPS2MBPS(cl->bw_allocated);
+	xstats.high_water = BPS2MBPS(cl->high_water);
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &bstats) < 0 ||
+	    gnet_stats_copy_queue(d, NULL, &qstats, 0) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static struct ltb_class *ltb_classify(struct Qdisc *sch,
+				      struct ltb_sched *ltb,
+				      struct sk_buff *skb)
+{
+	struct ltb_class *cl;
+
+	/* Allow to select a class by setting skb->priority */
+	if (likely(skb->priority != 0)) {
+		cl = ltb_find_class(sch, skb->priority);
+		if (cl)
+			return cl;
+	}
+	return rcu_dereference_bh(ltb->default_cls);
+}
+
+static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
+		       struct sk_buff **to_free)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+	struct ltb_pcpu_sched *pcpu_q;
+	struct ltb_class *cl;
+	struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
+	int cpu;
+
+	cpu = smp_processor_id();
+	pcpu_q = qdisc_priv(pcpu->qdisc);
+	ltb_skb_cb(skb)->cpu = cpu;
+
+	cl = ltb_classify(sch, ltb, skb);
+	if (unlikely(!cl)) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	pcpu->active = true;
+	if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
+		kfree_skb(skb);
+		atomic64_inc(&cl->stat_drops);
+		return NET_XMIT_DROP;
+	}
+
+	sch->q.qlen = 1;
+	pcpu_q->qdisc->q.qlen++;
+	tasklet_schedule(&cl->aggr_tasklet);
+	return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *ltb_dequeue(struct Qdisc *sch)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+	struct ltb_pcpu_data *pcpu;
+
+	pcpu = this_cpu_ptr(ltb->pcpu_data);
+
+	if (likely(pcpu->active))
+		pcpu->active = false;
+	else
+		tasklet_schedule(&ltb->fanout_tasklet);
+
+	return NULL;
+}
+
+static void ltb_reset(struct Qdisc *sch)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+	struct ltb_class *cl;
+	int i;
+
+	sch->q.qlen = 0;
+	for (i = 0; i < ltb->num_cpus; i++)
+		qdisc_reset(per_cpu_ptr(ltb->pcpu_data, i)->qdisc);
+
+	for (i = 0; i < ltb->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &ltb->clhash.hash[i], common.hnode) {
+			if (cl->qdisc)
+				qdisc_reset(cl->qdisc);
+		}
+	}
+}
+
+static void ltb_destroy(struct Qdisc *sch)
+{
+	struct ltb_sched *ltb = qdisc_priv(sch);
+	struct hlist_node *tmp;
+	struct ltb_class *cl;
+	int i;
+
+	sch->q.qlen = 0;
+	ltb->default_cls = NULL;
+	ltb->shadow_cls = NULL;
+	tasklet_kill(&ltb->fanout_tasklet);
+	if (ltb->bwbalancer_task) {
+		kthread_stop(ltb->bwbalancer_task);
+		ltb->bwbalancer_task = NULL;
+	}
+
+	for (i = 0; i < ltb->num_cpus; i++)
+		qdisc_put(per_cpu_ptr(ltb->pcpu_data, i)->qdisc);
+
+	for (i = 0; i < ltb->clhash.hashsize; i++) {
+		hlist_for_each_entry_safe(cl, tmp, &ltb->clhash.hash[i],
+					  common.hnode)
+			ltb_destroy_class(sch, cl);
+	}
+	qdisc_class_hash_destroy(&ltb->clhash);
+	free_percpu(ltb->pcpu_data);
+}
+
+static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
+{
+	struct ltb_sched *ltb = (struct ltb_sched *)qdisc_priv(sch);
+	struct Qdisc *q;
+	int err, i;
+	struct ltb_pcpu_sched *pcpu_q;
+	struct net_device *dev = qdisc_dev(sch);
+	u32 default_classid = 0;
+	struct psched_ratecfg ratecfg;
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (opt) {
+		err = ltb_parse_opts(opt, &default_classid);
+		if (err != 0)
+			return err;
+	}
+
+	memset(ltb, 0, sizeof(*ltb));
+	rwlock_init(&ltb->prio_rows_lock);
+	for (i = 0; i < TC_LTB_NUMPRIO; i++)
+		INIT_LIST_HEAD(&ltb->prio_rows[i]);
+
+	ltb->root_qdisc = sch;
+	ltb->dev = dev;
+	ltb->num_cpus = num_online_cpus();
+	if (ltb->num_cpus > MAX_CPU_COUNT)
+		return -EOPNOTSUPP;
+
+	ltb->link_speed = get_linkspeed(ltb->dev);
+	if (ltb->link_speed <= 0)
+		pr_warn("Failed to obtain link speed\n");
+
+	err = qdisc_class_hash_init(&ltb->clhash);
+	if (err < 0)
+		return err;
+
+	ltb->pcpu_data = alloc_percpu_gfp(struct ltb_pcpu_data,
+					  GFP_KERNEL | __GFP_ZERO);
+	if (!ltb->pcpu_data) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	for (i = 0; i < ltb->num_cpus; i++) {
+		q = qdisc_create_dflt(sch->dev_queue,
+				      &ltb_pcpu_qdisc_ops, 0, NULL);
+		if (!q) {
+			err = -ENODEV;
+			goto error;
+		}
+		/* These cannot be initialized in qdisc_init() */
+		pcpu_q = (struct ltb_pcpu_sched *)qdisc_priv(q);
+		pcpu_q->cpu = i;
+		pcpu_q->ltb = ltb;
+
+		per_cpu_ptr(ltb->pcpu_data, i)->qdisc = q;
+		per_cpu_ptr(ltb->pcpu_data, i)->active = false;
+	}
+
+	ltb->default_classid = TC_H_MAKE(TC_H_MAJ(sch->handle),
+					 default_classid);
+	ratecfg.rate_bytes_ps = ltb->link_speed;
+	ltb->shadow_cls = ltb_alloc_class(sch, NULL, SHADOW_CLASSID,
+					  &ratecfg, &ratecfg, 0);
+	if (!ltb->shadow_cls) {
+		err = -EINVAL;
+		goto error;
+	}
+	ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
+	tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
+		     (unsigned long)ltb);
+
+	/* Bandwidth balancer, this logic can be implemented in user-land. */
+	init_waitqueue_head(&ltb->bwbalancer_wq);
+	ltb->bwbalancer_task =
+	    kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
+	wake_up_process(ltb->bwbalancer_task);
+
+	sch->flags |= TCQ_F_NOLOCK;
+	return 0;
+
+error:
+	for (i = 0; i < ltb->num_cpus; i++) {
+		struct ltb_pcpu_data *pcpu = per_cpu_ptr(ltb->pcpu_data, i);
+
+		if (pcpu->qdisc) {
+			qdisc_put(pcpu->qdisc);
+			pcpu->qdisc = NULL;
+		}
+	}
+	if (ltb->pcpu_data) {
+		free_percpu(ltb->pcpu_data);
+		ltb->pcpu_data = NULL;
+	}
+	if (ltb->bwbalancer_task) {
+		kthread_stop(ltb->bwbalancer_task);
+		ltb->bwbalancer_task = NULL;
+	}
+	qdisc_class_hash_destroy(&ltb->clhash);
+	return err;
+}
+
+static int ltb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct ltb_sched *ltb  = qdisc_priv(sch);
+	struct nlattr *nest;
+	struct tc_ltb_glob gopt;
+
+	gopt.version = LTB_VERSION;
+	gopt.defcls = ltb->default_classid;
+
+	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+	if (nla_put(skb, TCA_LTB_INIT, sizeof(gopt), &gopt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops ltb_pcpu_qdisc_ops __read_mostly = {
+	.cl_ops		= NULL,
+	.id		= "ltb_percore",
+	.priv_size	= sizeof(struct ltb_sched),
+	.enqueue	= NULL,
+	.dequeue	= ltb_pcpu_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.init		= ltb_pcpu_init,
+	.dump		= NULL,
+	.owner		= THIS_MODULE,
+};
+
+static const struct Qdisc_class_ops ltb_class_ops = {
+	.graft		= ltb_graft_class,
+	.leaf		= ltb_leaf,
+	.qlen_notify	= ltb_qlen_notify,
+	.find		= ltb_find,
+	.change		= ltb_change_class,
+	.delete		= ltb_delete_class,
+	.walk		= ltb_walk,
+	.dump		= ltb_dump_class,
+	.dump_stats	= ltb_dump_class_stats,
+};
+
+static struct Qdisc_ops ltb_qdisc_ops __read_mostly = {
+	.cl_ops		= &ltb_class_ops,
+	.id		= "ltb",
+	.priv_size	= sizeof(struct ltb_sched),
+	.enqueue	= ltb_enqueue,
+	.dequeue	= ltb_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.init		= ltb_init,
+	.reset		= ltb_reset,
+	.destroy	= ltb_destroy,
+	.dump		= ltb_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init ltb_module_init(void)
+{
+	return register_qdisc(&ltb_qdisc_ops);
+}
+
+static void __exit ltb_module_exit(void)
+{
+	unregister_qdisc(&ltb_qdisc_ops);
+}
+
+module_init(ltb_module_init)
+module_exit(ltb_module_exit)
+MODULE_LICENSE("GPL");
-- 
1.8.3.1



* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: Stephen Hemminger @ 2020-07-06 18:32 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: netdev

On Tue, 07 Jul 2020 02:08:13 +0800
"YU, Xiangning" <xiangning.yu@alibaba-inc.com> wrote:

> +static inline s64 get_linkspeed(struct net_device *dev)

This is not performance sensitive and should not be marked inline.


> +{
> +	struct ethtool_link_ksettings ecmd;
> +
> +	ASSERT_RTNL();
> +	if (netif_running(dev) && !__ethtool_get_link_ksettings(dev, &ecmd))
> +		/* Convert to bytes per second */
> +		return ecmd.base.speed * 1000 * 1000L / 8;

You need to be more careful in handling devices that report an unknown speed.

> +	return 0;
> +}
> +

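For reference, a more defensive variant might look like this (just a sketch;
SPEED_UNKNOWN is the ethtool sentinel for an unknown link speed, and ethtool
reports speed in Mbps):

static s64 get_linkspeed(struct net_device *dev)
{
	struct ethtool_link_ksettings ecmd;

	ASSERT_RTNL();
	if (netif_running(dev) &&
	    !__ethtool_get_link_ksettings(dev, &ecmd) &&
	    ecmd.base.speed != SPEED_UNKNOWN)
		/* ethtool reports Mbps; convert to bytes per second */
		return (s64)ecmd.base.speed * 1000 * 1000 / 8;
	return 0;	/* callers treat 0 as "speed unknown" */
}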

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: Stephen Hemminger @ 2020-07-06 18:37 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: netdev

On Tue, 07 Jul 2020 02:08:13 +0800
"YU, Xiangning" <xiangning.yu@alibaba-inc.com> wrote:

> +static inline int ltb_drain(struct ltb_class *cl)
> +{
> +	typeof(&cl->drain_queue) queue;
> +	struct sk_buff *skb;
> +	int npkts, bytes;
> +	unsigned long now = NOW();
> +	int cpu;
> +	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
> +	struct ltb_pcpu_sched *pcpu_q;
> +	s64 timestamp;
> +	bool need_watchdog = false;
> +	struct cpumask cpumask;
> +
> +	npkts = 0;
> +	bytes = 0;

It would be safer to use unsigned int for npkts and bytes.
These should never be negative.
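
E.g. (illustrative only):

	unsigned int npkts = 0, bytes = 0;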


* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: YU, Xiangning @ 2020-07-06 19:56 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev



On 7/6/20 11:37 AM, Stephen Hemminger wrote:
> On Tue, 07 Jul 2020 02:08:13 +0800
> "YU, Xiangning" <xiangning.yu@alibaba-inc.com> wrote:
> 
>> +static inline int ltb_drain(struct ltb_class *cl)
>> +{
>> +	typeof(&cl->drain_queue) queue;
>> +	struct sk_buff *skb;
>> +	int npkts, bytes;
>> +	unsigned long now = NOW();
>> +	int cpu;
>> +	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
>> +	struct ltb_pcpu_sched *pcpu_q;
>> +	s64 timestamp;
>> +	bool need_watchdog = false;
>> +	struct cpumask cpumask;
>> +
>> +	npkts = 0;
>> +	bytes = 0;
> 
> It would be safer to use unsigned int for npkts and bytes.
> These should never be negative.
> 

Thank you Stephen. I will make these changes, including those mentioned in your previous email.
 
- Xiangning


* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: Cong Wang @ 2020-07-06 20:10 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
> +                      struct sk_buff **to_free)
> +{
> +       struct ltb_sched *ltb = qdisc_priv(sch);
> +       struct ltb_pcpu_sched *pcpu_q;
> +       struct ltb_class *cl;
> +       struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
> +       int cpu;
> +
> +       cpu = smp_processor_id();
> +       pcpu_q = qdisc_priv(pcpu->qdisc);
> +       ltb_skb_cb(skb)->cpu = cpu;
> +
> +       cl = ltb_classify(sch, ltb, skb);
> +       if (unlikely(!cl)) {
> +               kfree_skb(skb);
> +               return NET_XMIT_DROP;
> +       }
> +
> +       pcpu->active = true;
> +       if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
> +               kfree_skb(skb);
> +               atomic64_inc(&cl->stat_drops);
> +               return NET_XMIT_DROP;
> +       }


How do you prevent out-of-order packets?


> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
...
> +       ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
> +       tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
> +                    (unsigned long)ltb);
> +
> +       /* Bandwidth balancer, this logic can be implemented in user-land. */
> +       init_waitqueue_head(&ltb->bwbalancer_wq);
> +       ltb->bwbalancer_task =
> +           kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
> +       wake_up_process(ltb->bwbalancer_task);

Creating a kthread for each qdisc doesn't look good. Why do you
need a per-qdisc kthread or even a kernel thread at all?

Thanks.


* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: David Miller @ 2020-07-06 20:29 UTC (permalink / raw)
  To: xiangning.yu; +Cc: netdev

From: "YU, Xiangning" <xiangning.yu@alibaba-inc.com>
Date: Tue, 07 Jul 2020 02:08:13 +0800

> Lockless Token Bucket (LTB) is a qdisc implementation that controls the
> use of outbound bandwidth on a shared link. With the help of lockless
> qdisc, and by decoupling rate limiting and bandwidth sharing, LTB is
> designed to scale in the cloud data centers.
> 
> Signed-off-by: Xiangning Yu <xiangning.yu@alibaba-inc.com>

I'm very skeptical about having a kthread for each qdisc; that doesn't
sound like a good idea at all.

Also:

> +static inline struct ltb_skb_cb *ltb_skb_cb(const struct sk_buff *skb)

No inline functions in foo.c files please.

> +static inline s64 get_linkspeed(struct net_device *dev)

Likewise.

> +static inline int ltb_drain(struct ltb_class *cl)
> +{
> +	typeof(&cl->drain_queue) queue;

Use the actual type not typeof().

> +	struct sk_buff *skb;
> +	int npkts, bytes;
> +	unsigned long now = NOW();
> +	int cpu;
> +	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
> +	struct ltb_pcpu_sched *pcpu_q;
> +	s64 timestamp;
> +	bool need_watchdog = false;
> +	struct cpumask cpumask;

Please order local variables in reverse christmas tree order.
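
For reference, reverse christmas tree order just means sorting the
declarations from longest line to shortest, e.g. for ltb_drain()
(illustrative only):

	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
	struct ltb_pcpu_sched *pcpu_q;
	bool need_watchdog = false;
	unsigned long now = NOW();
	struct cpumask cpumask;
	struct sk_buff *skb;
	int npkts, bytes;
	s64 timestamp;
	int cpu;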

> +static void ltb_aggregate(struct ltb_class *cl)
> +{
> +	s64 timestamp = ktime_get_ns();
> +	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
> +	int num_cpus = ltb->num_cpus;
> +	int i;

Likewise.

> +static inline void ltb_fanout(struct ltb_sched *ltb)
> +{

No inline please.

> +/* How many classes within the same group want more bandwidth */
> +static inline int bw_class_want_more_count(struct list_head *head)
> +{
> +	int n = 0;
> +	struct ltb_class *cl;

No inline, and reverse christmas tree ordering for local variables.

> +/* Redistribute bandwidth among classes with the same priority */
> +static int bw_redistribute_prio(struct list_head *lhead, int bw_available,
> +				int n, bool *all_reached_ceil)
> +{
> +	struct ltb_class *cl;
> +	int avg = 0;
> +	int orig_bw_allocated;
> +	int safe_loop = 0;
> +

Likewise.

> +static int bw_redistribute(struct ltb_sched *ltb, int bw_available)
> +{
> +	int prio = 0;
> +	int n;
> +	int highest_non_saturated_prio = TC_LTB_NUMPRIO;
> +	bool all_reached_ceil;

Likewise.

> +static void bw_balance(struct ltb_sched *ltb)
> +{
> +	struct ltb_class *cl;
> +	s64 link_speed = ltb->link_speed;
> +	int bw_available = link_speed;
> +	s64 total = 0;
> +	int high = TC_LTB_NUMPRIO;
> +	int is_light_traffic = 1;
> +	int i;

Likewise.

And so on and so forth.  This code needs a lot of style fixes
and removal of the per-qdisc kthread design.

Thank you.


* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: YU, Xiangning @ 2020-07-06 20:34 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers



On 7/6/20 1:10 PM, Cong Wang wrote:
> On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
>> +                      struct sk_buff **to_free)
>> +{
>> +       struct ltb_sched *ltb = qdisc_priv(sch);
>> +       struct ltb_pcpu_sched *pcpu_q;
>> +       struct ltb_class *cl;
>> +       struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
>> +       int cpu;
>> +
>> +       cpu = smp_processor_id();
>> +       pcpu_q = qdisc_priv(pcpu->qdisc);
>> +       ltb_skb_cb(skb)->cpu = cpu;
>> +
>> +       cl = ltb_classify(sch, ltb, skb);
>> +       if (unlikely(!cl)) {
>> +               kfree_skb(skb);
>> +               return NET_XMIT_DROP;
>> +       }
>> +
>> +       pcpu->active = true;
>> +       if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
>> +               kfree_skb(skb);
>> +               atomic64_inc(&cl->stat_drops);
>> +               return NET_XMIT_DROP;
>> +       }
> 
> 
> How do you prevent out-of-order packets?
> 

Hi Cong,

That's a good question. In theory there can be out-of-order packets during aggregation. But keep in mind this is per-class aggregation, and it runs at a high frequency, so the chance of having leftover skbs from the same TCP flow spread across many CPUs is low.

Also, based on real deployment experience, we haven't observed an increase in out-of-order events even under heavy workload.

> 
>> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
> ...
>> +       ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
>> +       tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
>> +                    (unsigned long)ltb);
>> +
>> +       /* Bandwidth balancer, this logic can be implemented in user-land. */
>> +       init_waitqueue_head(&ltb->bwbalancer_wq);
>> +       ltb->bwbalancer_task =
>> +           kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
>> +       wake_up_process(ltb->bwbalancer_task);
> 
> Creating a kthread for each qdisc doesn't look good. Why do you
> need a per-qdisc kthread or even a kernel thread at all?
> 

We moved the bandwidth sharing out of the critical data path; that's why we use a kernel thread to periodically balance the maximum bandwidth each class may use.

This part could be implemented as a timer. What's your suggestion?
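
For example, something along these lines (just a sketch, not tested; the
balancer needs process context for rtnl_trylock(), so a delayed work is
shown instead of a raw timer, and balance_work would be a new field in
struct ltb_sched):

static void ltb_balance_work(struct work_struct *work)
{
	struct ltb_sched *ltb = container_of(to_delayed_work(work),
					     struct ltb_sched, balance_work);

	/* Same body as one iteration of the kthread loop */
	if (ltb_update_linkspeed(ltb) == 0) {
		rcu_read_lock_bh();
		bw_balance(ltb);
		rcu_read_unlock_bh();
	}
	schedule_delayed_work(&ltb->balance_work, LOW_FREQ_INTERVAL);
}

/* ltb_init():    INIT_DELAYED_WORK(&ltb->balance_work, ltb_balance_work);
 *                schedule_delayed_work(&ltb->balance_work, LOW_FREQ_INTERVAL);
 * ltb_destroy(): cancel_delayed_work_sync(&ltb->balance_work);
 */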

Thanks,
- Xiangning

> Thanks.
> 


* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: YU, Xiangning @ 2020-07-06 22:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev



On 7/6/20 1:29 PM, David Miller wrote:
> From: "YU, Xiangning" <xiangning.yu@alibaba-inc.com>
> Date: Tue, 07 Jul 2020 02:08:13 +0800
> 
>> Lockless Token Bucket (LTB) is a qdisc implementation that controls the
>> use of outbound bandwidth on a shared link. With the help of lockless
>> qdisc, and by decoupling rate limiting and bandwidth sharing, LTB is
>> designed to scale in the cloud data centers.
>>
>> Signed-off-by: Xiangning Yu <xiangning.yu@alibaba-inc.com>
> 
> I'm very skeptical about having a kthread for each qdisc, that doesn't
> sound like a good idea at all.
> 
> Also:
> 

Hi David,

I can change the kthread to a timer, similar to what sfq does for perturbation. I hope that would be OK.

Will take care of the other comments too.

Thanks,
- Xiangning

>> +static inline struct ltb_skb_cb *ltb_skb_cb(const struct sk_buff *skb)
> 
> No inline functions in foo.c files please.
> 
>> +static inline s64 get_linkspeed(struct net_device *dev)
> 
> Likewise.
> 
>> +static inline int ltb_drain(struct ltb_class *cl)
>> +{
>> +	typeof(&cl->drain_queue) queue;
> 
> Use the actual type not typeof().
> 
>> +	struct sk_buff *skb;
>> +	int npkts, bytes;
>> +	unsigned long now = NOW();
>> +	int cpu;
>> +	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
>> +	struct ltb_pcpu_sched *pcpu_q;
>> +	s64 timestamp;
>> +	bool need_watchdog = false;
>> +	struct cpumask cpumask;
> 
> Please order local variables in reverse christmas tree order.
> 
>> +static void ltb_aggregate(struct ltb_class *cl)
>> +{
>> +	s64 timestamp = ktime_get_ns();
>> +	struct ltb_sched *ltb = qdisc_priv(cl->root_qdisc);
>> +	int num_cpus = ltb->num_cpus;
>> +	int i;
> 
> Likewise.
> 
>> +static inline void ltb_fanout(struct ltb_sched *ltb)
>> +{
> 
> No inline please.
> 
>> +/* How many classes within the same group want more bandwidth */
>> +static inline int bw_class_want_more_count(struct list_head *head)
>> +{
>> +	int n = 0;
>> +	struct ltb_class *cl;
> 
> No inline, and reverse christmas tree ordering for local variables.
> 
>> +/* Redistribute bandwidth among classes with the same priority */
>> +static int bw_redistribute_prio(struct list_head *lhead, int bw_available,
>> +				int n, bool *all_reached_ceil)
>> +{
>> +	struct ltb_class *cl;
>> +	int avg = 0;
>> +	int orig_bw_allocated;
>> +	int safe_loop = 0;
>> +
> 
> Likewise.
> 
>> +static int bw_redistribute(struct ltb_sched *ltb, int bw_available)
>> +{
>> +	int prio = 0;
>> +	int n;
>> +	int highest_non_saturated_prio = TC_LTB_NUMPRIO;
>> +	bool all_reached_ceil;
> 
> Likewise.
> 
>> +static void bw_balance(struct ltb_sched *ltb)
>> +{
>> +	struct ltb_class *cl;
>> +	s64 link_speed = ltb->link_speed;
>> +	int bw_available = link_speed;
>> +	s64 total = 0;
>> +	int high = TC_LTB_NUMPRIO;
>> +	int is_light_traffic = 1;
>> +	int i;
> 
> Likewise.
> 
> And so on and so forth.  This code needs a lot of style fixes
> and removal of the per-qdisc kthread design.
> 
> Thank you.
> 


* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
From: kernel test robot @ 2020-07-07 15:42 UTC (permalink / raw)
  To: YU, Xiangning, netdev; +Cc: kbuild-all, clang-built-linux

[-- Attachment #1: Type: text/plain, Size: 3969 bytes --]

Hi Xiangning,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/YU-Xiangning/Lockless-Token-Bucket-LTB-Qdisc/20200707-020918
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 5bd6ff0c6fe6c138aebe8d3bf7163420386c7ca7
config: x86_64-allyesconfig (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 02946de3802d3bc65bc9f2eb9b8d4969b5a7add8)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> net/sched/sch_ltb.c:832:6: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
           if (!opt)
               ^~~~
   net/sched/sch_ltb.c:882:9: note: uninitialized use occurs here
           return err;
                  ^~~
   net/sched/sch_ltb.c:832:2: note: remove the 'if' if its condition is always false
           if (!opt)
           ^~~~~~~~~
   net/sched/sch_ltb.c:830:9: note: initialize the variable 'err' to silence this warning
           int err;
                  ^
                   = 0
   1 warning generated.

vim +832 net/sched/sch_ltb.c

   817	
   818	static int ltb_change_class(struct Qdisc *sch, u32 classid,
   819				    u32 parentid, struct nlattr **tca,
   820				    unsigned long *arg, struct netlink_ext_ack *extack)
   821	{
   822		struct ltb_sched *ltb  = qdisc_priv(sch);
   823		struct ltb_class *cl = (struct ltb_class *)*arg, *parent;
   824		struct nlattr *opt = tca[TCA_OPTIONS];
   825		struct nlattr *tb[TCA_LTB_MAX + 1];
   826		struct tc_ltb_opt *lopt;
   827		u64 rate64, ceil64;
   828		struct psched_ratecfg ratecfg, ceilcfg;
   829		u32 prio;
   830		int err;
   831	
 > 832		if (!opt)
   833			goto failure;
   834	
   835		err = nla_parse_nested_deprecated(tb, TCA_LTB_MAX, opt, ltb_policy,
   836						  NULL);
   837		if (err < 0)
   838			goto failure;
   839	
   840		err = -EINVAL;
   841		if (!tb[TCA_LTB_PARMS])
   842			goto failure;
   843	
   844		parent = parentid == TC_H_ROOT ? NULL : ltb_find_class(sch, parentid);
   845	
   846		lopt = nla_data(tb[TCA_LTB_PARMS]);
   847		if (!lopt->rate.rate || !lopt->ceil.rate)
   848			goto failure;
   849	
   850		rate64 = tb[TCA_LTB_RATE64] ? nla_get_u64(tb[TCA_LTB_RATE64]) : 0;
   851		ceil64 = tb[TCA_LTB_CEIL64] ? nla_get_u64(tb[TCA_LTB_CEIL64]) : 0;
   852		if (rate64 > ceil64)
   853			goto failure;
   854	
   855		psched_ratecfg_precompute(&ratecfg, &lopt->rate, rate64);
   856		psched_ratecfg_precompute(&ceilcfg, &lopt->ceil, ceil64);
   857		prio = lopt->prio;
   858		if (prio >= TC_LTB_NUMPRIO)
   859			prio = TC_LTB_NUMPRIO - 1;
   860	
   861		if (!cl) {
   862			if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
   863			    ltb_find_class(sch, classid)) {
   864				err = -EINVAL;
   865				goto failure;
   866			}
   867			cl = ltb_alloc_class(sch, parent, classid, &ratecfg, &ceilcfg,
   868					     prio);
   869			if (!cl) {
   870				err = -ENOBUFS;
   871				goto failure;
   872			}
   873		} else {
   874			/* Modify existing class */
   875			ltb_modify_class(sch, cl, &ratecfg, &ceilcfg, prio);
   876		}
   877		qdisc_class_hash_grow(sch, &ltb->clhash);
   878		*arg = (unsigned long)cl;
   879		return 0;
   880	
   881	failure:
   882		return err;
   883	}
   884	

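A minimal fix sketch (assuming the early bailout should report -EINVAL
rather than an uninitialized value):

	int err = -EINVAL;

	if (!opt)
		goto failure;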
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 75320 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-06 20:34   ` YU, Xiangning
@ 2020-07-07 20:06     ` Cong Wang
  2020-07-07 21:24       ` YU, Xiangning
  0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-07 20:06 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Mon, Jul 6, 2020 at 1:34 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
>
>
> On 7/6/20 1:10 PM, Cong Wang wrote:
> > On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
> > <xiangning.yu@alibaba-inc.com> wrote:
> >> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
> >> +                      struct sk_buff **to_free)
> >> +{
> >> +       struct ltb_sched *ltb = qdisc_priv(sch);
> >> +       struct ltb_pcpu_sched *pcpu_q;
> >> +       struct ltb_class *cl;
> >> +       struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
> >> +       int cpu;
> >> +
> >> +       cpu = smp_processor_id();
> >> +       pcpu_q = qdisc_priv(pcpu->qdisc);
> >> +       ltb_skb_cb(skb)->cpu = cpu;
> >> +
> >> +       cl = ltb_classify(sch, ltb, skb);
> >> +       if (unlikely(!cl)) {
> >> +               kfree_skb(skb);
> >> +               return NET_XMIT_DROP;
> >> +       }
> >> +
> >> +       pcpu->active = true;
> >> +       if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
> >> +               kfree_skb(skb);
> >> +               atomic64_inc(&cl->stat_drops);
> >> +               return NET_XMIT_DROP;
> >> +       }
> >
> >
> > How do you prevent out-of-order packets?
> >
>
> Hi Cong,
>
> That's a good question. In theory there will be out of order packets during aggregation. While keep in mind this is per-class aggregation, and it runs at a high frequency, that the chance to have any left over skbs from the same TCP flow on many CPUs is low.
>
> Also, based on real deployment experience, we haven't observed an increased out of order events even under heavy work load.
>

Yeah, but unless you always classify packets into proper flows, there
is always a chance to generate out-of-order packets here, which
means the default configuration is flawed.


> >
> >> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
> > ...
> >> +       ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
> >> +       tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
> >> +                    (unsigned long)ltb);
> >> +
> >> +       /* Bandwidth balancer, this logic can be implemented in user-land. */
> >> +       init_waitqueue_head(&ltb->bwbalancer_wq);
> >> +       ltb->bwbalancer_task =
> >> +           kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
> >> +       wake_up_process(ltb->bwbalancer_task);
> >
> > Creating a kthread for each qdisc doesn't look good. Why do you
> > need a per-qdisc kthread or even a kernel thread at all?
> >
>
> We moved the bandwidth sharing out of the critical data path, that's why we use a kernel thread to balance the current maximum bandwidth used by each class periodically.
>
> This part could be implemented at as timer. What's your suggestion?

I doubt you can use a timer, as you call rtnl_trylock() there.
Why not use a delayed work?

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-07 20:06     ` Cong Wang
@ 2020-07-07 21:24       ` YU, Xiangning
  2020-07-08 20:24         ` Cong Wang
  0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-07 21:24 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers



On 7/7/20 1:06 PM, Cong Wang wrote:
> On Mon, Jul 6, 2020 at 1:34 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>>
>>
>> On 7/6/20 1:10 PM, Cong Wang wrote:
>>> On Mon, Jul 6, 2020 at 11:11 AM YU, Xiangning
>>> <xiangning.yu@alibaba-inc.com> wrote:
>>>> +static int ltb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
>>>> +                      struct sk_buff **to_free)
>>>> +{
>>>> +       struct ltb_sched *ltb = qdisc_priv(sch);
>>>> +       struct ltb_pcpu_sched *pcpu_q;
>>>> +       struct ltb_class *cl;
>>>> +       struct ltb_pcpu_data *pcpu = this_cpu_ptr(ltb->pcpu_data);
>>>> +       int cpu;
>>>> +
>>>> +       cpu = smp_processor_id();
>>>> +       pcpu_q = qdisc_priv(pcpu->qdisc);
>>>> +       ltb_skb_cb(skb)->cpu = cpu;
>>>> +
>>>> +       cl = ltb_classify(sch, ltb, skb);
>>>> +       if (unlikely(!cl)) {
>>>> +               kfree_skb(skb);
>>>> +               return NET_XMIT_DROP;
>>>> +       }
>>>> +
>>>> +       pcpu->active = true;
>>>> +       if (unlikely(kfifo_put(&cl->aggr_queues[cpu], skb) == 0)) {
>>>> +               kfree_skb(skb);
>>>> +               atomic64_inc(&cl->stat_drops);
>>>> +               return NET_XMIT_DROP;
>>>> +       }
>>>
>>>
>>> How do you prevent out-of-order packets?
>>>
>>
>> Hi Cong,
>>
>> That's a good question. In theory there will be out of order packets during aggregation. While keep in mind this is per-class aggregation, and it runs at a high frequency, that the chance to have any left over skbs from the same TCP flow on many CPUs is low.
>>
>> Also, based on real deployment experience, we haven't observed an increased out of order events even under heavy work load.
>>
> 
> Yeah, but unless you always classify packets into proper flows, there
> is always a chance to generate out-of-order packets here, which
> means the default configuration is flawed.
>

The key is to avoid classifying packets from the same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.

Not sure what you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, so there won't be any incorrectly classified packets either.

> 
>>>
>>>> +static int ltb_init(struct Qdisc *sch, struct nlattr *opt,
>>> ...
>>>> +       ltb->default_cls = ltb->shadow_cls; /* Default hasn't been created */
>>>> +       tasklet_init(&ltb->fanout_tasklet, ltb_fanout_tasklet,
>>>> +                    (unsigned long)ltb);
>>>> +
>>>> +       /* Bandwidth balancer, this logic can be implemented in user-land. */
>>>> +       init_waitqueue_head(&ltb->bwbalancer_wq);
>>>> +       ltb->bwbalancer_task =
>>>> +           kthread_create(ltb_bw_balancer_kthread, ltb, "ltb-balancer");
>>>> +       wake_up_process(ltb->bwbalancer_task);
>>>
>>> Creating a kthread for each qdisc doesn't look good. Why do you
>>> need a per-qdisc kthread or even a kernel thread at all?
>>>
>>
>> We moved the bandwidth sharing out of the critical data path, that's why we use a kernel thread to balance the current maximum bandwidth used by each class periodically.
>>
>> This part could be implemented at as timer. What's your suggestion?
> 
> I doubt you can use a timer, as you call rtnl_trylock() there.
> Why not use a delayed work?
> 

Thank you for the suggestion. I will replace it with a delayed work and send out a new patch.
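For reference, a rough sketch of that conversion using the standard workqueue
API (the field and function names below are illustrative, not from the posted
patch):

#include <linux/kernel.h>
#include <linux/workqueue.h>

/* In struct ltb_sched, the bwbalancer_task/bwbalancer_wq pair would be
 * replaced by a delayed work item:
 *
 *	struct delayed_work bwbalancer_work;
 */

static void ltb_bw_balancer_workfn(struct work_struct *work)
{
	struct ltb_sched *ltb = container_of(to_delayed_work(work),
					     struct ltb_sched,
					     bwbalancer_work);

	/* Rebalance class bandwidth here, as the kthread loop did. */

	/* Re-arm; LTB_BALANCE_INTERVAL is an illustrative period. */
	schedule_delayed_work(&ltb->bwbalancer_work, LTB_BALANCE_INTERVAL);
}

/* ltb_init() would then do:
 *	INIT_DELAYED_WORK(&ltb->bwbalancer_work, ltb_bw_balancer_workfn);
 *	schedule_delayed_work(&ltb->bwbalancer_work, LTB_BALANCE_INTERVAL);
 *
 * and the qdisc destroy path:
 *	cancel_delayed_work_sync(&ltb->bwbalancer_work);
 */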

- Xiangning

> Thanks.
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-07 21:24       ` YU, Xiangning
@ 2020-07-08 20:24         ` Cong Wang
  2020-07-08 21:07           ` YU, Xiangning
  0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-08 20:24 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
>
> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.

By "default configuration" I mean no additional configuration on top
of qdisc creation. If you have to rely on additional TC filters to
do the classification, it could be problematic. Same for setting
skb priority, right?

Also, you use a default class, which means all unclassified packets
share the same class, and a flow that falls into this class could still
be out-of-order, right?

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-08 20:24         ` Cong Wang
@ 2020-07-08 21:07           ` YU, Xiangning
  2020-07-10  5:04             ` Cong Wang
  0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-08 21:07 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers



On 7/8/20 1:24 PM, Cong Wang wrote:
> On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
>>
>> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
> 
> By "default configuration" I mean no additional configuration on top
> of qdisc creation. If you have to rely on additional TC filters to
> do the classification, it could be problematic. Same for setting
> skb priority, right?
> 

In this patch we don't rely on other TC filters. In our use case, socket priority is set on a per-flow basis, not on a per-skb basis.

> Also, you use a default class, this means all unclassified packets
> share the same class, and a flow falls into this class could be still
> out-of-order, right?
> 

A flow will fall, and only fall, into this class. If we can keep the order within a flow, I'm not sure why we would still have this issue.
 
Thanks,
- Xiangning

> Thanks.
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-08 21:07           ` YU, Xiangning
@ 2020-07-10  5:04             ` Cong Wang
  2020-07-10  5:20               ` Cong Wang
  2020-07-10  5:48               ` YU, Xiangning
  0 siblings, 2 replies; 22+ messages in thread
From: Cong Wang @ 2020-07-10  5:04 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Wed, Jul 8, 2020 at 2:07 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
>
>
> On 7/8/20 1:24 PM, Cong Wang wrote:
> > On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
> > <xiangning.yu@alibaba-inc.com> wrote:
> >>
> >> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
> >>
> >> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
> >
> > By "default configuration" I mean no additional configuration on top
> > of qdisc creation. If you have to rely on additional TC filters to
> > do the classification, it could be problematic. Same for setting
> > skb priority, right?
> >
>
> In this patch we don't rely on other TC filters. In our use case, socket priority is set on a per-flow basis, not per-skb basis.

Your use case is not the default configuration I mentioned.

>
> > Also, you use a default class, this means all unclassified packets
> > share the same class, and a flow falls into this class could be still
> > out-of-order, right?
> >
>
> A flow will fall and only fall to this class. If we can keep the order within a flow, I'm not sure why we still have this issue?

The issue here is obvious: you have to rely on either TC filters or
whatever sets skb priority to make packets in a flow in-order.

IOW, without these *additional* efforts, it is broken in terms of
out-of-order.

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  5:04             ` Cong Wang
@ 2020-07-10  5:20               ` Cong Wang
  2020-07-10  6:06                 ` YU, Xiangning
  2020-07-10  5:48               ` YU, Xiangning
  1 sibling, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-10  5:20 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> IOW, without these *additional* efforts, it is broken in terms of
> out-of-order.
>

Take a look at fq_codel: it provides a hash function for flow classification,
fq_codel_hash(), by default, so its default configuration does not
have such issues. So, you probably want to provide such a hash
function too instead of a default class.
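For illustration, something along those lines could look like the sketch
below (the names are made up, not from the posted patch); ltb_classify()
would fall back to one of a small set of default per-flow buckets instead
of a single default class:

/* Illustrative sketch only, modeled on fq_codel_hash(). */
#define LTB_DEFAULT_FLOWS	1024	/* made-up bucket count */

static u32 ltb_default_hash(struct sk_buff *skb)
{
	/* skb_get_hash() is a flow (5-tuple) hash, so every packet of a
	 * given flow maps to the same bucket, no matter which CPU sends it.
	 */
	return reciprocal_scale(skb_get_hash(skb), LTB_DEFAULT_FLOWS);
}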

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  5:04             ` Cong Wang
  2020-07-10  5:20               ` Cong Wang
@ 2020-07-10  5:48               ` YU, Xiangning
  2020-07-10  6:09                 ` Cong Wang
  1 sibling, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10  5:48 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers



On 7/9/20 10:04 PM, Cong Wang wrote:
> On Wed, Jul 8, 2020 at 2:07 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>>
>>
>> On 7/8/20 1:24 PM, Cong Wang wrote:
>>> On Tue, Jul 7, 2020 at 2:24 PM YU, Xiangning
>>> <xiangning.yu@alibaba-inc.com> wrote:
>>>>
>>>> The key is to avoid classifying packets from a same flow into different classes. So we use socket priority to classify packets. It's always going to be correctly classified.
>>>>
>>>> Not sure what do you mean by default configuration. But we create a shadow class when the qdisc is created. Before any other classes are created, all packets from any flow will be classified to this same shadow class, there won't be any incorrect classified packets either.
>>>
>>> By "default configuration" I mean no additional configuration on top
>>> of qdisc creation. If you have to rely on additional TC filters to
>>> do the classification, it could be problematic. Same for setting
>>> skb priority, right?
>>>
>>
>> In this patch we don't rely on other TC filters. In our use case, socket priority is set on a per-flow basis, not per-skb basis.
> 
> Your use case is not the default configuration I mentioned.
> 
>>
>>> Also, you use a default class, this means all unclassified packets
>>> share the same class, and a flow falls into this class could be still
>>> out-of-order, right?
>>>
>>
>> A flow will fall and only fall to this class. If we can keep the order within a flow, I'm not sure why we still have this issue?
> 
> The issue here is obvious: you have to rely on either TC filters or
> whatever sets skb priority to make packets in a flow in-order.
> 
> IOW, without these *additional* efforts, it is broken in terms of
> out-of-order.
> 

Well, we do ask packets from a flow to be classified into a single class, not multiple ones. It doesn't have to be socket priority; it could be a five-tuple hash, or even the container classid.

I think it's ok to have this requirement, even if we use htb, I would suggest the same. Why do you think this is a problem?

Thanks,
- Xiangning

> Thanks.
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  5:20               ` Cong Wang
@ 2020-07-10  6:06                 ` YU, Xiangning
  2020-07-10  6:21                   ` Cong Wang
  0 siblings, 1 reply; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10  6:06 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers


On 7/9/20 10:20 PM, Cong Wang wrote:
> On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> IOW, without these *additional* efforts, it is broken in terms of
>> out-of-order.
>>
> 
> Take a look at fq_codel, it provides a hash function for flow classification,
> fq_codel_hash(), as default, thus its default configuration does not
> have such issues. So, you probably want to provide such a hash
> function too instead of a default class.
> 
If I understand this code correctly, this socket hash value identifies a flow. Essentially it serves the same purpose as socket priority. In this patch, we use a similar classification method like htb, but without filters.

We could provide a hash function, but I'm a bit confused about the problem we are trying to solve.

Thanks,
- Xiangning


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  5:48               ` YU, Xiangning
@ 2020-07-10  6:09                 ` Cong Wang
  2020-07-10 17:03                   ` YU, Xiangning
  0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-10  6:09 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Thu, Jul 9, 2020 at 10:49 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
> Well, we do ask packets from a flow to be classified to a single class, not multiple ones. It doesn't have to be socket priority, it could be five tuple hash, or even container classid.

I don't see how that is the case in your code; without skb priority your code
simply falls back to the default class:

+       /* Allow to select a class by setting skb->priority */
+       if (likely(skb->priority != 0)) {
+               cl = ltb_find_class(sch, skb->priority);
+               if (cl)
+                       return cl;
+       }
+       return rcu_dereference_bh(ltb->default_cls);

Mind to be more specific here?

BTW, your qdisc does not even support TC filters, does it?
At least I don't see that tcf_classify() is called.


>
> I think it's ok to have this requirement, even if we use htb, I would suggest the same. Why do you think this is a problem?

Because HTB does not have a per-cpu queue for each class, while
yours does (cl->aggr_queues[cpu]), if your point here is why we
don't blame HTB.

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  6:06                 ` YU, Xiangning
@ 2020-07-10  6:21                   ` Cong Wang
  2020-07-10 18:01                     ` YU, Xiangning
  0 siblings, 1 reply; 22+ messages in thread
From: Cong Wang @ 2020-07-10  6:21 UTC (permalink / raw)
  To: YU, Xiangning; +Cc: Linux Kernel Network Developers

On Thu, Jul 9, 2020 at 11:07 PM YU, Xiangning
<xiangning.yu@alibaba-inc.com> wrote:
>
>
> On 7/9/20 10:20 PM, Cong Wang wrote:
> > On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >> IOW, without these *additional* efforts, it is broken in terms of
> >> out-of-order.
> >>
> >
> > Take a look at fq_codel, it provides a hash function for flow classification,
> > fq_codel_hash(), as default, thus its default configuration does not
> > have such issues. So, you probably want to provide such a hash
> > function too instead of a default class.
> >
> If I understand this code correctly, this socket hash value identifies a flow. Essentially it serves the same purpose as socket priority. In this patch, we use a similar classification method like htb, but without filters.

How is it any similar to HTB? HTB does not have a per-cpu queue
for each class. This is a huge difference.

>
> We could provide a hash function, but I'm a bit confused about the problem we are trying to solve.

Probably more than that, you need to ensure the packets in the same flow
are queued on the same queue.

Let's say you have two packets P1 and P2 from the same flow (P1 is before P2),
you can classify them into the same class of course, but with per-cpu queues
they can be sent out in a wrong order too:

send(P1) on CPU1 -> classify() returns default class -> P1 is queued on
the CPU1 queue of default class

(Now process is migrated to CPU2)

send(P2) on CPU2 -> classify() returns default class -> P2 is queued on
the CPU2 queue of default class

P2 is dequeued on CPU2 before P1 is dequeued on CPU1.

Now, out of order. :)

Hope it is clear now.

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  6:09                 ` Cong Wang
@ 2020-07-10 17:03                   ` YU, Xiangning
  0 siblings, 0 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10 17:03 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers



On 7/9/20 11:09 PM, Cong Wang wrote:
> On Thu, Jul 9, 2020 at 10:49 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>> Well, we do ask packets from a flow to be classified to a single class, not multiple ones. It doesn't have to be socket priority, it could be five tuple hash, or even container classid.
> 
> I don't see how it is so in your code, without skb priority your code
> simply falls back to default class:
> 
> +       /* Allow to select a class by setting skb->priority */
> +       if (likely(skb->priority != 0)) {
> +               cl = ltb_find_class(sch, skb->priority);
> +               if (cl)
> +                       return cl;
> +       }
> +       return rcu_dereference_bh(ltb->default_cls);
> 
> Mind to be more specific here?
> 

The application will call setsockopt() to set the priority. If no match is found, it will use the default class.

In real deployments we have another classifier. Please feel free to suggest what the best choice for it would be.
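For example, a minimal userspace sketch of the setsockopt() approach mentioned
above (assuming the LTB qdisc was created with handle 1: and a class 1:10
exists):

#include <stdio.h>
#include <sys/socket.h>

/* skb->priority must carry the full TC classid (major << 16 | minor),
 * which is what ltb_find_class() looks up.  Priorities outside 0-6
 * normally require CAP_NET_ADMIN.
 */
static int pin_socket_to_class(int fd)
{
	int prio = (1 << 16) | 0x10;	/* classid 1:10 */

	if (setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)) < 0) {
		perror("setsockopt(SO_PRIORITY)");
		return -1;
	}
	return 0;
}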

> BTW, your qdisc does not even support TC filters, does it?
> At least I don't see that tcf_classify() is called.
>

You are correct that we don't use generic tc filters. I leave it unsupported in this patch too.
 
> 
>>
>> I think it's ok to have this requirement, even if we use htb, I would suggest the same. Why do you think this is a problem?
> 
> Because HTB does not have a per-cpu queue for each class,
> yours does, cl->aggr_queues[cpu], if your point here is why we
> don't blame HTB.
>

My point is: it's OK to ask a flow to be classified into a single class.

Thanks,
- Xiangning

> Thanks.
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc
  2020-07-10  6:21                   ` Cong Wang
@ 2020-07-10 18:01                     ` YU, Xiangning
  0 siblings, 0 replies; 22+ messages in thread
From: YU, Xiangning @ 2020-07-10 18:01 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers



On 7/9/20 11:21 PM, Cong Wang wrote:
> On Thu, Jul 9, 2020 at 11:07 PM YU, Xiangning
> <xiangning.yu@alibaba-inc.com> wrote:
>>
>>
>> On 7/9/20 10:20 PM, Cong Wang wrote:
>>> On Thu, Jul 9, 2020 at 10:04 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>> IOW, without these *additional* efforts, it is broken in terms of
>>>> out-of-order.
>>>>
>>>
>>> Take a look at fq_codel, it provides a hash function for flow classification,
>>> fq_codel_hash(), as default, thus its default configuration does not
>>> have such issues. So, you probably want to provide such a hash
>>> function too instead of a default class.
>>>
>> If I understand this code correctly, this socket hash value identifies a flow. Essentially it serves the same purpose as socket priority. In this patch, we use a similar classification method like htb, but without filters.
> 
> How is it any similar to HTB? HTB does not have a per-cpu queue
> for each class. This is a huge difference.

I said 'similar classification method like htb'. Not similar to HTB in overall design. :)
 
> 
>>
>> We could provide a hash function, but I'm a bit confused about the problem we are trying to solve.
> 
> Probably more than that, you need to ensure the packets in a same flow
> are queued on the same queue.
> 
> Let say you have two packets P1 and P2 from the same flow (P1 is before P2),
> you can classify them into the same class of course, but with per-cpu queues
> they can be sent out in a wrong order too:
> 
> send(P1) on CPU1 -> classify() returns default class -> P1 is queued on
> the CPU1 queue of default class
> 
> (Now process is migrated to CPU2)
> 
> send(P2) on CPU2 -> classify() returns default class -> P2 is queued on
> the CPU2 queue of default class
> 
> P2 is dequeued on CPU2 before P1 dequeued on CPU1.
> 
> Now, out of order. :)
> 
> Hope it is clear now.

The assumption is that the packet scheduler is faster than thread migration. If it constantly takes that long to send one packet, we need to fix it.

Under light load, CPU1 is free enough to immediately trigger aggregation. Under heavy load, some other CPU could also help trigger aggregation in between.

As I responded in my first email, this is possible in theory. We are running a much older kernel version, and the normal case is to have multiple flows/threads running in a large class; so far we haven't seen big trouble with this approach.

But it's been years; if the above assumption changes, we do need to rethink it.

Thanks,
- Xiangning
> 
> Thanks.
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-07-10 18:01 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-06 18:08 [PATCH net-next 2/2] net: sched: Lockless Token Bucket (LTB) Qdisc YU, Xiangning
2020-07-06 18:32 ` Stephen Hemminger
2020-07-06 18:37 ` Stephen Hemminger
2020-07-06 19:56   ` YU, Xiangning
2020-07-06 20:10 ` Cong Wang
2020-07-06 20:34   ` YU, Xiangning
2020-07-07 20:06     ` Cong Wang
2020-07-07 21:24       ` YU, Xiangning
2020-07-08 20:24         ` Cong Wang
2020-07-08 21:07           ` YU, Xiangning
2020-07-10  5:04             ` Cong Wang
2020-07-10  5:20               ` Cong Wang
2020-07-10  6:06                 ` YU, Xiangning
2020-07-10  6:21                   ` Cong Wang
2020-07-10 18:01                     ` YU, Xiangning
2020-07-10  5:48               ` YU, Xiangning
2020-07-10  6:09                 ` Cong Wang
2020-07-10 17:03                   ` YU, Xiangning
2020-07-06 20:29 ` David Miller
2020-07-06 22:08   ` YU, Xiangning
2020-07-07 15:42 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.