* [PATCH net-next] net:sched: add gkprio scheduler
@ 2018-05-07  9:36 Nishanth Devarajan
  2018-05-08  5:24 ` Cong Wang
  0 siblings, 1 reply; 16+ messages in thread
From: Nishanth Devarajan @ 2018-05-07  9:36 UTC (permalink / raw)
  To: xiyou.wangcong, jiri, jhs, davem; +Cc: netdev, doucette, michel

net/sched: add gkprio scheduler

Gkprio (Gatekeeper Priority Queue) is a queueing discipline that prioritizes
IPv4 and IPv6 packets according to their DSCP field. Although Gkprio can be
employed in any QoS scenario in which a higher DSCP value means a higher
priority packet, Gkprio was conceived as a solution for denial-of-service
defenses that need to route packets with different priorities.

Signed-off-by: Nishanth Devarajan <ndev2021@gmail.com>
Reviewed-by: Cody Doucette <doucette@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Reviewed-by: Sachin Paryani <sachin.paryani@gmail.com>
---
 include/uapi/linux/pkt_sched.h |  11 ++
 net/sched/Kconfig              |  13 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_gkprio.c         | 316 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 341 insertions(+)
 create mode 100644 net/sched/sch_gkprio.c

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096..de8b5ca 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -124,6 +124,17 @@ struct tc_fifo_qopt {
 	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
 };
 
+/* GKPRIO section */
+
+struct tc_gkprio_qopt {
+	__u32	limit; 	    	/* Queue length in packets. */
+	__u16	noip_dfltp; 	/* Default priority for non-IP packets. */
+
+	/* Stats. */
+	__u16 highest_prio; 	/* Highest priority currently in queue.  */
+	__u16 lowest_prio;  	/* Lowest priority currently in queue. */
+};
+
 /* PRIO section */
 
 #define TCQ_PRIO_BANDS	16
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169f..9c47857 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -240,6 +240,19 @@ config NET_SCH_MQPRIO
 
 	  If unsure, say N.
 
+config NET_SCH_GKPRIO
+	tristate "Gatekeeper priority queue scheduler (GKPRIO)"
+	help
+	  Say Y here if you want to use the Gatekeeper priority queue
+	  scheduler. This schedules packets according to priorities based on
+	  the DSCP (IPv4) and DS (IPv6) fields, which is useful for request
+	  packets in DoS mitigation systems such as Gatekeeper.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_gkprio.
+
+	  If unsure, say N.
+
 config NET_SCH_CHOKE
 	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d38..93a1fdb 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
+obj-$(CONFIG_NET_SCH_GKPRIO)	+= sch_gkprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
diff --git a/net/sched/sch_gkprio.c b/net/sched/sch_gkprio.c
new file mode 100644
index 0000000..ad1227c
--- /dev/null
+++ b/net/sched/sch_gkprio.c
@@ -0,0 +1,316 @@
+/*
+ * net/sched/sch_gkprio.c  Gatekeeper Priority Queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nishanth Devarajan, <ndev_2021@gmail.com>
+ *	        original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+/* Packets are assigned priorities [0, 63] due to the IP DSCP field limits. */
+#define GKPRIO_MAX_PRIORITY 64
+
+/*	  Gatekeeper Priority Queue
+ *	=================================
+ *
+ * This qdisc schedules a packet according to the value (0-63) of its DSCP
+ * (IPv4) or DS (IPv6) field, where a higher value places the packet closer
+ * to the exit of the queue. Non-IP packets are assigned a default priority,
+ * which can be specified as a GKPRIO option; if none is specified, the
+ * default priority is 0. When the queue is full, the lowest priority packet
+ * in the queue is dropped to make room for a higher priority incoming packet.
+ * If the packet to be added has lower priority than all packets in the queue,
+ * it is dropped.
+ *
+ * Without the Gatekeeper priority queue, queue length limits must be imposed
+ * for individual queues, and there is no easy way to enforce a global queue
+ * length limit across all priorities. With the Gatekeeper queue, a global
+ * queue length limit can be enforced while not restricting the queue lengths
+ * of individual priorities.
+ *
+ * This is especially useful for a denial-of-service defense system such as
+ * Gatekeeper, which prioritizes packets in flows that demonstrate expected
+ * behavior of legitimate users. The queue is flexible to allow any number
+ * of packets of any priority up to the global limit of the scheduler
+ * without risking resource overconsumption by a flood of low priority packets.
+ *
+ * The Gatekeeper standalone codebase is found here:
+ *
+ *		https://github.com/AltraMayor/gatekeeper
+ */
+
+struct gkprio_sched_data {
+	/* Parameters. */
+	u32 max_limit;
+	u16 noip_dfltp;
+
+	/* Queue state. */
+	struct sk_buff_head qdiscs[GKPRIO_MAX_PRIORITY];
+	u16 highest_prio;
+	u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct gkprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* GK queue is empty, return 0 (default highest priority setting). */
+	return 0;
+}
+
+static u16 calc_new_low_prio(const struct gkprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* GK queue is empty, return GKPRIO_MAX_PRIORITY - 1
+	 * (default lowest priority setting).
+	 */
+	return GKPRIO_MAX_PRIORITY - 1;
+}
+
+static int gkprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *qdisc;
+	struct sk_buff_head *lp_qdisc;
+	struct sk_buff *to_drop;
+	int wlen;
+	u16 prio, lp;
+
+	/* Obtain the priority of @skb. */
+	wlen = skb_network_offset(skb);
+	switch (tc_skb_protocol(skb)) {
+	case htons(ETH_P_IP):
+		wlen += sizeof(struct iphdr);
+		if (!pskb_may_pull(skb, wlen))
+			goto drop;
+		prio = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+		break;
+
+	case htons(ETH_P_IPV6):
+		wlen += sizeof(struct ipv6hdr);
+		if (!pskb_may_pull(skb, wlen))
+			goto drop;
+		prio = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+		break;
+
+	default:
+		prio = q->noip_dfltp;
+		break;
+	}
+
+	qdisc = &q->qdiscs[prio];
+
+	if (sch->q.qlen < q->max_limit) {
+		__skb_queue_tail(qdisc, skb);
+		qdisc_qstats_backlog_inc(sch, skb);
+
+		/* Check to update highest and lowest priorities. */
+		if (prio > q->highest_prio)
+			q->highest_prio = prio;
+
+		if (prio < q->lowest_prio)
+			q->lowest_prio = prio;
+
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* If this packet has the lowest priority, drop it. */
+	lp = q->lowest_prio;
+	if (prio <= lp)
+		return qdisc_drop(skb, sch, to_free);
+
+	/* Drop the packet at the tail of the lowest priority qdisc. */
+	lp_qdisc = &q->qdiscs[lp];
+	to_drop = __skb_dequeue_tail(lp_qdisc);
+	BUG_ON(!to_drop);
+	qdisc_qstats_backlog_dec(sch, to_drop);
+	qdisc_drop(to_drop, sch, to_free);
+
+	__skb_queue_tail(qdisc, skb);
+	qdisc_qstats_backlog_inc(sch, skb);
+
+	/* Check to update highest and lowest priorities. */
+	if (skb_queue_empty(lp_qdisc)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->lowest_prio = prio;
+			q->highest_prio = prio;
+		} else {
+			q->lowest_prio = calc_new_low_prio(q);
+		}
+	}
+
+	if (prio > q->highest_prio)
+		q->highest_prio = prio;
+
+	return NET_XMIT_SUCCESS;
+drop:
+	qdisc_drop(skb, sch, to_free);
+	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+static struct sk_buff *gkprio_dequeue(struct Qdisc *sch)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+	struct sk_buff *skb = __skb_dequeue(hpq);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	sch->q.qlen--;
+	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_bstats_update(sch, skb);
+
+	/* Update highest priority field. */
+	if (skb_queue_empty(hpq)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->highest_prio = 0;
+			q->lowest_prio = GKPRIO_MAX_PRIORITY - 1;
+		} else {
+			q->highest_prio = calc_new_high_prio(q);
+		}
+	}
+	return skb;
+}
+
+static int gkprio_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct tc_gkprio_qopt *ctl = nla_data(opt);
+	unsigned int min_limit = 1;
+
+	if (ctl->limit == (typeof(ctl->limit))-1)
+		q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
+	else if (ctl->limit < 1 || ctl->limit > qdisc_dev(sch)->tx_queue_len)
+		return -EINVAL;
+	else
+		q->max_limit = ctl->limit;
+
+	if (ctl->noip_dfltp == (typeof(ctl->noip_dfltp))-1)
+		q->noip_dfltp = 0;
+	else if (ctl->noip_dfltp >= GKPRIO_MAX_PRIORITY)
+		return -EINVAL;
+	else
+		q->noip_dfltp = ctl->noip_dfltp;
+
+	return 0;
+}
+
+static int gkprio_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+	unsigned int min_limit = 1;
+
+	/* Initialise all queues, one for each possible priority. */
+	for (prio = 0; prio < GKPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_head_init(&q->qdiscs[prio]);
+
+	q->highest_prio = 0;
+	q->lowest_prio = GKPRIO_MAX_PRIORITY - 1;
+	if (!opt) {
+		q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
+		q->noip_dfltp = 0;
+		return 0;
+	}
+	return gkprio_change(sch, opt, extack);
+}
+
+static int gkprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct tc_gkprio_qopt opt;
+
+	opt.limit = q->max_limit;
+	opt.noip_dfltp = q->noip_dfltp;
+	opt.highest_prio = q->highest_prio;
+	opt.lowest_prio = q->lowest_prio;
+
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		return -1;
+
+	return skb->len;
+}
+
+static void gkprio_reset(struct Qdisc *sch)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	for (prio = 0; prio < GKPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+	q->highest_prio = 0;
+	q->lowest_prio = GKPRIO_MAX_PRIORITY - 1;
+}
+
+static void gkprio_destroy(struct Qdisc *sch)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < GKPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+}
+
+struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
+	.id		=	"gkprio",
+	.priv_size	=	sizeof(struct gkprio_sched_data),
+	.enqueue	=	gkprio_enqueue,
+	.dequeue	=	gkprio_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	gkprio_init,
+	.reset		=	gkprio_reset,
+	.change		=	gkprio_change,
+	.dump		=	gkprio_dump,
+	.destroy	=	gkprio_destroy,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init gkprio_module_init(void)
+{
+	return register_qdisc(&gkprio_qdisc_ops);
+}
+
+static void __exit gkprio_module_exit(void)
+{
+	unregister_qdisc(&gkprio_qdisc_ops);
+}
+
+module_init(gkprio_module_init)
+module_exit(gkprio_module_exit)
+
+MODULE_LICENSE("GPL");
-- 
1.9.1


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-07  9:36 [PATCH net-next] net:sched: add gkprio scheduler Nishanth Devarajan
@ 2018-05-08  5:24 ` Cong Wang
  2018-05-08 10:12   ` Nishanth Devarajan
  0 siblings, 1 reply; 16+ messages in thread
From: Cong Wang @ 2018-05-08  5:24 UTC (permalink / raw)
  To: Nishanth Devarajan
  Cc: Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers, doucette, michel

On Mon, May 7, 2018 at 2:36 AM, Nishanth Devarajan <ndev2021@gmail.com> wrote:
> net/sched: add gkprio scheduler
>
> Gkprio (Gatekeeper Priority Queue) is a queueing discipline that prioritizes
> IPv4 and IPv6 packets according to their DSCP field. Although Gkprio can be
> employed in any QoS scenario in which a higher DSCP value means a higher
> priority packet, Gkprio was conceived as a solution for denial-of-service
> defenses that need to route packets with different priorities.


Can we give it a better name? "Gatekeeper" is meaningless if we read
it alone, it ties to your Gatekeeper project which is more than just this
kernel module. Maybe "DS Priority Queue"?

Overall it looks good to me, just one thing below:

> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
> +       .id             =       "gkprio",
> +       .priv_size      =       sizeof(struct gkprio_sched_data),
> +       .enqueue        =       gkprio_enqueue,
> +       .dequeue        =       gkprio_dequeue,
> +       .peek           =       qdisc_peek_dequeued,
> +       .init           =       gkprio_init,
> +       .reset          =       gkprio_reset,
> +       .change         =       gkprio_change,
> +       .dump           =       gkprio_dump,
> +       .destroy        =       gkprio_destroy,
> +       .owner          =       THIS_MODULE,
> +};

You probably want to add Qdisc_class_ops here so that you can
dump the stats of each internal queue.
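
For illustration only - a rough, untested sketch with placeholder names -
a classless-style class-ops table (exposing the internal priorities as
classes) could look something like:

static const struct Qdisc_class_ops gkprio_class_ops = {
	.leaf		=	gkprio_leaf,		/* can simply return NULL */
	.find		=	gkprio_find,		/* can be trivial */
	.walk		=	gkprio_walk,		/* visit each internal queue */
	.dump		=	gkprio_dump_class,
	.dump_stats	=	gkprio_dump_class_stats,
};

wired up with .cl_ops = &gkprio_class_ops in gkprio_qdisc_ops.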


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-08  5:24 ` Cong Wang
@ 2018-05-08 10:12   ` Nishanth Devarajan
  2018-05-08 12:59     ` Michel Machado
  0 siblings, 1 reply; 16+ messages in thread
From: Nishanth Devarajan @ 2018-05-08 10:12 UTC (permalink / raw)
  To: Cong Wang; +Cc: jiri, jhs, davem, netdev, doucette, michel

On Mon, May 07, 2018 at 10:24:51PM -0700, Cong Wang wrote:
> On Mon, May 7, 2018 at 2:36 AM, Nishanth Devarajan <ndev2021@gmail.com> wrote:
> > net/sched: add gkprio scheduler
> >
> > Gkprio (Gatekeeper Priority Queue) is a queueing discipline that prioritizes
> > IPv4 and IPv6 packets according to their DSCP field. Although Gkprio can be
> > employed in any QoS scenario in which a higher DSCP value means a higher
> > priority packet, Gkprio was conceived as a solution for denial-of-service
> > defenses that need to route packets with different priorities.
> 
> 
> Can we give it a better name? "Gatekeeper" is meaningless if we read
> it alone, it ties to your Gatekeeper project which is more than just this
> kernel module. Maybe "DS Priority Queue"?
> 

Yes, we should be able to come up with a better name, we'll work on it.

> Overall it looks good to me, just one thing below:
> 
> > +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
> > +       .id             =       "gkprio",
> > +       .priv_size      =       sizeof(struct gkprio_sched_data),
> > +       .enqueue        =       gkprio_enqueue,
> > +       .dequeue        =       gkprio_dequeue,
> > +       .peek           =       qdisc_peek_dequeued,
> > +       .init           =       gkprio_init,
> > +       .reset          =       gkprio_reset,
> > +       .change         =       gkprio_change,
> > +       .dump           =       gkprio_dump,
> > +       .destroy        =       gkprio_destroy,
> > +       .owner          =       THIS_MODULE,
> > +};
> 
> You probably want to add Qdisc_class_ops here so that you can
> dump the stats of each internal queue.

Alright, will make some changes and send in a v2.

Thanks,
Nishanth


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-08 10:12   ` Nishanth Devarajan
@ 2018-05-08 12:59     ` Michel Machado
  2018-05-08 13:29       ` Jamal Hadi Salim
  2018-05-09  2:24       ` Cong Wang
  0 siblings, 2 replies; 16+ messages in thread
From: Michel Machado @ 2018-05-08 12:59 UTC (permalink / raw)
  To: Nishanth Devarajan, Cong Wang; +Cc: jiri, jhs, davem, netdev, doucette

>> Overall it looks good to me, just one thing below:
>>
>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>> +       .id             =       "gkprio",
>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>> +       .enqueue        =       gkprio_enqueue,
>>> +       .dequeue        =       gkprio_dequeue,
>>> +       .peek           =       qdisc_peek_dequeued,
>>> +       .init           =       gkprio_init,
>>> +       .reset          =       gkprio_reset,
>>> +       .change         =       gkprio_change,
>>> +       .dump           =       gkprio_dump,
>>> +       .destroy        =       gkprio_destroy,
>>> +       .owner          =       THIS_MODULE,
>>> +};
>>
>> You probably want to add Qdisc_class_ops here so that you can
>> dump the stats of each internal queue.

Hi Cong,

    In the production scenario we are targeting, this priority queue 
must be classless; being classful would only bloat the code for us. I 
don't see making this queue classful as a problem per se, but I suggest 
leaving it as a future improvement for when someone can come up with a 
useful scenario for it.

[ ]'s
Michel Machado


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-08 12:59     ` Michel Machado
@ 2018-05-08 13:29       ` Jamal Hadi Salim
  2018-05-08 14:56         ` Michel Machado
  2018-05-09  2:27         ` Cong Wang
  2018-05-09  2:24       ` Cong Wang
  1 sibling, 2 replies; 16+ messages in thread
From: Jamal Hadi Salim @ 2018-05-08 13:29 UTC (permalink / raw)
  To: Michel Machado, Nishanth Devarajan, Cong Wang
  Cc: jiri, davem, netdev, doucette

On 08/05/18 08:59 AM, Michel Machado wrote:
>>> Overall it looks good to me, just one thing below:
>>>
>>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>>> +       .id             =       "gkprio",
>>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>>> +       .enqueue        =       gkprio_enqueue,
>>>> +       .dequeue        =       gkprio_dequeue,
>>>> +       .peek           =       qdisc_peek_dequeued,
>>>> +       .init           =       gkprio_init,
>>>> +       .reset          =       gkprio_reset,
>>>> +       .change         =       gkprio_change,
>>>> +       .dump           =       gkprio_dump,
>>>> +       .destroy        =       gkprio_destroy,
>>>> +       .owner          =       THIS_MODULE,
>>>> +};
>>>
>>> You probably want to add Qdisc_class_ops here so that you can
>>> dump the stats of each internal queue.
> 
> Hi Cong,
> 
>     In the production scenario we are targeting, this priority queue 
> must be classless; being classful would only bloat the code for us. I 
> don't see making this queue classful as a problem per se, but I suggest 
> leaving it as a future improvement for when someone can come up with a 
> useful scenario for it.

I am actually struggling with this whole thing.
Have you considered using skb->prio instead of peeking into the packet
header.
Also have you looked at the dsmark qdisc?

cheers,
jamal


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-08 13:29       ` Jamal Hadi Salim
@ 2018-05-08 14:56         ` Michel Machado
  2018-05-09  2:27         ` Cong Wang
  1 sibling, 0 replies; 16+ messages in thread
From: Michel Machado @ 2018-05-08 14:56 UTC (permalink / raw)
  To: Jamal Hadi Salim, Nishanth Devarajan, Cong Wang
  Cc: jiri, davem, netdev, doucette

On 05/08/2018 09:29 AM, Jamal Hadi Salim wrote:
> On 08/05/18 08:59 AM, Michel Machado wrote:
>>>> Overall it looks good to me, just one thing below:
>>>>
>>>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>>>> +       .id             =       "gkprio",
>>>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>>>> +       .enqueue        =       gkprio_enqueue,
>>>>> +       .dequeue        =       gkprio_dequeue,
>>>>> +       .peek           =       qdisc_peek_dequeued,
>>>>> +       .init           =       gkprio_init,
>>>>> +       .reset          =       gkprio_reset,
>>>>> +       .change         =       gkprio_change,
>>>>> +       .dump           =       gkprio_dump,
>>>>> +       .destroy        =       gkprio_destroy,
>>>>> +       .owner          =       THIS_MODULE,
>>>>> +};
>>>>
>>>> You probably want to add Qdisc_class_ops here so that you can
>>>> dump the stats of each internal queue.
>>
>> Hi Cong,
>>
>>     In the production scenario we are targeting, this priority queue 
>> must be classless; being classful would only bloat the code for us. I 
>> don't see making this queue classful as a problem per se, but I 
>> suggest leaving it as a future improvement for when someone can come 
>> up with a useful scenario for it.
> 
> I am actually struggling with this whole thing.
> Have you considered using skb->prio instead of peeking into the packet
> header.
> Also have you looked at the dsmark qdisc?

As far as I know, skb->priority (skb->prio has been renamed) is unset
for packets that come from the network. DSprio, adopting Cong's name
suggestion, is most useful when "merging" packets that come from
different network interfaces.

Had we relied on DSmark to mark skb->tc_index with the DS field, we
would have forced anyone using DSprio to use DSmark. This may sound like
a good idea, but DSmark always requires writable socket buffers while
setting skb->tc_index with the DS field of the packet (see
dsmark_enqueue()), which means that the kernel may drop high priority
packets instead of low priority packets due to memory pressure.

[ ]'s
Michel Machado


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-08 12:59     ` Michel Machado
  2018-05-08 13:29       ` Jamal Hadi Salim
@ 2018-05-09  2:24       ` Cong Wang
  2018-05-09 14:09         ` Michel Machado
  1 sibling, 1 reply; 16+ messages in thread
From: Cong Wang @ 2018-05-09  2:24 UTC (permalink / raw)
  To: Michel Machado
  Cc: Nishanth Devarajan, Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On Tue, May 8, 2018 at 5:59 AM, Michel Machado <michel@digirati.com.br> wrote:
>>> Overall it looks good to me, just one thing below:
>>>
>>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>>> +       .id             =       "gkprio",
>>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>>> +       .enqueue        =       gkprio_enqueue,
>>>> +       .dequeue        =       gkprio_dequeue,
>>>> +       .peek           =       qdisc_peek_dequeued,
>>>> +       .init           =       gkprio_init,
>>>> +       .reset          =       gkprio_reset,
>>>> +       .change         =       gkprio_change,
>>>> +       .dump           =       gkprio_dump,
>>>> +       .destroy        =       gkprio_destroy,
>>>> +       .owner          =       THIS_MODULE,
>>>> +};
>>>
>>>
>>> You probably want to add Qdisc_class_ops here so that you can
>>> dump the stats of each internal queue.
>
>
> Hi Cong,
>
>    In the production scenario we are targeting, this priority queue must be
> classless; being classful would only bloat the code for us. I don't see
> making this queue classful as a problem per se, but I suggest leaving it as
> a future improvement for when someone can come up with a useful scenario for
> it.


Take a look at sch_prio, it is fairly simple since your internal
queues are just an array... Per-queue stats are quite useful
in production, we definitely want to observe which queues are
full which are not.


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-08 13:29       ` Jamal Hadi Salim
  2018-05-08 14:56         ` Michel Machado
@ 2018-05-09  2:27         ` Cong Wang
  2018-05-09 14:43           ` Jamal Hadi Salim
  1 sibling, 1 reply; 16+ messages in thread
From: Cong Wang @ 2018-05-09  2:27 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Michel Machado, Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On Tue, May 8, 2018 at 6:29 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> Have you considered using skb->prio instead of peeking into the packet
> header.
> Also have you looked at the dsmark qdisc?
>

dsmark modifies ds fields, while this one just maps ds fields into
different queues.


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-09  2:24       ` Cong Wang
@ 2018-05-09 14:09         ` Michel Machado
  2018-05-10 17:38           ` Cong Wang
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Machado @ 2018-05-09 14:09 UTC (permalink / raw)
  To: Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On 05/08/2018 10:24 PM, Cong Wang wrote:
> On Tue, May 8, 2018 at 5:59 AM, Michel Machado <michel@digirati.com.br> wrote:
>>>> Overall it looks good to me, just one thing below:
>>>>
>>>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>>>> +       .id             =       "gkprio",
>>>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>>>> +       .enqueue        =       gkprio_enqueue,
>>>>> +       .dequeue        =       gkprio_dequeue,
>>>>> +       .peek           =       qdisc_peek_dequeued,
>>>>> +       .init           =       gkprio_init,
>>>>> +       .reset          =       gkprio_reset,
>>>>> +       .change         =       gkprio_change,
>>>>> +       .dump           =       gkprio_dump,
>>>>> +       .destroy        =       gkprio_destroy,
>>>>> +       .owner          =       THIS_MODULE,
>>>>> +};
>>>>
>>>>
>>>> You probably want to add Qdisc_class_ops here so that you can
>>>> dump the stats of each internal queue.
>>
>>
>> Hi Cong,
>>
>>     In the production scenario we are targeting, this priority queue must be
>> classless; being classful would only bloat the code for us. I don't see
>> making this queue classful as a problem per se, but I suggest leaving it as
>> a future improvement for when someone can come up with a useful scenario for
>> it.
> 
> 
> Take a look at sch_prio, it is fairly simple since your internal
> queues are just an array... Per-queue stats are quite useful
> in production, we definitely want to observe which queues are
> full which are not.
> 

DSprio cannot add Qdisc_class_ops without a rewrite of other queue 
disciplines, which doesn't seem desirable. Since the method cops->leaf 
is required (see register_qdisc()), we would need to replace the array 
struct sk_buff_head qdiscs[GKPRIO_MAX_PRIORITY] in struct 
gkprio_sched_data with the array struct Qdisc 
*queues[GKPRIO_MAX_PRIORITY] to be able to return a Qdisc in 
dsprio_leaf(). The problem with this change is that Qdisc does not have 
a method to dequeue from its tail. This new method may not even make 
sense in other queue disciplines. But without this method, 
gkprio_enqueue() cannot drop the lowest priority packet when the queue 
is full and an incoming packet has higher priority.

Nevertheless, I see your point on being able to observe the distribution 
of queued packets per priority. A solution for that would be to add the 
array __u32 qlen[GKPRIO_MAX_PRIORITY] in struct tc_gkprio_qopt. This 
solution even avoids adding overhead in the critical paths of DSprio. Do 
you see a better solution?
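
Roughly (a sketch only, and assuming GKPRIO_MAX_PRIORITY is also moved into
the uapi header), the dump-only extension would be:

struct tc_gkprio_qopt {
	__u32	limit;		/* Queue length in packets. */
	__u16	noip_dfltp;	/* Default priority for non-IP packets. */

	/* Stats. */
	__u16	highest_prio;	/* Highest priority currently in queue. */
	__u16	lowest_prio;	/* Lowest priority currently in queue. */
	__u32	qlen[GKPRIO_MAX_PRIORITY]; /* Per-priority queue lengths. */
};

gkprio_dump() would then fill qlen[] with skb_queue_len() of each internal
queue before calling nla_put().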

By the way, I've used GKPRIO_MAX_PRIORITY and other names that include 
"gkprio" above to reflect the version 1 of this patch that we are 
discussing. We will rename these identifiers for version 2 of this patch 
to replace "gkprio" with "dsprio".

[ ]'s
Michel Machado


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-09  2:27         ` Cong Wang
@ 2018-05-09 14:43           ` Jamal Hadi Salim
  2018-05-09 17:37             ` Michel Machado
  0 siblings, 1 reply; 16+ messages in thread
From: Jamal Hadi Salim @ 2018-05-09 14:43 UTC (permalink / raw)
  To: Cong Wang
  Cc: Michel Machado, Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On 08/05/18 10:27 PM, Cong Wang wrote:
> On Tue, May 8, 2018 at 6:29 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>> Have you considered using skb->prio instead of peeking into the packet
>> header.
>> Also have you looked at the dsmark qdisc?
>>
> 
> dsmark modifies ds fields, while this one just maps ds fields into
> different queues.
> 

Yeah, I was thinking more of re-using it for the purpose of mapping to
queues - but would require a lot more work.

Once skbprio is set by something[1], this qdisc could be used by
other subsystems (8021q, sockets etc); so I would argue for removal
of the embedded classification and instead maybe writing a simple
extension to skbmod to mark skbprio based on ds.

I find the cleverness in changing the highest/low prios confusing.
It looks error-prone (I guess that is why there is a BUG check)
To the authors: Is there a document/paper on the theory of this thing
as to why no explicit queues are "faster"?

Some other feedback:

1) I agree that using multiple queues as in prio qdisc would make it
more manageable; does not necessarily need to be classful if you
use implicit skbprio classification, i.e. on enqueue use a priority
map to select a queue; on dequeue always dequeue from highest prio
until it has no more packets to send.

2) Dropping already enqueued packets will not work well for
local feedback (__NET_XMIT_BYPASS return code is about the
packet that has been dropped from earlier enqueueing because
it is lower priority - it does not signify anything about the
current skb, which actually just got enqueued).
Perhaps (off the top of my head) one option is to always enqueue packets of
high priority when their limit is exceeded as long as lower prio has
some space. That means you'd have to increment low prio accounting if their
space is used.

cheers,
jamal

[1] something like:
tc filter add match all ip action skbmod inheritdsfield
tc filter add match all ip6 action skbmod inheritdsfield

inheritdsfield maps ds to skb->priority


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-09 14:43           ` Jamal Hadi Salim
@ 2018-05-09 17:37             ` Michel Machado
  2018-05-12 14:48               ` Jamal Hadi Salim
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Machado @ 2018-05-09 17:37 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On 05/09/2018 10:43 AM, Jamal Hadi Salim wrote:
> On 08/05/18 10:27 PM, Cong Wang wrote:
>> On Tue, May 8, 2018 at 6:29 AM, Jamal Hadi Salim <jhs@mojatatu.com> 
>> wrote:
>>> Have you considered using skb->prio instead of peeking into the packet
>>> header.
>>> Also have you looked at the dsmark qdisc?
>>>
>>
>> dsmark modifies ds fields, while this one just maps ds fields into
>> different queues.
>>
> 
> Yeah, I was thinking more of re-using it for the purpose of mapping to
> queues - but would require a lot more work.
> 
> Once skbprio is set by something[1], this qdisc could be used by
> other subsystems (8021q, sockets etc); so I would argue for removal
> of the embedded classification and instead maybe writing a simple
> extension to skbmod to mark skbprio based on ds.

I like the suggestion of extending skbmod to mark skbprio based on ds. 
Given that DSprio would no longer depend on the DS field, would you have 
a name suggestion for this new queue discipline since the name "prio" is 
currently in use?

What should be the range of priorities that this new queue discipline 
would accept? skb->priority is of type __u32, but supporting 2^32 
priorities would require too large of an array to index packets by 
priority; the DS field is only 6 bits long. Do you have a use case in 
mind to guide us here?

> I find the cleverness in changing the highest/low prios confusing.
> It looks error-prone (I guess that is why there is a BUG check)
> To the authors: Is there a document/paper on the theory of this thing
> as to why no explicit queues are "faster"?

The priority orientation in GKprio is due to two factors: failing safe 
and elegance. If zero were the highest priority, any operational mistake 
that leads not-classified packets through GKprio would potentially 
disrupt the system. We are humans, we'll make mistakes. The elegance 
aspect comes from the fact that the assigned priority is not massaged to 
fit the DS field. We find it helpful while inspecting packets on the wire.

The reason for us to avoid explicit queues in GKprio, which could change 
the behavior within a given priority, is to closely abide to the 
expected behavior assumed to prove Theorem 4.1 in the paper "Portcullis: 
Protecting Connection Setup from Denial-of-Capability Attacks":

https://dl.acm.org/citation.cfm?id=1282413

> 1) I agree that using multiple queues as in prio qdisc would make it
> more manageable; does not necessarily need to be classful if you
> use implicit skbprio classification, i.e. on enqueue use a priority
> map to select a queue; on dequeue always dequeue from highest prio
> until it has no more packets to send.

In my reply to Cong, I point out that there is a technical limitation 
in the interface of queue disciplines that forbids GKprio to have 
explicit sub-queues:

https://www.mail-archive.com/netdev@vger.kernel.org/msg234201.html

> 2) Dropping already enqueued packets will not work well for
> local feedback (__NET_XMIT_BYPASS return code is about the
> packet that has been dropped from earlier enqueueing because
>> it is lower priority - it does not signify anything about the
>> current skb, which actually just got enqueued).
>> Perhaps (off the top of my head) one option is to always enqueue packets of
>> high priority when their limit is exceeded as long as lower prio has
>> some space. That means you'd have to increment low prio accounting if their
>> space is used.

I don't understand the point you are making here. Could you develop it 
further?

[ ]'s
Michel Machado


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-09 14:09         ` Michel Machado
@ 2018-05-10 17:38           ` Cong Wang
  2018-05-10 19:06             ` Michel Machado
  0 siblings, 1 reply; 16+ messages in thread
From: Cong Wang @ 2018-05-10 17:38 UTC (permalink / raw)
  To: Michel Machado
  Cc: Nishanth Devarajan, Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On Wed, May 9, 2018 at 7:09 AM, Michel Machado <michel@digirati.com.br> wrote:
> On 05/08/2018 10:24 PM, Cong Wang wrote:
>>
>> On Tue, May 8, 2018 at 5:59 AM, Michel Machado <michel@digirati.com.br>
>> wrote:
>>>>>
>>>>> Overall it looks good to me, just one thing below:
>>>>>
>>>>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>>>>> +       .id             =       "gkprio",
>>>>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>>>>> +       .enqueue        =       gkprio_enqueue,
>>>>>> +       .dequeue        =       gkprio_dequeue,
>>>>>> +       .peek           =       qdisc_peek_dequeued,
>>>>>> +       .init           =       gkprio_init,
>>>>>> +       .reset          =       gkprio_reset,
>>>>>> +       .change         =       gkprio_change,
>>>>>> +       .dump           =       gkprio_dump,
>>>>>> +       .destroy        =       gkprio_destroy,
>>>>>> +       .owner          =       THIS_MODULE,
>>>>>> +};
>>>>>
>>>>>
>>>>>
>>>>> You probably want to add Qdisc_class_ops here so that you can
>>>>> dump the stats of each internal queue.
>>>
>>>
>>>
>>> Hi Cong,
>>>
>>>     In the production scenario we are targeting, this priority queue must
>>> be
>>> classless; being classful would only bloat the code for us. I don't see
>>> making this queue classful as a problem per se, but I suggest leaving it
>>> as
>>> a future improvement for when someone can come up with a useful scenario
>>> for
>>> it.
>>
>>
>>
>> Take a look at sch_prio, it is fairly simple since your internal
>> queues are just an array... Per-queue stats are quite useful
>> in production, we definitely want to observe which queues are
>> full which are not.
>>
>
> DSprio cannot add Qdisc_class_ops without a rewrite of other queue
> disciplines, which doesn't seem desirable. Since the method cops->leaf is
> required (see register_qdisc()), we would need to replace the array struct
> sk_buff_head qdiscs[GKPRIO_MAX_PRIORITY] in struct gkprio_sched_data with
> the array struct Qdisc *queues[GKPRIO_MAX_PRIORITY] to be able to return a
> Qdisc in dsprio_leaf(). The problem with this change is that Qdisc does not
> have a method to dequeue from its tail. This new method may not even make
> sense in other queue disciplines. But without this method, gkprio_enqueue()
> cannot drop the lowest priority packet when the queue is full and an
> incoming packet has higher priority.

Sorry for giving you a bad example. Take a look at sch_fq_codel instead,
it returns NULL for ->leaf() and maps its internal flows to classes.

I thought sch_prio uses internal qdiscs, but I was wrong, as you noticed
it actually exposes them to user via classes.

My point is never to make it classful, just want to expose the useful stats,
like how fq_codel dumps its internal flows.
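
For example, a rough and untested sketch along the lines of fq_codel, mapping
internal priority p to class p + 1 and reporting only queue lengths:

static int gkprio_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int gkprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
{
	struct gkprio_sched_data *q = qdisc_priv(sch);
	struct gnet_stats_queue qs = { 0 };

	/* Class cl corresponds to internal priority cl - 1. */
	qs.qlen = skb_queue_len(&q->qdiscs[cl - 1]);
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return 0;
}

static void gkprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < GKPRIO_MAX_PRIORITY; i++) {
		if (arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

plus a ->leaf() that simply returns NULL and a trivial ->find(), as fq_codel
does.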


>
> Nevertheless, I see your point on being able to observe the distribution of
> queued packets per priority. A solution for that would be to add the array
> __u32 qlen[GKPRIO_MAX_PRIORITY] in struct tc_gkprio_qopt. This solution even
> avoids adding overhead in the critical paths of DSprio. Do you see a better
> solution?

I believe you can return NULL for ->leaf() and don't need to worry about
->graft() either. ;)


>
> By the way, I've used GKPRIO_MAX_PRIORITY and other names that include
> "gkprio" above to reflect the version 1 of this patch that we are
> discussing. We will rename these identifiers for version 2 of this patch to
> replace "gkprio" with "dsprio".
>

Sounds good.

Thanks.


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-10 17:38           ` Cong Wang
@ 2018-05-10 19:06             ` Michel Machado
  0 siblings, 0 replies; 16+ messages in thread
From: Michel Machado @ 2018-05-10 19:06 UTC (permalink / raw)
  To: Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers, Cody Doucette

On 05/10/2018 01:38 PM, Cong Wang wrote:
> On Wed, May 9, 2018 at 7:09 AM, Michel Machado <michel@digirati.com.br> wrote:
>> On 05/08/2018 10:24 PM, Cong Wang wrote:
>>>
>>> On Tue, May 8, 2018 at 5:59 AM, Michel Machado <michel@digirati.com.br>
>>> wrote:
>>>>>>
>>>>>> Overall it looks good to me, just one thing below:
>>>>>>
>>>>>>> +struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
>>>>>>> +       .id             =       "gkprio",
>>>>>>> +       .priv_size      =       sizeof(struct gkprio_sched_data),
>>>>>>> +       .enqueue        =       gkprio_enqueue,
>>>>>>> +       .dequeue        =       gkprio_dequeue,
>>>>>>> +       .peek           =       qdisc_peek_dequeued,
>>>>>>> +       .init           =       gkprio_init,
>>>>>>> +       .reset          =       gkprio_reset,
>>>>>>> +       .change         =       gkprio_change,
>>>>>>> +       .dump           =       gkprio_dump,
>>>>>>> +       .destroy        =       gkprio_destroy,
>>>>>>> +       .owner          =       THIS_MODULE,
>>>>>>> +};
>>>>>>
>>>>>>
>>>>>>
>>>>>> You probably want to add Qdisc_class_ops here so that you can
>>>>>> dump the stats of each internal queue.
>>>>
>>>>
>>>>
>>>> Hi Cong,
>>>>
>>>>      In the production scenario we are targeting, this priority queue must
>>>> be
>>>> classless; being classful would only bloat the code for us. I don't see
>>>> making this queue classful as a problem per se, but I suggest leaving it
>>>> as
>>>> a future improvement for when someone can come up with a useful scenario
>>>> for
>>>> it.
>>>
>>>
>>>
>>> Take a look at sch_prio, it is fairly simple since your internal
>>> queues are just an array... Per-queue stats are quite useful
>>> in production, we definitely want to observe which queues are
>>> full which are not.
>>>
>>
>> DSprio cannot add Qdisc_class_ops without a rewrite of other queue
>> disciplines, which doesn't seem desirable. Since the method cops->leaf is
>> required (see register_qdisc()), we would need to replace the array struct
>> sk_buff_head qdiscs[GKPRIO_MAX_PRIORITY] in struct gkprio_sched_data with
>> the array struct Qdisc *queues[GKPRIO_MAX_PRIORITY] to be able to return a
>> Qdisc in dsprio_leaf(). The problem with this change is that Qdisc does not
>> have a method to dequeue from its tail. This new method may not even make
>> sense in other queue disciplines. But without this method, gkprio_enqueue()
>> cannot drop the lowest priority packet when the queue is full and an
>> incoming packet has higher priority.
> 
> Sorry for giving you a bad example. Take a look at sch_fq_codel instead,
> it returns NULL for ->leaf() and maps its internal flows to classes.
> 
> I thought sch_prio uses internal qdiscs, but I was wrong, as you noticed
> it actually exposes them to user via classes.
> 
> My point is never to make it classful, just want to expose the useful stats,
> like how fq_codel dumps its internal flows.
> 
> 
>>
>> Nevertheless, I see your point on being able to observe the distribution of
>> queued packets per priority. A solution for that would be to add the array
>> __u32 qlen[GKPRIO_MAX_PRIORITY] in struct tc_gkprio_qopt. This solution even
>> avoids adding overhead in the critical paths of DSprio. Do you see a better
>> solution?
> 
> I believe you can return NULL for ->leaf() and don't need to worry about
> ->graft() either. ;)

Thank you for pointing sch_fq_codel out. We'll follow its example.

[ ]'s
Michel Machado


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-09 17:37             ` Michel Machado
@ 2018-05-12 14:48               ` Jamal Hadi Salim
  2018-05-14 14:08                 ` Michel Machado
  0 siblings, 1 reply; 16+ messages in thread
From: Jamal Hadi Salim @ 2018-05-12 14:48 UTC (permalink / raw)
  To: Michel Machado, Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette

Sorry for the latency..

On 09/05/18 01:37 PM, Michel Machado wrote:
> On 05/09/2018 10:43 AM, Jamal Hadi Salim wrote:
>> On 08/05/18 10:27 PM, Cong Wang wrote:
>>> On Tue, May 8, 2018 at 6:29 AM, Jamal Hadi Salim <jhs@mojatatu.com> 
>>> wrote:

> 
> I like the suggestion of extending skbmod to mark skbprio based on ds. 
> Given that DSprio would no longer depend on the DS field, would you have 
> a name suggestion for this new queue discipline since the name "prio" is 
> currently in use?
> 

Not sure what to call it.
My struggle is still with the intended end goal of the qdisc.
It looks like prio qdisc except for the enqueue part which attempts
to use a shared global queue size for all prios. I would have
pointed to other approaches which use a global priority queue pool
and do early congestion detection like RED or variants like GRED, but
those use average values of the queue lengths, not instantaneous values
as you do.
I am tempted to say - based on my current understanding - that you don't
need a new qdisc; rather you need to map your dsfields to skbprio
(via skbmod) and stick with prio qdisc. I also think the skbmod
mapping is useful regardless of this need.

> What should be the range of priorities that this new queue discipline 
>> would accept? skb->priority is of type __u32, but supporting 2^32 
> priorities would require too large of an array to index packets by 
> priority; the DS field is only 6 bits long. Do you have a use case in 
> mind to guide us here?
>

Look at the priomap or prio2band arrangement on prio qdisc
or pfifo_fast qdisc. You take an skbprio as an index into the array
and retrieve a queue to enqueue to. The size of the array is 16.
In the past this was based IIRC on ip precedence + 1 bit. Those map
similarly to DS fields (class selectors, assured forwarding etc). So
there is no need to even increase the array beyond the current 16.
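
For reference, this is roughly how pfifo_fast picks a band today (lightly
paraphrased from net/sched/sch_generic.c):

/* skb->priority indexes a 16-entry map... */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* ...and on enqueue the selected band picks one of the internal queues. */
int band = prio2band[skb->priority & TC_PRIO_MAX];

The same idea would let your qdisc pick an internal queue from skb->priority
without parsing packet headers.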

>> I find the cleverness in changing the highest/low prios confusing.
>> It looks error-prone (I guess that is why there is a BUG check)
>> To the authors: Is there a document/paper on the theory of this thing
>> as to why no explicit queues are "faster"?
> 
> The priority orientation in GKprio is due to two factors: failing safe 
> and elegance. If zero were the highest priority, any operational mistake 
> that leads not-classified packets through GKprio would potentially 
> disrupt the system. We are humans, we'll make mistakes. The elegance 
> aspect comes from the fact that the assigned priority is not massaged to 
> fit the DS field. We find it helpful while inspecting packets on the wire.
> 
> The reason for us to avoid explicit queues in GKprio, which could change 
> the behavior within a given priority, is to closely abide to the 
> expected behavior assumed to prove Theorem 4.1 in the paper "Portcullis: 
> Protecting Connection Setup from Denial-of-Capability Attacks":
> 
> https://dl.acm.org/citation.cfm?id=1282413
> 

The paper seems to be under a paywall. Googling didn't help.
My concern is still the science behind this; if you had written up
some test setup which shows how you concluded this was a better
approach to DoS prevention and showed some numbers, it would have
helped clarify things greatly.

>> 1) I agree that using multiple queues as in prio qdisc would make it
>> more manageable; does not necessarily need to be classful if you
>> use implicit skbprio classification, i.e. on enqueue use a priority
>> map to select a queue; on dequeue always dequeue from highest prio
>> until it has no more packets to send.
> 
> In my reply to Cong, I point out that there is a technical limitation in 
> the interface of queue disciplines that forbids GKprio to have explicit 
> sub-queues:
> 
> https://www.mail-archive.com/netdev@vger.kernel.org/msg234201.html
> 
>> 2) Dropping already enqueued packets will not work well for
>> local feedback (__NET_XMIT_BYPASS return code is about the
>> packet that has been dropped from earlier enqueueing because
>> it is lower priority - it does not  signify anything with
>> current skb to which actually just got enqueud).
>> Perhaps (off top of my head) is to always enqueue packets on
>> high priority when their limit is exceeded as long as lower prio has
>> some space. Means youd have to increment low prio accounting if their
>> space is used.
> 
> I don't understand the point you are making here. Could you develop it 
> further?
> 

Sorry - I meant NET_XMIT_CN.
If you drop an already enqueued packet, it makes sense to signify as
much using NET_XMIT_CN.
This does not make sense for forwarded packets, but it does
for locally sourced packets.

cheers,
jamal


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-12 14:48               ` Jamal Hadi Salim
@ 2018-05-14 14:08                 ` Michel Machado
  2018-05-16 12:18                   ` Jamal Hadi Salim
  0 siblings, 1 reply; 16+ messages in thread
From: Michel Machado @ 2018-05-14 14:08 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette

> On 09/05/18 01:37 PM, Michel Machado wrote:
>> On 05/09/2018 10:43 AM, Jamal Hadi Salim wrote:
>>> On 08/05/18 10:27 PM, Cong Wang wrote:
>>>> On Tue, May 8, 2018 at 6:29 AM, Jamal Hadi Salim <jhs@mojatatu.com> 
>>>> wrote:
> 
>>
>> I like the suggestion of extending skbmod to mark skbprio based on ds. 
>> Given that DSprio would no longer depend on the DS field, would you 
>> have a name suggestion for this new queue discipline since the name 
>> "prio" is currently in use?
>>
> 
> Not sure what to call it.
> My struggle is still with the intended end goal of the qdisc.
> It looks like prio qdisc except for the enqueue part which attempts
> to use a shared global queue size for all prios. I would have
> pointed to other approaches which use a global priority queue pool
> and do early congestion detection like RED or variants like GRED, but
> those use average values of the queue lengths, not instantaneous values
> as you do.
> I am tempted to say - based on my current understanding - that you don't
> need a new qdisc; rather you need to map your dsfields to skbprio
> (via skbmod) and stick with prio qdisc. I also think the skbmod
> mapping is useful regardless of this need.

A simplified description of what DSprio is meant to do is as follows: 
when a link is overloaded at a router, DSprio makes this router drop the 
packets of lower priority. These priorities are assigned by Gatekeeper 
in such a way that well behaving sources are favored (Theorem 4.1 of the 
Portcullis paper pointed out in my previous email). Moreover, attackers 
cannot do much better than well behaving sources (Theorem 4.2). This 
description is simplified because it omits many other components of 
Gatekeeper that affect the packets that go to DSprio.

Like you, I'm all in for less code. If someone can instruct us on how to 
accomplish the same thing that our patch is doing, we would be happy to 
withdraw it. We have submitted this patch because we want to lower the 
bar to deploy Gatekeeper as much as possible, and requiring network 
operators willing to deploy Gatekeeper to keep patching the kernel is an 
operational burden.

>> What should be the range of priorities that this new queue discipline 
>> would accept? skb->priority is of type __u32, but supporting 2^32 
>> priorities would require too large of an array to index packets by 
>> priority; the DS field is only 6 bits long. Do you have a use case in 
>> mind to guide us here?
>>
> 
> Look at the priomap or prio2band arrangement on prio qdisc
> or pfifo_fast qdisc. You take an skbprio as an index into the array
> and retrieve a queue to enqueue to. The size of the array is 16.
> In the past this was based IIRC on ip precedence + 1 bit. Those map
> similarly to DS fields (class selectors, assured forwarding etc). So
> there is no need to even increase the array beyond the current 16.

What application is this change supposed to enable or help? I think this 
change should be left for when one can explain the need for it.

>>> 2) Dropping already enqueued packets will not work well for
>>> local feedback (__NET_XMIT_BYPASS return code is about the
>>> packet that has been dropped from earlier enqueueing because
>>> it is lower priority - it does not signify anything about the
>>> current skb, which actually just got enqueued).
>>> Perhaps (off the top of my head) one option is to always enqueue packets of
>>> high priority when their limit is exceeded as long as lower prio has
>>> some space. That means you'd have to increment low prio accounting if their
>>> space is used.
>>
>> I don't understand the point you are making here. Could you develop it 
>> further?
>>
> 
> Sorry - I meant NET_XMIT_CN.
> If you drop an already enqueued packet, it makes sense to signify as
> much using NET_XMIT_CN.
> This does not make sense for forwarded packets, but it does
> for locally sourced packets.

Thank you for bringing this detail to our attention; we've overlooked 
the return code NET_XMIT_CN. We'll adopt it when the queue is full and 
the lowest priority packet in the queue is being dropped to make room 
for the higher-priority, incoming packet.
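
Concretely, we are thinking of a small change at the end of gkprio_enqueue(),
along these lines (a sketch of what v2 would do):

	/* A lower priority packet already in the queue was dropped to make
	 * room for the new one, so report congestion to locally generated
	 * traffic instead of plain success.
	 */
	return NET_XMIT_CN;	/* was: return NET_XMIT_SUCCESS; */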

[ ]'s
Michel Machado


* Re: [PATCH net-next] net:sched: add gkprio scheduler
  2018-05-14 14:08                 ` Michel Machado
@ 2018-05-16 12:18                   ` Jamal Hadi Salim
  0 siblings, 0 replies; 16+ messages in thread
From: Jamal Hadi Salim @ 2018-05-16 12:18 UTC (permalink / raw)
  To: Michel Machado, Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette

Sorry I dropped this.

On 14/05/18 10:08 AM, Michel Machado wrote:
>> On 09/05/18 01:37 PM, Michel Machado wrote:

> 
> A simplified description of what DSprio is meant to do is as follows: 
> when a link is overloaded at a router, DSprio makes this router drop the 
> packets of lower priority.

Makes sense. Any priority-based work-conserving scheduler will work
fine. The only small difference you have with prio qdisc is that you
drop an enqueued low prio packet to make room for a new higher prio
packet. Can you look at the pfifo_head_drop qdisc to see if it suffices?
It may not; in such a case, I would suggest a hybrid between
pfifo_head_drop and pfifo_fast for the new qdisc.
[Cong has suggested writing a classful qdisc, but it may be sufficient
to just replicate what pfifo_fast does since it tracks virtual queues.]

> These priorities are assigned by Gatekeeper 
> in such a way that well behaving sources are favored (Theorem 4.1 of the 
> Portcullis paper pointed out in my previous email). Moreover, attackers 
> cannot do much better than well behaving sources (Theorem 4.2). This 
> description is simplified because it omits many other components of 
> Gatekeeper that affect the packets that go to DSprio.
> 

I am sorry - I have no access to this document so I don't know what these
theorems are. I understand your requirements. 1) You are looking to use
priority identifiers to select queues. 2) You want to prioritize
treatment of favorably tagged packets. The enqueueing will drop
lower priority packets to make space for higher priority ones under
congestion. Did I miss anything?
For #1 my suggestion is to use skbmod to set the priority tag.
For #2, if you didn't have to drop at enqueue time, you could have
used any of the existing priority-favoring qdiscs which recognize
skb->priority. Otherwise, as I suggested above, look at
pfifo_fast/pfifo_head_drop.

> Like you, I'm all in for less code. If someone can instruct us on how to 
> accomplish the same thing that our patch is doing, we would be happy to 
> withdraw it. We have submitted this patch because we want to lower the 
> bar to deploy Gatekeeper as much as possible, and requiring network 
> operators willing to deploy Gatekeeper to keep patching the kernel is an 
> operational burden.
> 

So I would suggest you keep this real simple - especially if you want to
go backwards in kernels. For existing kernels you can implement the
basic policies of what you need by using prio qdisc with a combination
of a classifier that knows how to match on dsfield (trivial to do with
u32) and skbedit action to tag the skb->priority. Then let prio qdisc
use the priomap to select the queue.
If you must drop enqueued low prio packets then you may need the new
qdisc. And to optimize, you will need the skbmod change.
I really think it is a bad idea to encapsulate the classifier in the
qdisc.


>> Look at the priomap or prio2band arrangement on prio qdisc
>> or pfifo_fast qdisc. You take an skbprio as an index into the array
>> and retrieve a queue to enqueue to. The size of the array is 16.
>> In the past this was based IIRC on ip precedence + 1 bit. Those map
>> similarly to DS fields (class selectors, assured forwarding etc). So
>> there is no need to even increase the array beyond the current 16.
> 
> What application is this change supposed to enable or help? I think this 
> change should be left for when one can explain the need for it.
> 

I meant for you to take a look at the priomap. It is an array of size 16
which holds the skb->priority implicit classifier (prio, pfifo_fast etc).
A packet's skb->priority is used as an index into this array and from the
result a queue is selected to put the packet onto.
The map of this array can be configured from user space. I was saying
earlier that it may be tempting to make a size 64 array to map the
possible dsfields - in practice that has never been pragmatic (so 16 was
sufficient).


cheers,
jamal

