* [RFC] sched: CHOKe packet scheduler
@ 2011-01-05  0:29 Stephen Hemminger
  2011-01-05  6:02 ` Eric Dumazet
  2011-01-05  6:19 ` Eric Dumazet
  0 siblings, 2 replies; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-05  0:29 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

This implements the CHOKe packet scheduler on top of the existing
Linux RED scheduler, following the algorithm described in the paper.
Configuration is the same as RED; only the name changes.

The core idea is:
  For every packet arrival:
  	Calculate Qave
	if (Qave < minth) {
	   Queue the new packet
	}
	Else {
	     Select randomly a packet from the queue for their flow id
	     Compare arriving packet with a randomly selected packet.
	     If they have the same flow id {
	     	Drop both the packets
	     }
	     Else {
	     	  if (Qave ≥ maxth) {
		     Calculate the dropping probability pa
		     Drop the packet with probability pa
		  }
		  Else {
		     Drop the new packet
		  }
	     }
       }

This is an early access version.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 net/sched/Kconfig     |   11 +
 net/sched/Makefile    |    1 
 net/sched/sch_choke.c |  364 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 376 insertions(+)

--- a/net/sched/Kconfig	2011-01-04 16:25:18.000000000 -0800
+++ b/net/sched/Kconfig	2011-01-04 16:26:02.335973715 -0800
@@ -205,6 +205,17 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which tries to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
--- a/net/sched/Makefile	2011-01-04 16:25:18.000000000 -0800
+++ b/net/sched/Makefile	2011-01-04 16:26:16.048938937 -0800
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/net/sched/sch_choke.c	2011-01-04 16:25:33.913971468 -0800
@@ -0,0 +1,364 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/red.h>
+#include <net/ipv6.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+	Source:
+	R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+	Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+	IEEE INFOCOM, 2000.
+
+	A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+	Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+	ADVANTAGE:
+	- Penalizes unfair flows
+	- Random drops provide gradual feedback
+
+	DRAWBACKS:
+	- Small queue for single flow
+	- Can be gamed by opening lots of connections
+	- Hard to get correct parameters (same problem as RED)
+
+ */
+
+struct choke_sched_data
+{
+	u32		  limit;
+	unsigned char	  flags;
+
+	struct red_parms  parms;
+	struct red_stats  stats;
+};
+
+/* Select a packet at random from the list.
+ * Same caveats as skb_peek.
+ */
+static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
+{
+	struct sk_buff *skb = list->next;
+	unsigned int idx = net_random() % list->qlen;
+
+	while (skb && idx-- > 0)
+		skb = skb->next;
+
+	return skb;
+}
+
+/* Given IP header and size find src/dst port pair */
+static inline u32 get_ports(const void *hdr, size_t hdr_size, int offset)
+{
+	return *(u32 *)(hdr + hdr_size + offset);
+}
+
+
+static bool same_flow(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+	if (nskb->protocol != oskb->protocol)
+		return false;
+
+	switch (nskb->protocol) {
+	case htons(ETH_P_IP):
+	{
+		const struct iphdr *iph1, *iph2;
+		int poff;
+
+		if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
+			return false;
+
+		iph1 = ip_hdr(nskb);
+		iph2 = ip_hdr(oskb);
+
+		if (iph1->protocol != iph2->protocol ||
+		    iph1->daddr != iph2->daddr ||
+		    iph1->saddr != iph2->saddr)
+			return false;
+
+		/* Be hostile to new fragmented packets */
+		if (iph1->frag_off & htons(IP_MF|IP_OFFSET))
+			return true;
+
+		if (iph2->frag_off & htons(IP_MF|IP_OFFSET))
+			return false;
+
+		poff = proto_ports_offset(iph1->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(nskb, iph1->ihl * 4 + 4 + poff)) {
+			iph1 = ip_hdr(nskb);
+
+			return get_ports(iph1, iph1->ihl * 4, poff)
+				== get_ports(iph2, iph2->ihl * 4, poff);
+		}
+
+		return false;
+	}
+
+	case htons(ETH_P_IPV6):
+	{
+		const struct ipv6hdr *iph1, *iph2;
+		int poff;
+
+		if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
+			return false;
+
+		iph1 = ipv6_hdr(nskb);
+		iph2 = ipv6_hdr(oskb);
+
+		if (iph1->nexthdr != iph2->nexthdr ||
+		    ipv6_addr_cmp(&iph1->daddr, &iph2->daddr) != 0 ||
+		    ipv6_addr_cmp(&iph1->saddr, &iph2->saddr) != 0)
+			return false;
+
+		poff = proto_ports_offset(iph1->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(nskb, sizeof(*iph1) + 4 + poff)) {
+			iph1 = ipv6_hdr(nskb);
+
+			return get_ports(iph1, sizeof(*iph1), poff)
+				== get_ports(iph2, sizeof(*iph2), poff);
+		}
+		return false;
+	}
+	default:
+		return false;
+	}
+
+}
+
+/*
+ * Decide what to do with the new packet based on queue size.
+ * Returns NET_XMIT_SUCCESS if the packet is admitted,
+ * NET_XMIT_DROP or NET_XMIT_CN if it is dropped.
+ */
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+
+	p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		struct sk_buff *oskb;
+
+		/* Draw a packet at random from queue */
+		oskb = skb_peek_random(&sch->q);
+
+		/* Both packets from same flow? */
+		if (same_flow(skb, oskb)) {
+			/* Drop both packets */
+			__skb_unlink(oskb, &sch->q);
+			qdisc_drop(oskb, sch);
+			goto congestion_drop;
+		}
+
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			q->stats.forced_drop++;
+			goto congestion_drop;
+		}
+
+		if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				q->stats.prob_drop++;
+				goto congestion_drop;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (likely(skb_queue_len(&sch->q) < q->limit))
+		return qdisc_enqueue_tail(skb, sch);
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	skb = qdisc_dequeue_head(sch);
+	if (!skb) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_RED_STAB]));
+
+	if (skb_queue_empty(&sch->q))
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	qdisc_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");


* Re: [RFC] sched: CHOKe packet scheduler
  2011-01-05  0:29 [RFC] sched: CHOKe packet scheduler Stephen Hemminger
@ 2011-01-05  6:02 ` Eric Dumazet
  2011-01-05  6:19 ` Eric Dumazet
  1 sibling, 0 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-05  6:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Tue, 04 Jan 2011 at 16:29 -0800, Stephen Hemminger wrote:
> This implements the CHOKe packet scheduler based on the existing
> Linux RED scheduler based on the algorithm described in the paper.
> Configuration is the same as RED; only the name changes.
> 
> The core idea is:
>   For every packet arrival:
>   	Calculate Qave
> 	if (Qave < minth) {
> 	   Queue the new packet
> 	}
> 	Else {
> 	     Select randomly a packet from the queue for their flow id
> 	     Compare arriving packet with a randomly selected packet.
> 	     If they have the same flow id {
> 	     	Drop both the packets
> 	     }
> 	     Else {
> 	     	  if (Qave ≥ maxth) {

you mean if (Qave is less than maxth) ?

> 		     Calculate the dropping probability pa
> 		     Drop the packet with probability pa
> 		  }
> 		  Else {
> 		     Drop the new packet
> 		  }
> 	     }
>        }
> 
> This is an early access version.
> 

No ECN support at all? Even RED supports it :)

> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
>  net/sched/Kconfig     |   11 +
>  net/sched/Makefile    |    1 
>  net/sched/sch_choke.c |  364 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 376 insertions(+)
> 
> --- a/net/sched/Kconfig	2011-01-04 16:25:18.000000000 -0800
> +++ b/net/sched/Kconfig	2011-01-04 16:26:02.335973715 -0800
> @@ -205,6 +205,17 @@ config NET_SCH_DRR
>  
>  	  If unsure, say N.
>  
> +config NET_SCH_CHOKE
> +	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
> +	help
> +	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
> +	  and Keep for responsive flows, CHOose and Kill for unresponsive
> +	  flows). This is a variation of RED which tries to penalize flows
> +	  that monopolize the queue.
> +
> +	  To compile this code as a module, choose M here: the
> +	  module will be called sch_choke.
> +
>  config NET_SCH_INGRESS
>  	tristate "Ingress Qdisc"
>  	depends on NET_CLS_ACT
> --- a/net/sched/Makefile	2011-01-04 16:25:18.000000000 -0800
> +++ b/net/sched/Makefile	2011-01-04 16:26:16.048938937 -0800
> @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
>  obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
>  obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
> +obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
>  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
>  obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ b/net/sched/sch_choke.c	2011-01-04 16:25:33.913971468 -0800
> @@ -0,0 +1,364 @@
> +/*
> + * net/sched/sch_choke.c	CHOKE scheduler
> + *
> + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/skbuff.h>
> +#include <linux/ipv6.h>
> +#include <linux/jhash.h>
> +#include <net/pkt_sched.h>
> +#include <net/ip.h>
> +#include <net/red.h>
> +#include <net/ipv6.h>
> +
> +/*	CHOKe stateless AQM for fair bandwidth allocation
> +        =================================================
> +
> +	Source:
> +	R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
> +	Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
> +	IEEE INFOCOM, 2000.
> +
> +	A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
> +	Characteristics", IEEE/ACM Transactions on Networking, 2004
> +
> +	ADVANTAGE:
> +	- Penalizes unfair flows
> +	- Random drops provide gradual feedback
> +
> +	DRAWBACKS:
> +	- Small queue for single flow
> +	- Can be gamed by opening lots of connections
> +	- Hard to get correct parameters (same problem as RED)

	Big packets are really unfair. Must disable TSO/GRO :)

> +
> + */
> +
> +struct choke_sched_data
> +{
> +	u32		  limit;
> +	unsigned char	  flags;
> +
> +	struct red_parms  parms;
> +	struct red_stats  stats;
> +};
> +
> +/* Select a packet at random from the list.
> + * Same caveats as skb_peek.
> + */
> +static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
> +{
> +	struct sk_buff *skb = list->next;
> +	unsigned int idx = net_random() % list->qlen;
> +
> +	while (skb && idx-- > 0)
> +		skb = skb->next;

Ouch... A linked list (using the skb anchors) is not an appropriate data
structure. That's too many cache line misses.

Maybe use a q->limit array of skb pointers?

> +
> +	return skb;
> +}
> +
> +/* Given IP header and size find src/dst port pair */
> +static inline u32 get_ports(const void *hdr, size_t hdr_size, int offset)
> +{
> +	return *(u32 *)(hdr + hdr_size + offset);
> +}
> +
> +
> +static bool same_flow(struct sk_buff *nskb, const struct sk_buff *oskb)
> +{
> +	if (nskb->protocol != oskb->protocol)
> +		return false;
> +
> +	switch (nskb->protocol) {
> +	case htons(ETH_P_IP):
> +	{
> +		const struct iphdr *iph1, *iph2;
> +		int poff;
> +
> +		if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
> +			return false;

Why isn't it necessary to also may_pull test oskb?

> +
> +		iph1 = ip_hdr(nskb);
> +		iph2 = ip_hdr(oskb);
> +
> +		if (iph1->protocol != iph2->protocol ||
> +		    iph1->daddr != iph2->daddr ||
> +		    iph1->saddr != iph2->saddr)
> +			return false;
> +
> +		/* Be hostile to new fragmented packets */
> +		if (iph1->frag_off & htons(IP_MF|IP_OFFSET))
> +			return true;
> +
> +		if (iph2->frag_off & htons(IP_MF|IP_OFFSET))
> +			return false;
> +
> +		poff = proto_ports_offset(iph1->protocol);
> +		if (poff >= 0 &&
> +		    pskb_network_may_pull(nskb, iph1->ihl * 4 + 4 + poff)) {
> +			iph1 = ip_hdr(nskb);
> +
> +			return get_ports(iph1, iph1->ihl * 4, poff)
> +				== get_ports(iph2, iph2->ihl * 4, poff);
> +		}
> +
> +		return false;
> +	}
> +
> +	case htons(ETH_P_IPV6):
> +	{
> +		const struct ipv6hdr *iph1, *iph2;
> +		int poff;
> +
> +		if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
> +			return false;

same here.

> +
> +		iph1 = ipv6_hdr(nskb);
> +		iph2 = ipv6_hdr(oskb);
> +
> +		if (iph1->nexthdr != iph2->nexthdr ||
> +		    ipv6_addr_cmp(&iph1->daddr, &iph2->daddr) != 0 ||
> +		    ipv6_addr_cmp(&iph1->saddr, &iph2->saddr) != 0)
> +			return false;
> +
> +		poff = proto_ports_offset(iph1->nexthdr);
> +		if (poff >= 0 &&
> +		    pskb_network_may_pull(nskb, sizeof(*iph1) + 4 + poff)) {
> +			iph1 = ipv6_hdr(nskb);
> +
> +			return get_ports(iph1, sizeof(*iph1), poff)
> +				== get_ports(iph2, sizeof(*iph2), poff);
> +		}
> +		return false;
> +	}
> +	default:
> +		return false;
> +	}
> +
> +}
> +
> +/*
> + * Decide what to do with the new packet based on queue size.
> + * Returns NET_XMIT_SUCCESS if the packet is admitted,
> + * NET_XMIT_DROP or NET_XMIT_CN if it is dropped.
> + */
> +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct red_parms *p = &q->parms;
> +
> +	p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
> +	if (red_is_idling(p))
> +		red_end_of_idle_period(p);
> +
> +	if (p->qavg <= p->qth_min)
> +		p->qcount = -1;
> +	else {
> +		struct sk_buff *oskb;
> +
> +		/* Draw a packet at random from queue */
> +		oskb = skb_peek_random(&sch->q);
> +
> +		/* Both packets from same flow? */
> +		if (same_flow(skb, oskb)) {
> +			/* Drop both packets */
> +			__skb_unlink(oskb, &sch->q);
> +			qdisc_drop(oskb, sch);
> +			goto congestion_drop;
> +		}
> +
> +		if (p->qavg > p->qth_max) {

OK, maybe the CHOKe paper used: if (p->qavg >= p->qth_max)?

> +			p->qcount = -1;
> +
> +			sch->qstats.overlimits++;
> +			q->stats.forced_drop++;
> +			goto congestion_drop;
> +		}
> +
> +		if (++p->qcount) {
> +			if (red_mark_probability(p, p->qavg)) {
> +				p->qcount = 0;
> +				p->qR = red_random(p);
> +
> +				sch->qstats.overlimits++;
> +				q->stats.prob_drop++;
> +				goto congestion_drop;
> +			}
> +		} else
> +			p->qR = red_random(p);
> +	}
> +
> +	/* Admit new packet */
> +	if (likely(skb_queue_len(&sch->q) < q->limit))
> +		return qdisc_enqueue_tail(skb, sch);
> +
> +	q->stats.pdrop++;
> +	sch->qstats.drops++;
> +	kfree_skb(skb);
> +	return NET_XMIT_DROP;
> +
> + congestion_drop:
> +	qdisc_drop(skb, sch);
> +	return NET_XMIT_CN;
> +}
> +



* Re: [RFC] sched: CHOKe packet scheduler
  2011-01-05  0:29 [RFC] sched: CHOKe packet scheduler Stephen Hemminger
  2011-01-05  6:02 ` Eric Dumazet
@ 2011-01-05  6:19 ` Eric Dumazet
  2011-01-05 17:17   ` Stephen Hemminger
  1 sibling, 1 reply; 27+ messages in thread
From: Eric Dumazet @ 2011-01-05  6:19 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Tue, 04 Jan 2011 at 16:29 -0800, Stephen Hemminger wrote:
> +static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
> +{
> +	struct sk_buff *skb = list->next;
> +	unsigned int idx = net_random() % list->qlen;
> +
> +	while (skb && idx-- > 0)
> +		skb = skb->next;
> +
> +	return skb;
> +}

You could avoid the divide op :

unsigned int idx = reciprocal_divide(random32(), list->qlen);





* Re: [RFC] sched: CHOKe packet scheduler
  2011-01-05  6:19 ` Eric Dumazet
@ 2011-01-05 17:17   ` Stephen Hemminger
  2011-01-05 17:25     ` Eric Dumazet
  0 siblings, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-05 17:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Wed, 05 Jan 2011 07:19:35 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Tue, 04 Jan 2011 at 16:29 -0800, Stephen Hemminger wrote:
> > +static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
> > +{
> > +	struct sk_buff *skb = list->next;
> > +	unsigned int idx = net_random() % list->qlen;
> > +
> > +	while (skb && idx-- > 0)
> > +		skb = skb->next;
> > +
> > +	return skb;
> > +}
> 
> You could avoid the divide op :
> 
> unsigned int idx = reciprocal_divide(random32(), list->qlen);

How would this work? It is a mod, not a divide.

-- 


* Re: [RFC] sched: CHOKe packet scheduler
  2011-01-05 17:17   ` Stephen Hemminger
@ 2011-01-05 17:25     ` Eric Dumazet
  2011-01-05 19:21       ` [RFC] sched: CHOKe packet scheduler (v0.2) Stephen Hemminger
  0 siblings, 1 reply; 27+ messages in thread
From: Eric Dumazet @ 2011-01-05 17:25 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Wed, 05 Jan 2011 at 09:17 -0800, Stephen Hemminger wrote:
> On Wed, 05 Jan 2011 07:19:35 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > On Tue, 04 Jan 2011 at 16:29 -0800, Stephen Hemminger wrote:
> > > +static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
> > > +{
> > > +	struct sk_buff *skb = list->next;
> > > +	unsigned int idx = net_random() % list->qlen;
> > > +
> > > +	while (skb && idx-- > 0)
> > > +		skb = skb->next;
> > > +
> > > +	return skb;
> > > +}
> > 
> > You could avoid the divide op :
> > 
> > unsigned int idx = reciprocal_divide(random32(), list->qlen);
> 
> How would this work? It is a mod, not a divide.
> 

It works because random32() provides a 32-bit 'random' number
between 0 and 0xFFFFFFFF.

We multiply it by X to get a 64-bit number between 0 and 0xFFFFFFFF * X,
then we right shift it by 32 to get a number between 0 and X - 1.

We don't need the modulus, just a random number between 0 and X - 1.

Don't worry, we should add a helper function to do that, since it might
be used in many places.

/* deliver a random number between 0 and N - 1 */
u32 random_N(unsigned int N)
{
	return reciprocal_divide(random32(), N);
}
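As a quick sanity check with made-up numbers: for N = 5 and a random32()
value of 0x80000000 (i.e. 2^31), the 64-bit product is 5 * 2^31 =
0x280000000, and shifting it right by 32 gives 2, which indeed lies in
the range 0 .. N - 1.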






* [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-05 17:25     ` Eric Dumazet
@ 2011-01-05 19:21       ` Stephen Hemminger
  2011-01-05 20:06         ` Eric Dumazet
  2011-01-06  4:07         ` Eric Dumazet
  0 siblings, 2 replies; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-05 19:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

This implements the CHOKe packet scheduler on top of the existing
Linux RED scheduler, following the algorithm described in the paper.

The core idea is:
  For every packet arrival:
  	Calculate Qave
	if (Qave < minth) 
	     Queue the new packet
	else 
	     Select randomly a packet from the queue 
	     if (both packets from same flow)
	     then Drop both the packets
	     else if (Qave > maxth)
	          Drop packet
	     else
	       	  Admit packet with probability p (same as RED)

See also:
  Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
   queue management scheme for approximating fair bandwidth allocation", 
  Proceeding of INFOCOM'2000, March 2000. 

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
New in this version:
	- use skb hash values for flow matching
	- reciprocal_divide optimization
	- optional ECN support (same as RED)

 net/sched/Kconfig     |   11 +
 net/sched/Makefile    |    1 
 net/sched/sch_choke.c |  364 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 376 insertions(+)

--- a/net/sched/Kconfig	2011-01-04 16:25:18.000000000 -0800
+++ b/net/sched/Kconfig	2011-01-05 09:01:33.280032462 -0800
@@ -205,6 +205,17 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which tries to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
--- a/net/sched/Makefile	2011-01-04 16:25:18.000000000 -0800
+++ b/net/sched/Makefile	2011-01-05 09:01:33.284032598 -0800
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/net/sched/sch_choke.c	2011-01-05 11:18:14.422320985 -0800
@@ -0,0 +1,307 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not a "classful" qdisc because it
+   needs to access packets in queue randomly.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+struct choke_sched_data
+{
+	u32		  limit;
+	unsigned char	  flags;
+
+	struct red_parms  parms;
+	struct red_stats  stats;
+};
+
+
+/* deliver a random number between 0 and N - 1 */
+static inline u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+/* Select a packet at random from the list.
+ * Same caveats as skb_peek.
+ */
+static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
+{
+	struct sk_buff *skb = list->next;
+	unsigned int idx = random_N(list->qlen);
+
+	while (skb && idx-- > 0)
+		skb = skb->next;
+
+	return skb;
+}
+
+
+static inline int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+static inline int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+
+	p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		struct sk_buff *oskb;
+
+		/* Draw a packet at random from queue */
+		oskb = skb_peek_random(&sch->q);
+
+		/* Both packets from same flow?
+		 * Assumes skb_get_rxhash already set hash on oskb->rxhash
+		 * prior to queuing
+		 */
+		if (oskb->rxhash == skb_get_rxhash(skb)) {
+			/* Drop both packets */
+			__skb_unlink(oskb, &sch->q);
+			qdisc_drop(oskb, sch);
+			goto congestion_drop;
+		}
+
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		}
+
+		if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (likely(skb_queue_len(&sch->q) < q->limit))
+		return qdisc_enqueue_tail(skb, sch);
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	skb = qdisc_dequeue_head(sch);
+	if (!skb) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_RED_STAB]));
+
+	if (skb_queue_empty(&sch->q))
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	qdisc_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");


* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-05 19:21       ` [RFC] sched: CHOKe packet scheduler (v0.2) Stephen Hemminger
@ 2011-01-05 20:06         ` Eric Dumazet
  2011-01-05 20:15           ` Stephen Hemminger
  2011-01-06  4:07         ` Eric Dumazet
  1 sibling, 1 reply; 27+ messages in thread
From: Eric Dumazet @ 2011-01-05 20:06 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Wed, 05 Jan 2011 at 11:21 -0800, Stephen Hemminger wrote:
> This implements the CHOKe packet scheduler based on the existing
> Linux RED scheduler based on the algorithm described in the paper.
> 

> +
> +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct red_parms *p = &q->parms;
> +
> +	p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
> +	if (red_is_idling(p))
> +		red_end_of_idle_period(p);
> +
> +	if (p->qavg <= p->qth_min)
> +		p->qcount = -1;
> +	else {
> +		struct sk_buff *oskb;
> +
> +		/* Draw a packet at random from queue */
> +		oskb = skb_peek_random(&sch->q);
> +
> +		/* Both packets from same flow?
> +		 * Assumes skb_get_rxhash already set hash on oskb->rxhash
> +		 * prior to queuing

but this is not true... if at that time p->qavg was <= p->qth_min, the
packet was directly enqueued.

Just use:

if (skb_get_rxhash(oskb) == skb_get_rxhash(skb))

Since skb_get_rxhash(skb) doesn't recompute the rxhash if it is already set.

Hmm... I am now wondering if this actually works on egress at all
(can we use rxhash here, I mean)



> +		 */
> +		if (oskb->rxhash == skb_get_rxhash(skb)) {
> +			/* Drop both packets */
> +			__skb_unlink(oskb, &sch->q);
> +			qdisc_drop(oskb, sch);
> +			goto congestion_drop;
> +		}




* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-05 20:06         ` Eric Dumazet
@ 2011-01-05 20:15           ` Stephen Hemminger
  0 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-05 20:15 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Wed, 05 Jan 2011 21:06:07 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Wed, 05 Jan 2011 at 11:21 -0800, Stephen Hemminger wrote:
> > This implements the CHOKe packet scheduler based on the existing
> > Linux RED scheduler based on the algorithm described in the paper.
> > 
> 
> > +
> > +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> > +{
> > +	struct choke_sched_data *q = qdisc_priv(sch);
> > +	struct red_parms *p = &q->parms;
> > +
> > +	p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
> > +	if (red_is_idling(p))
> > +		red_end_of_idle_period(p);
> > +
> > +	if (p->qavg <= p->qth_min)
> > +		p->qcount = -1;
> > +	else {
> > +		struct sk_buff *oskb;
> > +
> > +		/* Draw a packet at random from queue */
> > +		oskb = skb_peek_random(&sch->q);
> > +
> > +		/* Both packets from same flow?
> > +		 * Assumes skb_get_rxhash already set hash on oskb->rxhash
> > +		 * prior to queuing
> 
> but this is not true... if at that time, p->qavg was <= p->qth_min.
> Packet was directly enqueued.
> 
> Just use :
> 
> if (skb_get_rxhash(oskb) == skb_get_rxhash(skb))
> 
> Since skb_get_rxhash(skb) doesn't recompute the rxhash if it is already set.
> 
> Hmm... I am now wondering if this actually works on egress at all
> (can we use rxhash here I mean)
> 
> 
> 
> > +		 */
> > +		if (oskb->rxhash == skb_get_rxhash(skb)) {
> > +			/* Drop both packets */
> > +			__skb_unlink(oskb, &sch->q);
> > +			qdisc_drop(oskb, sch);
> > +			goto congestion_drop;
> > +		}
> 

The code computes a value; whether it is correct or not is another question.
Also, I am a little concerned that different NICs compute different flow hash
values, which could cause false positives.

-- 


* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-05 19:21       ` [RFC] sched: CHOKe packet scheduler (v0.2) Stephen Hemminger
  2011-01-05 20:06         ` Eric Dumazet
@ 2011-01-06  4:07         ` Eric Dumazet
  2011-01-06  6:53           ` Stephen Hemminger
  2011-01-07  4:55           ` Stephen Hemminger
  1 sibling, 2 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-06  4:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Wed, 05 Jan 2011 at 11:21 -0800, Stephen Hemminger wrote:
> This implements the CHOKe packet scheduler based on the existing
> Linux RED scheduler based on the algorithm described in the paper.
> 
> The core idea is:
>   For every packet arrival:
>   	Calculate Qave
> 	if (Qave < minth) 
> 	     Queue the new packet
> 	else 
> 	     Select randomly a packet from the queue 
> 	     if (both packets from same flow)
> 	     then Drop both the packets
> 	     else if (Qave > maxth)
> 	          Drop packet
> 	     else
> 	       	  Admit packet with probability p (same as RED)
> 
> See also:
>   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
>    queue management scheme for approximating fair bandwidth allocation", 
>   Proceeding of INFOCOM'2000, March 2000. 
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 

To be really useful in a wide range of environments, I believe that :

- CHOKe should be able to use an external flow classifier (like say...
SFQ) to compute a token and compare two skbs by this token instead of
custom rxhash or whatever (rxhash can be the default in the absence of a
flow classifier). Probably you need to store the token in skb->cb[] to avoid
calling tc_classify() several times for a given packet (a rough sketch is at
the end of this mail).

http://lwn.net/Articles/236200/
http://kerneltrap.org/mailarchive/linux-netdev/2008/1/31/667679

- Must use a FIFO with O(1) access to Nth skb in queue.
 
 A linked list makes this implementation too expensive for big queues.

For small queues (less than 128 skbs at this moment for SFQ), existing
schedulers are good enough.

The CHOKe authors don't mention this in their paper, but their experiments
were done in 1999 with 1 Mb/s links: minth=100, maxth=200, limit=300.

We want to try CHOKe with modern links, probably with minth=2000 and
maxth=4000 or more.

They said "It is arguably more difficult to drop a randomly chosen packet
since this means removing from a linked-list. Instead of doing this, we
propose to add one extra bit to the packet header. The bit is set to one
if the drop candidate is to be dropped. When a packet advances to the
head of the FIFO buffer, the status of the bit determines whether it is
to be immediately discarded or transmitted on the outgoing line"

If they thought removing a buffer from a linked list was expensive
(!!!), they certainly assumed the previous access to the randomly chosen
buffer was faster than the skb unlink !

Using a circular buffer should be enough, with a similar trick to the one
they suggested: when dropping an skb from the ring, stick a NULL pointer
in its slot and don't memmove() the window to shrink it.

struct skb_ring {
	unsigned int head;
	unsigned int tail;
	unsigned int size; /* a power of two */
	struct sk_buff **table;
};

Doing so avoids the cache misses to adjacent skbs prev/next when
queue/dequeue is done.
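
Rough, untested sketches of both points, with all names purely illustrative
(none of this is from the posted patch):

/* pick a random slot between head and tail; may return NULL (a hole).
 * The empty-queue check and the hole accounting are left out here.
 */
static struct sk_buff *ring_peek_random(struct skb_ring *r, unsigned int *pidx)
{
	unsigned int len = (r->tail - r->head) & (r->size - 1);

	*pidx = (r->head + reciprocal_divide(random32(), len)) & (r->size - 1);
	return r->table[*pidx];
}

/* drop by index: leave a NULL hole instead of shrinking the window */
static void ring_drop_by_idx(struct skb_ring *r, unsigned int idx)
{
	r->table[idx] = NULL;
}

And for the flow token cached in skb->cb[], so tc_classify() runs at most
once per packet and the comparison against the randomly chosen packet is a
plain u32 compare:

struct choke_skb_cb {
	u32 token;	/* cached flow token, 0 = not classified yet */
};

static inline struct choke_skb_cb *choke_skb_cb(struct sk_buff *skb)
{
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static u32 choke_flow_token(struct sk_buff *skb, struct tcf_proto *filter_list)
{
	struct tcf_result res;
	u32 token = choke_skb_cb(skb)->token;

	if (token)
		return token;

	if (filter_list && tc_classify(skb, filter_list, &res) >= 0)
		token = TC_H_MIN(res.classid);
	else
		token = skb_get_rxhash(skb);	/* default when no filters */

	choke_skb_cb(skb)->token = token;
	return token;
}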






* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-06  4:07         ` Eric Dumazet
@ 2011-01-06  6:53           ` Stephen Hemminger
  2011-01-07  4:55           ` Stephen Hemminger
  1 sibling, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-06  6:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Thu, 06 Jan 2011 05:07:30 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> - CHOKe should be able to use an external flow classifier (like say...
> SFQ) to compute a token and compare two skbs by this token instead of
> custom rxhash or whatever.

I don't think you can apply a filter to a non-classful qdisc?




* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-06  4:07         ` Eric Dumazet
  2011-01-06  6:53           ` Stephen Hemminger
@ 2011-01-07  4:55           ` Stephen Hemminger
  2011-01-07  5:39             ` Changli Gao
                               ` (2 more replies)
  1 sibling, 3 replies; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-07  4:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Thu, 06 Jan 2011 05:07:30 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Wed, 05 Jan 2011 at 11:21 -0800, Stephen Hemminger wrote:
> > This implements the CHOKe packet scheduler based on the existing
> > Linux RED scheduler based on the algorithm described in the paper.
> > 
> > The core idea is:
> >   For every packet arrival:
> >   	Calculate Qave
> > 	if (Qave < minth) 
> > 	     Queue the new packet
> > 	else 
> > 	     Select randomly a packet from the queue 
> > 	     if (both packets from same flow)
> > 	     then Drop both the packets
> > 	     else if (Qave > maxth)
> > 	          Drop packet
> > 	     else
> > 	       	  Admit packet with probability p (same as RED)
> > 
> > See also:
> >   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
> >    queue management scheme for approximating fair bandwidth allocation", 
> >   Proceeding of INFOCOM'2000, March 2000. 
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> 
> To be really useful in a wide range of environments, I believe that :
> 
> - CHOKe should be able to use an external flow classifier (like say...
> SFQ) to compute a token and compare two skbs by this token instead of
> custom rxhash or whatever. (rxhash can be the default in absence of flow
> classifier). Probably you need to store the token in skb->cb[] to avoid
> calling tc_classify() several times for a given packet. 
> 
> http://lwn.net/Articles/236200/
> http://kerneltrap.org/mailarchive/linux-netdev/2008/1/31/667679

Probably should split SFQ flow hash stuff into core code for reuse.


> - Must use a FIFO with O(1) access to Nth skb in queue.
>  
>  A linked list makes this implementation too expensive for big queues.
> 
> For small queues (less than 128 skbs at this moment for SFQ), existing
> schedulers are good enough.
> 
> The CHOKe authors don't mention this in their paper, but their experiments
> were done in 1999 with 1 Mb/s links: minth=100, maxth=200, limit=300.
> 
> We want to try CHOKe with modern links, probably with minth=2000 and
> maxth=4000 or more.
> 
> They said "It is arguably more difficult to drop a randomly chosen packet
> since this means removing from a linked-list. Instead of doing this, we
> propose to add one extra bit to the packet header. The bit is set to one
> if the drop candidate is to be dropped. When a packet advances to the
> head of the FIFO buffer, the status of the bit determines whether it is
> to be immediately discarded or transmitted on the outgoing line"
> 
> If they thought removing a buffer from a linked list was expensive
> (!!!), they certainly assumed the previous access to the randomly chosen
> buffer was faster than the skb unlink !
> 
> Using a circular buffer should be enough, with a similar trick to the one
> they suggested: when dropping an skb from the ring, stick a NULL pointer
> in its slot and don't memmove() the window to shrink it.
> 
> struct skb_ring {
> 	unsigned int head;
> 	unsigned int tail;
> 	unsigned int size; /* a power of two */
> 	struct sk_buff **table;
> };
> 
> Doing so avoids the cache misses to adjacent skbs prev/next when
> queue/dequeue is done.

The problem is that large tables of pointers in the kernel require either
contiguous allocation or some indirect table algorithm.

-- 


* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-07  4:55           ` Stephen Hemminger
@ 2011-01-07  5:39             ` Changli Gao
  2011-01-07  7:10               ` Stephen Hemminger
  2011-01-07  8:37             ` Eric Dumazet
  2011-01-10 13:46             ` Eric Dumazet
  2 siblings, 1 reply; 27+ messages in thread
From: Changli Gao @ 2011-01-07  5:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, netdev

On Fri, Jan 7, 2011 at 12:55 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Thu, 06 Jan 2011 05:07:30 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>> On Wed, 05 Jan 2011 at 11:21 -0800, Stephen Hemminger wrote:
>> > This implements the CHOKe packet scheduler based on the existing
>> > Linux RED scheduler based on the algorithm described in the paper.
>> >
>> > The core idea is:
>> >   For every packet arrival:
>> >     Calculate Qave
>> >     if (Qave < minth)
>> >          Queue the new packet
>> >     else
>> >          Select randomly a packet from the queue
>> >          if (both packets from same flow)
>> >          then Drop both the packets
>> >          else if (Qave > maxth)
>> >               Drop packet
>> >          else
>> >               Admit packet with probability p (same as RED)
>> >
>> > See also:
>> >   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
>> >    queue management scheme for approximating fair bandwidth allocation",
>> >   Proceeding of INFOCOM'2000, March 2000.
>> >
>> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>> >
>>
>> To be really useful in a wide range of environments, I believe that :
>>
>> - CHOKe should be able to use an external flow classifier (like say...
>> SFQ) to compute a token and compare two skbs by this token instead of
>> custom rxhash or whatever. (rxhash can be the default in absence of flow
>> classifier). Probably you need to store the token in skb->cb[] to avoid
>> calling tc_classify() several times for a given packet.
>>
>> http://lwn.net/Articles/236200/
>> http://kerneltrap.org/mailarchive/linux-netdev/2008/1/31/667679
>
> Probably should split SFQ flow hash stuff into core code for reuse.
>
>

We need not do that, since we have sch_drr and cls_flow. :)

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-07  5:39             ` Changli Gao
@ 2011-01-07  7:10               ` Stephen Hemminger
  0 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-07  7:10 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David Miller, netdev

On Fri, 7 Jan 2011 13:39:26 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> On Fri, Jan 7, 2011 at 12:55 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > On Thu, 06 Jan 2011 05:07:30 +0100
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >> On Wed, 05 Jan 2011 at 11:21 -0800, Stephen Hemminger wrote:
> >> > This implements the CHOKe packet scheduler based on the existing
> >> > Linux RED scheduler based on the algorithm described in the paper.
> >> >
> >> > The core idea is:
> >> >   For every packet arrival:
> >> >     Calculate Qave
> >> >     if (Qave < minth)
> >> >          Queue the new packet
> >> >     else
> >> >          Select randomly a packet from the queue
> >> >          if (both packets from same flow)
> >> >          then Drop both the packets
> >> >          else if (Qave > maxth)
> >> >               Drop packet
> >> >          else
> >> >               Admit packet with probability p (same as RED)
> >> >
> >> > See also:
> >> >   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
> >> >    queue management scheme for approximating fair bandwidth allocation",
> >> >   Proceeding of INFOCOM'2000, March 2000.
> >> >
> >> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >> >
> >>
> >> To be really useful in a wide range of environments, I believe that :
> >>
> >> - CHOKe should be able to use an external flow classifier (like say...
> >> SFQ) to compute a token and compare two skbs by this token instead of
> >> custom rxhash or whatever. (rxhash can be the default in absence of flow
> >> classifier). Probably you need to store the token in skb->cb[] to avoid
> >> calling tc_classify() several times for a given packet.
> >>
> >> http://lwn.net/Articles/236200/
> >> http://kerneltrap.org/mailarchive/linux-netdev/2008/1/31/667679
> >
> > Probably should split SFQ flow hash stuff into core code for reuse.
> >
> >
> 
> We need not do that, since we have sch_drr and cls_flow. :)

I prefer that the qdisc be usable without any explicit flow classification,
i.e. like SFQ it should fall back to sensible flow matching. DRR and others
put everything in one flow if no filters are used.


* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-07  4:55           ` Stephen Hemminger
  2011-01-07  5:39             ` Changli Gao
@ 2011-01-07  8:37             ` Eric Dumazet
  2011-01-10 13:46             ` Eric Dumazet
  2 siblings, 0 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-07  8:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Thu, 06 Jan 2011 at 20:55 -0800, Stephen Hemminger wrote:
> The problem is that large tables of pointers in kernel require either
> contiguous allocation or some indirect table algorithm.
> 

By large table, how many slots do you envision?

1024?
8192?
65536?

Even an insane value like 65536 is OK most of the time:
if kmalloc(GFP_KERNEL) fails, try vmalloc().

We are in process context and are allowed to sleep when the qdisc is
created.

Anyway, accessing a random skb in a list of 65536 skbs is just crazy.
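
For the table allocation itself, something along these lines should be
enough (sketch only, the helper name is made up; the matching free has to
check is_vmalloc_addr() to pick vfree() or kfree()):

/* allocate the skb pointer table, falling back to vmalloc() for big sizes */
static struct sk_buff **choke_alloc_table(unsigned int slots)
{
	struct sk_buff **tab = kcalloc(slots, sizeof(*tab), GFP_KERNEL);

	if (!tab)
		tab = vzalloc(slots * sizeof(*tab));
	return tab;
}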




* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-07  4:55           ` Stephen Hemminger
  2011-01-07  5:39             ` Changli Gao
  2011-01-07  8:37             ` Eric Dumazet
@ 2011-01-10 13:46             ` Eric Dumazet
  2011-01-10 17:31               ` Stephen Hemminger
  2011-01-10 23:44               ` [RFC] sched: CHOKe packet scheduler (v0.4) Stephen Hemminger
  2 siblings, 2 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-10 13:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Thu, 06 Jan 2011 at 20:55 -0800, Stephen Hemminger wrote:

> 
> The problem is that large tables of pointers in kernel require either
> contiguous allocation or some indirect table algorithm.
> 

Here is a v3 version with an array-based queue for O(1) peek_random
complexity.

Could you send the iproute2 patch so that I can test it?

Thanks!


diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index e69de29..ea9db00 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,388 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not a "classful" qdisc because it
+   needs to access packets in queue randomly.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+struct choke_sched_data {
+	u32		 limit;
+	unsigned char	 flags;
+
+	struct red_parms parms;
+	struct red_stats stats;
+
+	unsigned int	 head;
+	unsigned int	 tail;
+	unsigned int	 holes;
+	unsigned int	 tab_mask; /* size - 1 */
+	struct sk_buff **tab;
+};
+
+static inline unsigned int choke_len(const struct choke_sched_data *q)
+{
+	return (q->tail - q->head) & q->tab_mask;
+}
+
+/* deliver a random number between 0 and N - 1 */
+static inline u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+/* Select a packet at random from the queue in O(1) */
+static struct sk_buff *choke_peek_random(struct choke_sched_data *q, unsigned int *pidx)
+{
+	*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+	return q->tab[*pidx];
+}
+
+
+static inline int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+static inline int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+static inline void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->head] == NULL) {
+		q->head = (q->head + 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+static inline void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[(q->tail - 1) & q->tab_mask] == NULL) {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+static void choke_drop_by_idx(struct choke_sched_data *q, unsigned int idx)
+{
+	q->tab[idx] = NULL;
+	q->holes++;
+	choke_zap_head_holes(q);
+	choke_zap_tail_holes(q);
+}
+
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+
+	p->qavg = red_calc_qavg(p, choke_len(q) - q->holes);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		struct sk_buff *oskb;
+		unsigned int idx;
+
+		/* Draw a packet at random from queue */
+		oskb = choke_peek_random(q, &idx);
+
+		/* Both packets from same flow ? */
+		if (oskb && skb_get_rxhash(oskb) == skb_get_rxhash(skb)) {
+			/* Drop both packets */
+			choke_drop_by_idx(q, idx);
+			qdisc_drop(oskb, sch);
+			goto congestion_drop;
+		}
+
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		}
+
+		if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (likely(choke_len(q) < q->limit)) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		__qdisc_update_bstats(sch, qdisc_pkt_len(skb));
+		return NET_XMIT_SUCCESS;
+	}
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL; /* not really needed */
+	q->head = (q->head + 1) & q->tab_mask;
+	choke_zap_head_holes(q);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+	if (addr) {
+		if (is_vmalloc_addr(addr))
+			vfree(addr);
+		else
+			kfree(addr);
+	}
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	if (mask != q->tab_mask) {
+		struct sk_buff **ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+						GFP_KERNEL);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int tail = 0;
+
+			while (q->head != q->tail) {
+				ntab[tail++] = q->tab[q->head];
+				q->head = (q->head + 1) & q->tab_mask;
+			}
+			q->head = 0;
+			q->tail = tail;
+		}
+		q->tab_mask = mask;
+		q->holes = 0;
+	} else
+		sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_RED_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	choke_free(old);
+	return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	sch->q.qlen = choke_len(q) - q->holes;
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	choke_free(q->tab);
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	qdisc_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-10 13:46             ` Eric Dumazet
@ 2011-01-10 17:31               ` Stephen Hemminger
  2011-01-10 17:45                 ` Eric Dumazet
  2011-01-10 23:44               ` [RFC] sched: CHOKe packet scheduler (v0.4) Stephen Hemminger
  1 sibling, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-10 17:31 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Mon, 10 Jan 2011 14:46:50 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Thursday, 06 January 2011 at 20:55 -0800, Stephen Hemminger wrote:
> 
> > 
> > The problem is that large tables of pointers in kernel require either
> > contiguous allocation or some indirect table algorithm.
> > 
> 
> Here is a v3 version with an array based queue for O(1) peek_random
> complexity.
> 
> Could you send the iproute2 patch so that I can test it ?
> 
> Thanks !
> 
> 
> diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
> index e69de29..ea9db00 100644
> --- a/net/sched/sch_choke.c
> +++ b/net/sched/sch_choke.c
> @@ -0,0 +1,388 @@
> +/*
> + * net/sched/sch_choke.c	CHOKE scheduler
> + *
> + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/skbuff.h>
> +#include <linux/reciprocal_div.h>
> +#include <net/pkt_sched.h>
> +#include <net/inet_ecn.h>
> +#include <net/red.h>
> +
> +/*	CHOKe stateless AQM for fair bandwidth allocation
> +        =================================================
> +
> +   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
> +   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
> +   maintains no flow state. The difference from RED is an additional step
> +   during the enqueuing process. If average queue size is over the
> +   low threshold (qmin), a packet is chosen at random from the queue.
> +   If both the new and chosen packet are from the same flow, both
> +   are dropped. Unlike RED, CHOKe is not a "classful" qdisc because it
> +   needs to access packets in queue randomly.
> +
> +   Source:
> +   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
> +   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
> +   IEEE INFOCOM, 2000.
> +
> +   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
> +   Characteristics", IEEE/ACM Transactions on Networking, 2004
> +
> + */
> +
> +struct choke_sched_data {
> +	u32		 limit;
> +	unsigned char	 flags;
> +
> +	struct red_parms parms;
> +	struct red_stats stats;
> +
> +	unsigned int	 head;
> +	unsigned int	 tail;
> +	unsigned int	 holes;
> +	unsigned int	 tab_mask; /* size - 1 */
> +	struct sk_buff **tab;
> +};
> +
> +static inline unsigned int choke_len(const struct choke_sched_data *q)
> +{
> +	return (q->tail - q->head) & q->tab_mask;
> +}
> +
> +/* deliver a random number between 0 and N - 1 */
> +static inline u32 random_N(unsigned int N)
> +{
> +	return reciprocal_divide(random32(), N);
> +}
> +
> +/* Select a packet at random from the queue in O(1) */
> +static struct sk_buff *choke_peek_random(struct choke_sched_data *q, unsigned int *pidx)
> +{
> +	*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
> +	return q->tab[*pidx];
> +}

I don't think this works right. The choke_peek_random could find a hole.
Either the data structure has to change, or the peek_random has to retry,
or, if the quick peek fails, compress the slots with memmove and retry.
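
For the retry option, a minimal untested sketch (reusing the
choke_len()/random_N() helpers from the patch; the function name and the
retry count are only illustrative) could look like:

static struct sk_buff *choke_peek_random_retry(struct choke_sched_data *q,
					       unsigned int *pidx)
{
	int retries = 3;	/* bounded number of random probes */

	do {
		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
		if (q->tab[*pidx])	/* found a real packet, not a hole */
			return q->tab[*pidx];
	} while (--retries > 0);

	/* every probe hit a hole: fall back to the head slot, which the
	 * existing hole zapping keeps non-NULL while the queue is not empty */
	*pidx = q->head;
	return q->tab[*pidx];
}

A small retry count keeps the common case O(1) while bounding the work
when the table is sparse.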


-- 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.2)
  2011-01-10 17:31               ` Stephen Hemminger
@ 2011-01-10 17:45                 ` Eric Dumazet
  0 siblings, 0 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-10 17:45 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Monday, 10 January 2011 at 09:31 -0800, Stephen Hemminger wrote:
> On Mon, 10 Jan 2011 14:46:50 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > On Thursday, 06 January 2011 at 20:55 -0800, Stephen Hemminger wrote:
> > 
> > > 
> > > The problem is that large tables of pointers in kernel require either
> > > contiguous allocation or some indirect table algorithm.
> > > 
> > 
> > Here is a v3 version with an array based queue for O(1) peek_random
> > complexity.
> > 
> > Could you send the iproute2 patch so that I can test it ?
> > 
> > Thanks !
> > 
> > 
> > diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
> > index e69de29..ea9db00 100644
> > --- a/net/sched/sch_choke.c
> > +++ b/net/sched/sch_choke.c
> > @@ -0,0 +1,388 @@
> > +/*
> > + * net/sched/sch_choke.c	CHOKE scheduler
> > + *
> > + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
> > + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * version 2 as published by the Free Software Foundation.
> > + *
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/skbuff.h>
> > +#include <linux/reciprocal_div.h>
> > +#include <net/pkt_sched.h>
> > +#include <net/inet_ecn.h>
> > +#include <net/red.h>
> > +
> > +/*	CHOKe stateless AQM for fair bandwidth allocation
> > +        =================================================
> > +
> > +   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
> > +   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
> > +   maintains no flow state. The difference from RED is an additional step
> > +   during the enqueuing process. If average queue size is over the
> > +   low threshold (qmin), a packet is chosen at random from the queue.
> > +   If both the new and chosen packet are from the same flow, both
> > +   are dropped. Unlike RED, CHOKe is not a "classful" qdisc because it
> > +   needs to access packets in queue randomly.
> > +
> > +   Source:
> > +   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
> > +   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
> > +   IEEE INFOCOM, 2000.
> > +
> > +   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
> > +   Characteristics", IEEE/ACM Transactions on Networking, 2004
> > +
> > + */
> > +
> > +struct choke_sched_data {
> > +	u32		 limit;
> > +	unsigned char	 flags;
> > +
> > +	struct red_parms parms;
> > +	struct red_stats stats;
> > +
> > +	unsigned int	 head;
> > +	unsigned int	 tail;
> > +	unsigned int	 holes;
> > +	unsigned int	 tab_mask; /* size - 1 */
> > +	struct sk_buff **tab;
> > +};
> > +
> > +static inline unsigned int choke_len(const struct choke_sched_data *q)
> > +{
> > +	return (q->tail - q->head) & q->tab_mask;
> > +}
> > +
> > +/* deliver a random number between 0 and N - 1 */
> > +static inline u32 random_N(unsigned int N)
> > +{
> > +	return reciprocal_divide(random32(), N);
> > +}
> > +
> > +/* Select a packet at random from the queue in O(1) */
> > +static struct sk_buff *choke_peek_random(struct choke_sched_data *q, unsigned int *pidx)
> > +{
> > +	*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
> > +	return q->tab[*pidx];
> > +}
> 
> I don't think this works right. The choke_peek_random could find a hole.
> Either the data structure has to change, or the peek_random has to retry,
> or if quick peek fails then compress the slot with memmove and retry.
> 
> 

Yes, this "compress only if peek_random() finds a hole" approach seems good, I'll
try this.

As the number of holes is known, we could have (rough C sketch after the pseudo-code):

if (holes_proportion_is_less_than_20_percent())
	try another random number X times
else
	compress table to remove holes.
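
Rough, untested C sketch of that idea (the ~20% threshold, the helper name
and the single-pass compaction are all just illustrative, not a real patch):

static void choke_compact_holes(struct choke_sched_data *q)
{
	unsigned int src, dst = q->head;

	/* one pass over the occupied span, squeezing out the NULL slots */
	for (src = q->head; src != q->tail; src = (src + 1) & q->tab_mask) {
		if (q->tab[src]) {
			q->tab[dst] = q->tab[src];
			dst = (dst + 1) & q->tab_mask;
		}
	}
	/* slots between the new and the old tail are no longer in use */
	for (src = dst; src != q->tail; src = (src + 1) & q->tab_mask)
		q->tab[src] = NULL;

	q->tail = dst;
	q->holes = 0;
}

choke_peek_random() would then call this only when holes get plentiful
(say q->holes * 5 >= choke_len(q)) and otherwise just retry a few random
slots, so the expensive pass stays rare.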




^ permalink raw reply	[flat|nested] 27+ messages in thread

* [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-10 13:46             ` Eric Dumazet
  2011-01-10 17:31               ` Stephen Hemminger
@ 2011-01-10 23:44               ` Stephen Hemminger
  2011-01-11  0:00                 ` Eric Dumazet
  1 sibling, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-10 23:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

This implements the CHOKe packet scheduler based on the existing
Linux RED scheduler based on the algorithm described in the paper.

The core idea is:
  For every packet arrival:
  	Calculate Qave
	if (Qave < minth) 
	     Queue the new packet
	else 
	     Select randomly a packet from the queue 
	     if (both packets from same flow)
	     then Drop both the packets
	     else if (Qave > maxth)
	          Drop packet
	     else
	       	  Admit packet with probability p (same as RED)

See also:
  Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
   queue management scheme for approximating fair bandwidth allocation", 
  Proceeding of INFOCOM'2000, March 2000. 

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
0.3 version from Eric uses a table for the queue.
0.4 allows classification with TC filters
    fixes crash when peek_random() finds a hole

 net/sched/Kconfig     |   11 +
 net/sched/Makefile    |    1 
 net/sched/sch_choke.c |  527 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 539 insertions(+)

--- a/net/sched/Kconfig	2011-01-10 09:17:25.328637817 -0800
+++ b/net/sched/Kconfig	2011-01-10 12:11:40.489701227 -0800
@@ -205,6 +205,17 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which tries to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
--- a/net/sched/Makefile	2011-01-10 09:17:25.336639744 -0800
+++ b/net/sched/Makefile	2011-01-10 12:11:40.489701227 -0800
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/net/sched/sch_choke.c	2011-01-10 12:40:32.802282618 -0800
@@ -0,0 +1,527 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+   needs to access packets in queue randomly. It has a minimal class
+   interface to allow overriding the builtin flow classifier with
+   filters.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;
+	unsigned char	 flags;
+
+	struct red_parms parms;
+	struct red_stats stats;
+
+/* Variables */
+	struct tcf_proto *filter_list;
+	unsigned int	 head;
+	unsigned int	 tail;
+	unsigned int	 holes;
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;
+};
+
+static inline unsigned int choke_len(const struct choke_sched_data *q)
+{
+	return (q->tail - q->head) & q->tab_mask;
+}
+
+/* deliver a random number between 0 and N - 1 */
+static inline u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+
+/* Select a packet at random from the queue in O(1) and handle holes */
+static struct sk_buff *choke_peek_random(struct choke_sched_data *q,
+					 unsigned int *pidx)
+{
+	struct sk_buff *skb;
+	int retrys = 3;
+
+	do {
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		skb = q->tab[*pidx];
+		if (skb)
+			return skb;
+	} while (--retrys > 0);
+
+	/* queue has lots of holes; use the head, which is known to exist */
+	return q->tab[*pidx = q->head];
+}
+
+/* Is ECN parameter configured */
+static inline int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static inline int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->head] == NULL) {
+		q->head = (q->head + 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->tail - 1] == NULL) {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct choke_sched_data *q, unsigned int idx)
+{
+	q->tab[idx] = NULL;
+	q->holes++;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+}
+
+/* Classify flow using either:
+   1. pre-existing classification result in skb
+   2. fast internal classification
+   3. use TC filter based classification
+*/
+static inline unsigned int choke_classify(struct sk_buff *skb,
+					  struct Qdisc *sch, int *qerr)
+
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0)
+		return TC_H_MIN(skb->priority);
+
+	if (!q->filter_list)
+		return skb_get_rxhash(skb);
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		return TC_H_MIN(res.classid);
+	}
+
+	return 0;
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	unsigned int hash;
+	int uninitialized_var(ret);
+
+	hash = choke_classify(skb, sch, &ret);
+	if (unlikely(!hash)) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+	/* XXX add hash to qdisc_skb_cb? */
+	*(unsigned int *)(qdisc_skb_cb(skb)->data) = hash;
+
+	/* Compute average queue usage (see RED) */
+	p->qavg = red_calc_qavg(p, choke_len(q) - q->holes);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		struct sk_buff *oskb;
+		unsigned int idx;
+
+		/* Draw a packet at random from queue */
+		oskb = choke_peek_random(q, &idx);
+
+		/* Both packets from same flow ? */
+		if (*(unsigned int *)(qdisc_skb_cb(oskb)->data) == hash) {
+			/* Drop both packets */
+			choke_drop_by_idx(q, idx);
+			qdisc_drop(oskb, sch);
+			goto congestion_drop;
+		}
+
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		}
+
+		if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (likely(choke_len(q) < q->limit)) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		__qdisc_update_bstats(sch, qdisc_pkt_len(skb));
+		return NET_XMIT_SUCCESS;
+	}
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL; /* not really needed */
+	q->head = (q->head + 1) & q->tab_mask;
+	choke_zap_head_holes(q);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+	if (addr) {
+		if (is_vmalloc_addr(addr))
+			vfree(addr);
+		else
+			kfree(addr);
+	}
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	if (mask != q->tab_mask) {
+		struct sk_buff **ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+						GFP_KERNEL);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int tail = 0;
+
+			while (q->head != q->tail) {
+				ntab[tail++] = q->tab[q->head];
+				q->head = (q->head + 1) & q->tab_mask;
+			}
+			q->head = 0;
+			q->tail = tail;
+		}
+		q->tab_mask = mask;
+		q->holes = 0;
+	} else
+		sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_RED_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	choke_free(old);
+	return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	sch->q.qlen = choke_len(q) - q->holes;
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+				u32 classid)
+{
+	return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	if (!arg->stop) {
+		if (arg->fn(sch, 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+	.leaf		=	choke_leaf,
+	.get		=	choke_get,
+	.put		=	choke_put,
+	.tcf_chain	=	choke_find_tcf,
+	.bind_tcf	=	choke_bind,
+	.unbind_tcf	=	choke_put,
+	.dump		=	choke_dump_class,
+	.walk		=	choke_walk,
+};
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	qdisc_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-10 23:44               ` [RFC] sched: CHOKe packet scheduler (v0.4) Stephen Hemminger
@ 2011-01-11  0:00                 ` Eric Dumazet
  2011-01-11  1:10                   ` Stephen Hemminger
  0 siblings, 1 reply; 27+ messages in thread
From: Eric Dumazet @ 2011-01-11  0:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Monday, 10 January 2011 at 15:44 -0800, Stephen Hemminger wrote:
> This implements the CHOKe packet scheduler based on the existing
> Linux RED scheduler based on the algorithm described in the paper.
> 
> The core idea is:
>   For every packet arrival:
>   	Calculate Qave
> 	if (Qave < minth) 
> 	     Queue the new packet
> 	else 
> 	     Select randomly a packet from the queue 
> 	     if (both packets from same flow)
> 	     then Drop both the packets
> 	     else if (Qave > maxth)
> 	          Drop packet
> 	     else
> 	       	  Admit packet with probability p (same as RED)
> 
> See also:
>   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
>    queue management scheme for approximating fair bandwidth allocation", 
>   Proceeding of INFOCOM'2000, March 2000. 
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 

You beat me to it, I found the bug I had in _change()


> +
> +static int choke_change(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *tb[TCA_RED_MAX + 1];
> +	struct tc_red_qopt *ctl;
> +	int err;
> +	struct sk_buff **old = NULL;
> +	unsigned int mask;
> +
> +	if (opt == NULL)
> +		return -EINVAL;
> +
> +	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
> +	if (err < 0)
> +		return err;
> +
> +	if (tb[TCA_RED_PARMS] == NULL ||
> +	    tb[TCA_RED_STAB] == NULL)
> +		return -EINVAL;
> +
> +	ctl = nla_data(tb[TCA_RED_PARMS]);
> +
> +	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
> +	if (mask != q->tab_mask) {
> +		struct sk_buff **ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
> +						GFP_KERNEL);
> +		if (!ntab)
> +			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
> +		if (!ntab)
> +			return -ENOMEM;
> +		sch_tree_lock(sch);
> +		old = q->tab;
> +		if (old) {
> +			unsigned int tail = 0;
> +
> +			while (q->head != q->tail) {
> +				ntab[tail++] = q->tab[q->head];
> +				q->head = (q->head + 1) & q->tab_mask;
> +			}
> +			q->head = 0;
> +			q->tail = tail;
> +		}
> +		q->tab_mask = mask;

Here we missed :

		q->tab = ntab;

> +		q->holes = 0;
> +	} else
> +		sch_tree_lock(sch);
> +	q->flags = ctl->flags;
> +	q->limit = ctl->limit;
> +
> +	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
> +		      ctl->Plog, ctl->Scell_log,
> +		      nla_data(tb[TCA_RED_STAB]));
> +
> +	if (q->head == q->tail)
> +		red_end_of_idle_period(&q->parms);
> +
> +	sch_tree_unlock(sch);
> +	choke_free(old);
> +	return 0;
> +}
> +



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-11  0:00                 ` Eric Dumazet
@ 2011-01-11  1:10                   ` Stephen Hemminger
  2011-01-11  6:18                     ` Eric Dumazet
  0 siblings, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-11  1:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

OK, put that in.
> 
> Here we missed :
> 
> 		q->tab = ntab;
> 



-- 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-11  1:10                   ` Stephen Hemminger
@ 2011-01-11  6:18                     ` Eric Dumazet
  2011-01-11  6:34                       ` Eric Dumazet
  0 siblings, 1 reply; 27+ messages in thread
From: Eric Dumazet @ 2011-01-11  6:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Monday, 10 January 2011 at 17:10 -0800, Stephen Hemminger wrote:
> OK, put that in.

Thanks !

> > 
> > Here we missed :
> > 
> > 		q->tab = ntab;
> > 

We also need to change 

.peek       =   qdisc_peek_head,

to

.peek = choke_peek_head,

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-11  6:18                     ` Eric Dumazet
@ 2011-01-11  6:34                       ` Eric Dumazet
  2011-01-11 23:48                         ` Stephen Hemminger
  2011-01-12  7:13                         ` [RFC] sched: CHOKe packet scheduler (v0.6) Eric Dumazet
  0 siblings, 2 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-11  6:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Tuesday, 11 January 2011 at 07:18 +0100, Eric Dumazet wrote:
> On Monday, 10 January 2011 at 17:10 -0800, Stephen Hemminger wrote:
> > OK, put that in.
> 
> Thanks !
> 
> > > 
> > > Here we missed :
> > > 
> > > 		q->tab = ntab;
> > > 
> 
> We also need to change 
> 
> .peek       =   qdisc_peek_head,
> 
> to
> 
> .peek = choke_peek_head,
> 
> static struct sk_buff *choke_peek_head(struct Qdisc *sch)
> {
> 	struct choke_sched_data *q = qdisc_priv(sch);
> 
> 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
> }
> 
> 


And to work correctly with CBQ (at least...), we need to update
sch->q.qlen = choke_len(q) - q->holes;
in dequeue() and enqueue()

Here is the version I successfully tested, with 30000 packets in
queue :)

qdisc choke 11: parent 1:11 limit 70000b min 10000b max 30000b ewma 1 Plog 16 Scell_log 11
 Sent 62099201 bytes 112920 pkt (dropped 367712, overlimits 282668 requeues 0) 
 rate 21344Kbit 4851pps backlog 39877589b 30001p requeues 0 
  marked 0 early 282668 pdrop 0 other 0

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a36270a..e63ae56 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -205,6 +205,17 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which tries to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5db..894fa3f 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index e69de29..b565f2a 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,536 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+   needs to access packets in queue randomly. It has a minimal class
+   interface to allow overriding the builtin flow classifier with
+   filters.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;
+	unsigned char	 flags;
+
+	struct red_parms parms;
+	struct red_stats stats;
+
+/* Variables */
+	struct tcf_proto *filter_list;
+	unsigned int	 head;
+	unsigned int	 tail;
+	unsigned int	 holes;
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;
+};
+
+static inline unsigned int choke_len(const struct choke_sched_data *q)
+{
+	return (q->tail - q->head) & q->tab_mask;
+}
+
+/* deliver a random number between 0 and N - 1 */
+static inline u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+
+/* Select a packet at random from the queue in O(1) and handle holes */
+static struct sk_buff *choke_peek_random(struct choke_sched_data *q,
+					 unsigned int *pidx)
+{
+	struct sk_buff *skb;
+	int retrys = 3;
+
+	do {
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		skb = q->tab[*pidx];
+		if (skb)
+			return skb;
+	} while (--retrys > 0);
+
+	/* queue has lots of holes; use the head, which is known to exist */
+	return q->tab[*pidx = q->head];
+}
+
+/* Is ECN parameter configured */
+static inline int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static inline int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->head] == NULL) {
+		q->head = (q->head + 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->tail - 1] == NULL) {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct choke_sched_data *q, unsigned int idx)
+{
+	q->tab[idx] = NULL;
+	q->holes++;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+}
+
+/* Classify flow using either:
+   1. pre-existing classification result in skb
+   2. fast internal classification
+   3. use TC filter based classification
+*/
+static inline unsigned int choke_classify(struct sk_buff *skb,
+					  struct Qdisc *sch, int *qerr)
+
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0)
+		return TC_H_MIN(skb->priority);
+
+	if (!q->filter_list)
+		return skb_get_rxhash(skb);
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		return TC_H_MIN(res.classid);
+	}
+
+	return 0;
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	unsigned int hash;
+	int uninitialized_var(ret);
+
+	hash = choke_classify(skb, sch, &ret);
+	if (unlikely(!hash)) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+	/* XXX add hash to qdisc_skb_cb? */
+	*(unsigned int *)(qdisc_skb_cb(skb)->data) = hash;
+
+	/* Compute average queue usage (see RED) */
+	p->qavg = red_calc_qavg(p, choke_len(q) - q->holes);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		struct sk_buff *oskb;
+		unsigned int idx;
+
+		/* Draw a packet at random from queue */
+		oskb = choke_peek_random(q, &idx);
+
+		/* Both packets from same flow ? */
+		if (*(unsigned int *)(qdisc_skb_cb(oskb)->data) == hash) {
+			/* Drop both packets */
+			choke_drop_by_idx(q, idx);
+			qdisc_drop(oskb, sch);
+			goto congestion_drop;
+		}
+
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		}
+
+		if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (likely(choke_len(q) < q->limit)) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		__qdisc_update_bstats(sch, qdisc_pkt_len(skb));
+		sch->q.qlen = choke_len(q) - q->holes;
+		return NET_XMIT_SUCCESS;
+	}
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL; /* not really needed */
+	q->head = (q->head + 1) & q->tab_mask;
+	choke_zap_head_holes(q);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	sch->q.qlen = choke_len(q) - q->holes;
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+	if (addr) {
+		if (is_vmalloc_addr(addr))
+			vfree(addr);
+		else
+			kfree(addr);
+	}
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	if (mask != q->tab_mask) {
+		struct sk_buff **ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+						GFP_KERNEL);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int tail = 0;
+
+			while (q->head != q->tail) {
+				ntab[tail++] = q->tab[q->head];
+				q->head = (q->head + 1) & q->tab_mask;
+			}
+			q->head = 0;
+			q->tail = tail;
+		}
+		q->tab_mask = mask;
+		q->tab = ntab;
+		q->holes = 0;
+	} else
+		sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_RED_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	choke_free(old);
+	return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+				u32 classid)
+{
+	return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	if (!arg->stop) {
+		if (arg->fn(sch, 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+	.leaf		=	choke_leaf,
+	.get		=	choke_get,
+	.put		=	choke_put,
+	.tcf_chain	=	choke_find_tcf,
+	.bind_tcf	=	choke_bind,
+	.unbind_tcf	=	choke_put,
+	.dump		=	choke_dump_class,
+	.walk		=	choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	choke_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-11  6:34                       ` Eric Dumazet
@ 2011-01-11 23:48                         ` Stephen Hemminger
  2011-01-12  0:04                           ` Eric Dumazet
  2011-01-12  7:13                         ` [RFC] sched: CHOKe packet scheduler (v0.6) Eric Dumazet
  1 sibling, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-11 23:48 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Tue, 11 Jan 2011 07:34:10 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Tuesday, 11 January 2011 at 07:18 +0100, Eric Dumazet wrote:
> > On Monday, 10 January 2011 at 17:10 -0800, Stephen Hemminger wrote:
> > > OK, put that in.
> > 
> > Thanks !
> > 
> > > > 
> > > > Here we missed :
> > > > 
> > > > 		q->tab = ntab;
> > > > 
> > 
> > We also need to change 
> > 
> > .peek       =   qdisc_peek_head,
> > 
> > to
> > 
> > .peek = choke_peek_head,
> > 
> > static struct sk_buff *choke_peek_head(struct Qdisc *sch)
> > {
> > 	struct choke_sched_data *q = qdisc_priv(sch);
> > 
> > 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
> > }
> > 
> > 
> 
> 
> And to correctly work with CBQ (at least...), we need to update 
> sch->q.qlen = choke_len(q) - q->holes;
> in dequeue() and enqueue()
> 
> Here is the version I successfully tested, with 30000 packets in
> queue :)
> 
> qdisc choke 11: parent 1:11 limit 70000b min 10000b max 30000b ewma 1 Plog 16 Scell_log 11
>  Sent 62099201 bytes 112920 pkt (dropped 367712, overlimits 282668 requeues 0) 
>  rate 21344Kbit 4851pps backlog 39877589b 30001p requeues 0 
>   marked 0 early 282668 pdrop 0 other 0

Maybe we should take over one of the red counters for the probabilistic match drop.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.4)
  2011-01-11 23:48                         ` Stephen Hemminger
@ 2011-01-12  0:04                           ` Eric Dumazet
  0 siblings, 0 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-12  0:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Tuesday, 11 January 2011 at 15:48 -0800, Stephen Hemminger wrote:
> On Tue, 11 Jan 2011 07:34:10 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:

> > qdisc choke 11: parent 1:11 limit 70000b min 10000b max 30000b ewma 1 Plog 16 Scell_log 11
> >  Sent 62099201 bytes 112920 pkt (dropped 367712, overlimits 282668 requeues 0) 
> >  rate 21344Kbit 4851pps backlog 39877589b 30001p requeues 0 
> >   marked 0 early 282668 pdrop 0 other 0
> 
> Maybe we should take over one of the red counters for the probabilistic match drop.
> 

aka chokedrop ;)

Anyway, iproute2 should be tweaked a bit, since limit/min/max are in
packets, not bytes as in RED. It would be nice to print the probability
too ;)

AFAIK, the requeue counter is no longer used (on leaves at least), so we
could re-use it for chokedrop

----
qdisc choke 11: parent 1:11 limit 70000p min 10000p max 30000p proba 0.08
  Sent 62099201 bytes 112920 pkt (dropped 367712, overlimits 282668 chokedrop 999) 
  rate 21344Kbit 4851pps backlog 39877589b 30001p chokedrop 999
  marked 0 early 282668 pdrop 0 other 0
 



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [RFC] sched: CHOKe packet scheduler (v0.6)
  2011-01-11  6:34                       ` Eric Dumazet
  2011-01-11 23:48                         ` Stephen Hemminger
@ 2011-01-12  7:13                         ` Eric Dumazet
  2011-01-12 17:27                           ` Stephen Hemminger
  1 sibling, 1 reply; 27+ messages in thread
From: Eric Dumazet @ 2011-01-12  7:13 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hi Stephen, here is my v0.6 version :

- Added sanity checks before kcalloc()/kzalloc()
- Added a __GFP_NOWARN to kcalloc()
- Added call to qdisc_bstats_update() after commit bfe0d0298f2a67d94d5
(net_sched: factorize qdisc stats handling)

TODO :
- Add a CHOKe-specific stat to track probabilistic dual-drops;
  I temporarily use the requeues counter to make sure our code works.
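
A dedicated xstats layout for that could eventually look something like
this (name and field order are only a sketch, nothing is decided yet):

struct tc_choke_xstats {
	__u32	early;		/* RED-style early drops */
	__u32	pdrop;		/* drops because the hard queue limit was hit */
	__u32	other;		/* drops from ->drop() and other causes */
	__u32	marked;		/* packets ECN-marked instead of dropped */
	__u32	matched;	/* dual-drops where the random peek hit the same flow */
};

That would free qstats.requeues again and let tc print the dual-drop count
directly.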

qdisc choke 11: parent 1:11 limit 70000b min 10000b max 30000b ewma 1 Plog 16 Scell_log 11
 Sent 236155665 bytes 429432 pkt (dropped 1644435, overlimits 1251933 requeues 196251) 
 rate 38800Kbit 8820pps backlog 124438905b 30001p requeues 196251 
  marked 0 early 1251933 pdrop 0 other 0


Thanks !

diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index e69de29..1c62292 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,540 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+   needs to access packets in queue randomly. It has a minimal class
+   interface to allow overriding the builtin flow classifier with
+   filters.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;
+	unsigned char	 flags;
+
+	struct red_parms parms;
+	struct red_stats stats;
+
+/* Variables */
+	struct tcf_proto *filter_list;
+	unsigned int	 head;
+	unsigned int	 tail;
+	unsigned int	 holes;
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;
+};
+
+static inline unsigned int choke_len(const struct choke_sched_data *q)
+{
+	return (q->tail - q->head) & q->tab_mask;
+}
+
+/* deliver a random number between 0 and N - 1 */
+static inline u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+
+/* Select a packet at random from the queue in O(1) and handle holes */
+static struct sk_buff *choke_peek_random(struct choke_sched_data *q,
+					 unsigned int *pidx)
+{
+	struct sk_buff *skb;
+	int retrys = 3;
+
+	do {
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		skb = q->tab[*pidx];
+		if (skb)
+			return skb;
+	} while (--retrys > 0);
+
+	/* queue has lots of holes; use the head, which is known to exist */
+	return q->tab[*pidx = q->head];
+}
+
+/* Is ECN parameter configured */
+static inline int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static inline int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->head] == NULL) {
+		q->head = (q->head + 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	while (q->holes && q->tab[q->tail - 1] == NULL) {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		q->holes--;
+	}
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct choke_sched_data *q, unsigned int idx)
+{
+	q->tab[idx] = NULL;
+	q->holes++;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+}
+
+/* Classify flow using either:
+ *  1. pre-existing classification result in the skb
+ *  2. fast internal classification (rxhash)
+ *  3. TC filter based classification
+ */
+static inline unsigned int choke_classify(struct sk_buff *skb,
+					  struct Qdisc *sch, int *qerr)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0)
+		return TC_H_MIN(skb->priority);
+
+	if (!q->filter_list)
+		return skb_get_rxhash(skb);
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
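+			/* fall through */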
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		return TC_H_MIN(res.classid);
+	}
+
+	return 0;
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	unsigned int hash;
+	int uninitialized_var(ret);
+
+	hash = choke_classify(skb, sch, &ret);
+	if (unlikely(!hash)) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+	/* XXX add hash to qdisc_skb_cb? */
+	*(unsigned int *)(qdisc_skb_cb(skb)->data) = hash;
+
+	/* Compute average queue usage (see RED); holes do not count as backlog */
+	p->qavg = red_calc_qavg(p, choke_len(q) - q->holes);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		struct sk_buff *oskb;
+		unsigned int idx;
+
+		/* Draw a packet at random from the queue. The average can
+		 * stay above qth_min for a while after the backlog drains,
+		 * so skip the CHOKe comparison if the queue is empty.
+		 */
+		if (q->head != q->tail) {
+			oskb = choke_peek_random(q, &idx);
+
+			/* Both packets from the same flow? Drop them both. */
+			if (*(unsigned int *)(qdisc_skb_cb(oskb)->data) == hash) {
+				choke_drop_by_idx(q, idx);
+				qdisc_drop(oskb, sch);
+				/* XXX requeues temporarily counts dual drops */
+				sch->qstats.requeues++;
+				goto congestion_drop;
+			}
+		}
+
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		}
+
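+		/* RED's count-based probabilistic mark/drop, driven by the
+		 * number of packets since the last mark (same logic as sch_red)
+		 */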
+		if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (likely(choke_len(q) < q->limit)) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		qdisc_bstats_update(sch, skb);
+		sch->q.qlen = choke_len(q) - q->holes;
+		return NET_XMIT_SUCCESS;
+	}
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL; /* not really needed */
+	q->head = (q->head + 1) & q->tab_mask;
+	choke_zap_head_holes(q);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	sch->q.qlen = choke_len(q) - q->holes;
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
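+/* CHOKe is configured with RED's netlink attributes (TCA_RED_PARMS and
+ * TCA_RED_STAB), so the configuration format is identical to RED's.
+ */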
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+static void choke_free(void *addr)
+{
+	if (addr) {
+		if (is_vmalloc_addr(addr))
+			vfree(addr);
+		else
+			kfree(addr);
+	}
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	/* limit the flat packet pointer table to one megabyte */
+	if (mask + 1 > (1U << 20) / sizeof(struct sk_buff *))
+		return -EINVAL;
+	if (mask != q->tab_mask) {
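+		/* Try a kmalloc'ed array first (failure warning suppressed),
+		 * falling back to vmalloc for large tables.
+		 */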
+		struct sk_buff **ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+						GFP_KERNEL | __GFP_NOWARN);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int tail = 0;
+
+			while (q->head != q->tail) {
+				struct sk_buff *skb = q->tab[q->head];
+				q->head = (q->head + 1) & q->tab_mask;
+				if (!skb)		/* skip holes */
+					continue;
+				if (tail < mask) {
+					ntab[tail++] = skb;
+					continue;
+				}
+				/* new table too small: drop the excess */
+				sch->qstats.backlog -= qdisc_pkt_len(skb);
+				sch->q.qlen--;
+				qdisc_drop(skb, sch);
+			}
+			q->head = 0;
+			q->tail = tail;
+		}
+		q->tab_mask = mask;
+		q->tab = ntab;
+		q->holes = 0;
+	} else
+		sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_RED_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	choke_free(old);
+	return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+				u32 classid)
+{
+	return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	if (!arg->stop) {
+		if (arg->fn(sch, 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+	.leaf		=	choke_leaf,
+	.get		=	choke_get,
+	.put		=	choke_put,
+	.tcf_chain	=	choke_find_tcf,
+	.bind_tcf	=	choke_bind,
+	.unbind_tcf	=	choke_put,
+	.dump		=	choke_dump_class,
+	.walk		=	choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	choke_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.6)
  2011-01-12  7:13                         ` [RFC] sched: CHOKe packet scheduler (v0.6) Eric Dumazet
@ 2011-01-12 17:27                           ` Stephen Hemminger
  2011-01-12 17:33                             ` Eric Dumazet
  0 siblings, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2011-01-12 17:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Wed, 12 Jan 2011 08:13:48 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Hi Stephen, here is my v0.6 version :
> 
> - Added sanity checks before kcalloc()/kzalloc()
> - Added a __GFP_NOWARN to kcalloc()
> - Added call to qdisc_bstats_update() after commit bfe0d0298f2a67d94d5
> (net_sched: factorize qdisc stats handling)
> 
> TODO :
> - Added a stat specific update to track CHOKe probabilistic dual-drops
>   I temporarily use requeues counter to make sure our code works

I am going to redo stats and config.
Should thresholds be packet or byte based? I prefer packet-based.
Also leaning towards merging qmax and qlimit together.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] sched: CHOKe packet scheduler (v0.6)
  2011-01-12 17:27                           ` Stephen Hemminger
@ 2011-01-12 17:33                             ` Eric Dumazet
  0 siblings, 0 replies; 27+ messages in thread
From: Eric Dumazet @ 2011-01-12 17:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Le mercredi 12 janvier 2011 à 09:27 -0800, Stephen Hemminger a écrit :
> On Wed, 12 Jan 2011 08:13:48 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > Hi Stephen, here is my v0.6 version :
> > 
> > - Added sanity checks before kcalloc()/kzalloc()
> > - Added a __GFP_NOWARN to kcalloc()
> > - Added call to qdisc_bstats_update() after commit bfe0d0298f2a67d94d5
> > (net_sched: factorize qdisc stats handling)
> > 
> > TODO :
> > - Added a stat specific update to track CHOKe probabilistic dual-drops
> >   I temporarily use requeues counter to make sure our code works
> 
> I am going to redo stats and config.
> Should thresholds be packet or byte based? I prefer packet-based.
> Also leaning towards merging qmax and qlimit together.

I believe the CHOKe spirit is per packet; I agree with you.

(This makes the tab[] array sizing depend directly on ctl->limit, not
on a computation like ctl->limit/smallest_packet_size.)

Not sure we can merge qmax and qlimit; are you sure it's OK with the CHOKe
paper and experimental results?
(Sorry, I can't spend too much time right now to check this point.)

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2011-01-12 17:33 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-05  0:29 [RFC] sched: CHOKe packet scheduler Stephen Hemminger
2011-01-05  6:02 ` Eric Dumazet
2011-01-05  6:19 ` Eric Dumazet
2011-01-05 17:17   ` Stephen Hemminger
2011-01-05 17:25     ` Eric Dumazet
2011-01-05 19:21       ` [RFC] sched: CHOKe packet scheduler (v0.2) Stephen Hemminger
2011-01-05 20:06         ` Eric Dumazet
2011-01-05 20:15           ` Stephen Hemminger
2011-01-06  4:07         ` Eric Dumazet
2011-01-06  6:53           ` Stephen Hemminger
2011-01-07  4:55           ` Stephen Hemminger
2011-01-07  5:39             ` Changli Gao
2011-01-07  7:10               ` Stephen Hemminger
2011-01-07  8:37             ` Eric Dumazet
2011-01-10 13:46             ` Eric Dumazet
2011-01-10 17:31               ` Stephen Hemminger
2011-01-10 17:45                 ` Eric Dumazet
2011-01-10 23:44               ` [RFC] sched: CHOKe packet scheduler (v0.4) Stephen Hemminger
2011-01-11  0:00                 ` Eric Dumazet
2011-01-11  1:10                   ` Stephen Hemminger
2011-01-11  6:18                     ` Eric Dumazet
2011-01-11  6:34                       ` Eric Dumazet
2011-01-11 23:48                         ` Stephen Hemminger
2011-01-12  0:04                           ` Eric Dumazet
2011-01-12  7:13                         ` [RFC] sched: CHOKe packet scheduler (v0.6) Eric Dumazet
2011-01-12 17:27                           ` Stephen Hemminger
2011-01-12 17:33                             ` Eric Dumazet
