* [PATCH net-next-2.6] net_sched: sfq: allow divisor to be a variable
@ 2011-01-20  9:50 Eric Dumazet
  2011-01-20 10:17 ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2011-01-20  9:50 UTC
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski, jamal

SFQ currently uses a 1024-slot hash table, and its internal structure
(sfq_sched_data) allocation needs an order-1 page on x86_64.

Allow the tc command to specify a divisor value (hash table size)
between 1 and 65536.
If no value is provided, keep the default size of 1024.

This allows admins to set up smaller (or bigger) SFQ hash tables for
specific needs.

This also brings sfq_sched_data allocations back to order-0, saving 3KB
per SFQ qdisc.

Jesper uses ~55,000 SFQ qdiscs on one machine; this patch should free
165 MB of memory.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Jesper Dangaard Brouer <hawk@diku.dk>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Jamal Hadi Salim <hadi@cyberus.ca>
---
 net/sched/sch_sfq.c |   43 ++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)
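
For reference, a back-of-the-envelope check of the numbers above (a
rough estimate, assuming 4KB pages and the one-byte sfq_index type):

  before: sfq_sched_data with its embedded 1024-entry (1KB) ht[]
          exceeds 4KB -> order-1 (8KB) allocation
  after:  order-0 (4KB) allocation + separate 1KB hash table = 5KB
  saved:  3KB per qdisc, so ~55,000 qdiscs * 3KB ~= 165 MB

Note the divisor must remain a power of two: sfq_fold_hash() masks with
"& (q->divisor - 1)", which only acts as a modulo for power-of-2 sizes,
hence the is_power_of_2() check in sfq_change().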

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3fe5c45..ad7fa8b 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -21,6 +21,7 @@
 #include <linux/skbuff.h>
 #include <linux/jhash.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/ip.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -76,7 +77,8 @@
 #define SFQ_DEPTH		128 /* max number of packets per flow */
 #define SFQ_SLOTS		128 /* max number of flows */
 #define SFQ_EMPTY_SLOT		255
-#define SFQ_HASH_DIVISOR	1024
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
 /* We use 16 bits to store allot, and want to handle packets up to 64K
  * Scale allot by 8 (1<<3) so that no overflow occurs.
  */
@@ -112,7 +114,7 @@ struct sfq_sched_data {
 	int		perturb_period;
 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
 	int		limit;
-
+	unsigned int	divisor;	/* number of slots in hash table */
 /* Variables */
 	struct tcf_proto *filter_list;
 	struct timer_list perturb_timer;
@@ -120,7 +122,7 @@ struct sfq_sched_data {
 	sfq_index	cur_depth;	/* depth of longest slot */
 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 	struct sfq_slot *tail;		/* current slot in round */
-	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
+	sfq_index	*ht;		/* Hash table (divisor slots) */
 	struct sfq_slot	slots[SFQ_SLOTS];
 	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
 };
@@ -137,7 +139,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
 
 static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
 {
-	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+	return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
 }
 
 static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
@@ -201,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	if (TC_H_MAJ(skb->priority) == sch->handle &&
 	    TC_H_MIN(skb->priority) > 0 &&
-	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+	    TC_H_MIN(skb->priority) <= q->divisor)
 		return TC_H_MIN(skb->priority);
 
 	if (!q->filter_list)
@@ -219,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 			return 0;
 		}
 #endif
-		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+		if (TC_H_MIN(res.classid) <= q->divisor)
 			return TC_H_MIN(res.classid);
 	}
 	return 0;
@@ -496,7 +498,12 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	q->perturb_period = ctl->perturb_period * HZ;
 	if (ctl->limit)
 		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
-
+	if (ctl->divisor) {
+		if (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)
+			return -EINVAL;
+		q->divisor = ctl->divisor;
+	}
 	qlen = sch->q.qlen;
 	while (sch->q.qlen > q->limit)
 		sfq_drop(sch);
@@ -514,15 +521,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
+	size_t sz;
 	int i;
 
 	q->perturb_timer.function = sfq_perturbation;
 	q->perturb_timer.data = (unsigned long)sch;
 	init_timer_deferrable(&q->perturb_timer);
 
-	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
-		q->ht[i] = SFQ_EMPTY_SLOT;
-
 	for (i = 0; i < SFQ_DEPTH; i++) {
 		q->dep[i].next = i + SFQ_SLOTS;
 		q->dep[i].prev = i + SFQ_SLOTS;
@@ -531,6 +536,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->limit = SFQ_DEPTH - 1;
 	q->cur_depth = 0;
 	q->tail = NULL;
+	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 	if (opt == NULL) {
 		q->quantum = psched_mtu(qdisc_dev(sch));
 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
@@ -542,6 +548,15 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 			return err;
 	}
 
+	sz = sizeof(q->ht[0]) * q->divisor;
+	q->ht = kmalloc(sz, GFP_KERNEL);
+	if (!q->ht && sz > PAGE_SIZE)
+		q->ht = vmalloc(sz);
+	if (!q->ht)
+		return -ENOMEM;
+	for (i = 0; i < q->divisor; i++)
+		q->ht[i] = SFQ_EMPTY_SLOT;
+
 	for (i = 0; i < SFQ_SLOTS; i++) {
 		slot_queue_init(&q->slots[i]);
 		sfq_link(q, i);
@@ -556,6 +571,10 @@ static void sfq_destroy(struct Qdisc *sch)
 	tcf_destroy_chain(&q->filter_list);
 	q->perturb_period = 0;
 	del_timer_sync(&q->perturb_timer);
+	if (is_vmalloc_addr(q->ht))
+		vfree(q->ht);
+	else
+		kfree(q->ht);
 }
 
 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -568,7 +587,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.perturb_period = q->perturb_period / HZ;
 
 	opt.limit = q->limit;
-	opt.divisor = SFQ_HASH_DIVISOR;
+	opt.divisor = q->divisor;
 	opt.flows = q->limit;
 
 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -646,7 +665,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	if (arg->stop)
 		return;
 
-	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
+	for (i = 0; i < q->divisor; i++) {
 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
 		    arg->count < arg->skip) {
 			arg->count++;




* Re: [PATCH net-next-2.6] net_sched: sfq: allow divisor to be a variable
  2011-01-20  9:50 [PATCH net-next-2.6] net_sched: sfq: allow divisor to be a variable Eric Dumazet
@ 2011-01-20 10:17 ` Eric Dumazet
  2011-01-20 13:48   ` [PATCH net-next-2.6] net_sched: RCU conversion of stab Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2011-01-20 10:17 UTC
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski, jamal

On Thursday 20 January 2011 at 10:50 +0100, Eric Dumazet wrote:
> SFQ currently uses a 1024-slot hash table, and its internal structure
> (sfq_sched_data) allocation needs an order-1 page on x86_64.
> 
> Allow the tc command to specify a divisor value (hash table size)
> between 1 and 65536.
> If no value is provided, keep the default size of 1024.
> 
> This allows admins to set up smaller (or bigger) SFQ hash tables for
> specific needs.
> 
> This also brings sfq_sched_data allocations back to order-0, saving 3KB
> per SFQ qdisc.
> 
> Jesper uses ~55,000 SFQ qdiscs on one machine; this patch should free
> 165 MB of memory.

Arg, sorry for the double post, I had a tg3 failure on my machine.

Please disregard this copy.




* [PATCH net-next-2.6] net_sched: RCU conversion of stab
  2011-01-20 10:17 ` Eric Dumazet
@ 2011-01-20 13:48   ` Eric Dumazet
  2011-01-20 15:01     ` Patrick McHardy
                       ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Eric Dumazet @ 2011-01-20 13:48 UTC
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski, jamal

This patch converts stab qdisc management to RCU, so that we can perform
the qdisc_calculate_pkt_len() call before taking the qdisc lock.

This shortens the time the lock is held in __dev_xmit_skb().

This permits more qdiscs to get TCQ_F_CAN_BYPASS status, avoiding a lot
of cache misses and thus reducing latencies.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Jesper Dangaard Brouer <hawk@diku.dk>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Jamal Hadi Salim <hadi@cyberus.ca>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
 include/net/sch_generic.h |   21 +++++++++++++++------
 net/core/dev.c            |    8 +++++---
 net/sched/sch_api.c       |   26 +++++++++++++++++---------
 net/sched/sch_generic.c   |    2 +-
 4 files changed, 38 insertions(+), 19 deletions(-)
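
For readers less familiar with the idiom, here is a condensed sketch of
the reader/updater pattern the diff below applies to sch->stab (the
function names are illustrative, not real kernel symbols; refcounting
and error paths are elided):

/* reader: runs in softirq (BH) context, no qdisc lock needed */
static inline void calc_pkt_len_sketch(struct sk_buff *skb,
				       const struct Qdisc *sch)
{
	struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab);

	if (stab)
		__qdisc_calculate_pkt_len(skb, stab);
}

/* updater: runs under RTNL; publish the new table, then free the old
 * one only after a BH grace period, so in-flight readers stay safe
 */
static void replace_stab_sketch(struct Qdisc *sch,
				struct qdisc_size_table *new)
{
	struct qdisc_size_table *old = rtnl_dereference(sch->stab);

	rcu_assign_pointer(sch->stab, new);
	if (old)
		call_rcu_bh(&old->rcu, stab_kfree_rcu);
}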

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e9eee99..d67cc34 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -35,6 +35,7 @@ enum qdisc___state_t {
 };
 
 struct qdisc_size_table {
+	struct rcu_head		rcu;
 	struct list_head	list;
 	struct tc_sizespec	szopts;
 	int			refcnt;
@@ -53,7 +54,7 @@ struct Qdisc {
 #define TCQ_F_WARN_NONWC	(1 << 16)
 	int			padded;
 	struct Qdisc_ops	*ops;
-	struct qdisc_size_table	*stab;
+	struct qdisc_size_table	__rcu *stab;
 	struct list_head	list;
 	u32			handle;
 	u32			parent;
@@ -331,8 +332,8 @@ extern struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 				 struct Qdisc_ops *ops);
 extern struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 				       struct Qdisc_ops *ops, u32 parentid);
-extern void qdisc_calculate_pkt_len(struct sk_buff *skb,
-				   struct qdisc_size_table *stab);
+extern void __qdisc_calculate_pkt_len(struct sk_buff *skb,
+				      const struct qdisc_size_table *stab);
 extern void tcf_destroy(struct tcf_proto *tp);
 extern void tcf_destroy_chain(struct tcf_proto **fl);
 
@@ -411,12 +412,20 @@ enum net_xmit_qdisc_t {
 #define net_xmit_drop_count(e)	(1)
 #endif
 
-static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
+					   const struct Qdisc *sch)
 {
 #ifdef CONFIG_NET_SCHED
-	if (sch->stab)
-		qdisc_calculate_pkt_len(skb, sch->stab);
+	struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab);
+
+	if (stab)
+		__qdisc_calculate_pkt_len(skb, stab);
 #endif
+}
+
+static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	qdisc_calculate_pkt_len(skb, sch);
 	return sch->enqueue(skb, sch);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 8b1d886..f27882e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2325,15 +2325,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct netdev_queue *txq)
 {
 	spinlock_t *root_lock = qdisc_lock(q);
-	bool contended = qdisc_is_running(q);
+	bool contended;
 	int rc;
 
+	qdisc_skb_cb(skb)->pkt_len = skb->len;
+	qdisc_calculate_pkt_len(skb, q);
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
 	 * and dequeue packets faster.
 	 */
+	contended = qdisc_is_running(q);
 	if (unlikely(contended))
 		spin_lock(&q->busylock);
 
@@ -2351,7 +2354,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
 
-		qdisc_skb_cb(skb)->pkt_len = skb->len;
 		qdisc_bstats_update(q, skb);
 
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
@@ -2366,7 +2368,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		skb_dst_force(skb);
-		rc = qdisc_enqueue_root(skb, q);
+		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 36ac0ec..7043fd9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -398,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 	return stab;
 }
 
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
 void qdisc_put_stab(struct qdisc_size_table *tab)
 {
 	if (!tab)
@@ -407,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
 
 	if (--tab->refcnt == 0) {
 		list_del(&tab->list);
-		kfree(tab);
+		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 	}
 
 	spin_unlock(&qdisc_stab_lock);
@@ -430,7 +435,7 @@ nla_put_failure:
 	return -1;
 }
 
-void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 {
 	int pkt_len, slot;
 
@@ -456,7 +461,7 @@ out:
 		pkt_len = 1;
 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
 }
-EXPORT_SYMBOL(qdisc_calculate_pkt_len);
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 
 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 {
@@ -835,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 				err = PTR_ERR(stab);
 				goto err_out4;
 			}
-			sch->stab = stab;
+			rcu_assign_pointer(sch->stab, stab);
 		}
 		if (tca[TCA_RATE]) {
 			spinlock_t *root_lock;
@@ -875,7 +880,7 @@ err_out4:
 	 * Any broken qdiscs that would require a ops->reset() here?
 	 * The qdisc was never in action so it shouldn't be necessary.
 	 */
-	qdisc_put_stab(sch->stab);
+	qdisc_put_stab(rtnl_dereference(sch->stab));
 	if (ops->destroy)
 		ops->destroy(sch);
 	goto err_out3;
@@ -883,7 +888,7 @@ err_out4:
 
 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 {
-	struct qdisc_size_table *stab = NULL;
+	struct qdisc_size_table *ostab, *stab = NULL;
 	int err = 0;
 
 	if (tca[TCA_OPTIONS]) {
@@ -900,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 			return PTR_ERR(stab);
 	}
 
-	qdisc_put_stab(sch->stab);
-	sch->stab = stab;
+	ostab = rtnl_dereference(sch->stab);
+	rcu_assign_pointer(sch->stab, stab);
+	qdisc_put_stab(ostab);
 
 	if (tca[TCA_RATE]) {
 		/* NB: ignores errors from replace_estimator
@@ -1180,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	struct nlmsghdr  *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct gnet_dump d;
+	struct qdisc_size_table *stab;
 
 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
 	tcm = NLMSG_DATA(nlh);
@@ -1195,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		goto nla_put_failure;
 	q->qstats.qlen = q->q.qlen;
 
-	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
+	stab = rtnl_dereference(q->stab);
+	if (stab && qdisc_dump_stab(skb, stab) < 0)
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2f1cb62..cc17e79 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -632,7 +632,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 #ifdef CONFIG_NET_SCHED
 	qdisc_list_del(qdisc);
 
-	qdisc_put_stab(qdisc->stab);
+	qdisc_put_stab(rtnl_dereference(qdisc->stab));
 #endif
 	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
 	if (ops->reset)




* Re: [PATCH net-next-2.6] net_sched: RCU conversion of stab
  2011-01-20 13:48   ` [PATCH net-next-2.6] net_sched: RCU conversion of stab Eric Dumazet
@ 2011-01-20 15:01     ` Patrick McHardy
  2011-01-20 16:39       ` Eric Dumazet
  2011-01-20 15:27     ` [PATCH net-next-2.6] net_sched: move TCQ_F_THROTTLED flag Eric Dumazet
  2011-01-21  0:56     ` [PATCH net-next-2.6] net_sched: RCU conversion of stab David Miller
  2 siblings, 1 reply; 10+ messages in thread
From: Patrick McHardy @ 2011-01-20 15:01 UTC
  To: Eric Dumazet
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Jarek Poplawski, jamal

On 20.01.2011 14:48, Eric Dumazet wrote:
> This patch converts stab qdisc management to RCU, so that we can perform
> the qdisc_calculate_pkt_len() call before taking the qdisc lock.
> 
> This shortens the time the lock is held in __dev_xmit_skb().
> 
> This permits more qdiscs to get TCQ_F_CAN_BYPASS status, avoiding a lot
> of cache misses and thus reducing latencies.
> 

Looks good to me.



* [PATCH net-next-2.6] net_sched: move TCQ_F_THROTTLED flag
  2011-01-20 13:48   ` [PATCH net-next-2.6] net_sched: RCU conversion of stab Eric Dumazet
  2011-01-20 15:01     ` Patrick McHardy
@ 2011-01-20 15:27     ` Eric Dumazet
  2011-01-21  0:56       ` David Miller
  2011-01-21 11:04       ` [PATCH net-next-2.6] net_sched: TCQ_F_CAN_BYPASS generalization Eric Dumazet
  2011-01-21  0:56     ` [PATCH net-next-2.6] net_sched: RCU conversion of stab David Miller
  2 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2011-01-20 15:27 UTC
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski, jamal

In commit 371121057607e (net: QDISC_STATE_RUNNING dont need atomic bit
ops) I moved the QDISC_STATE_RUNNING flag to the __state container,
located in the cache line containing the qdisc lock and often-dirtied
fields.

I now move the TCQ_F_THROTTLED bit too, so that the first cache line
stays read-mostly and shared by all CPUs. This should speed up HTB/CBQ,
for example.

Not using test_bit()/__clear_bit()/__test_and_set_bit() allows us to use
an "unsigned int" for the __state container, reducing Qdisc size by 8
bytes.

Introduce helpers to hide implementation details.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Jesper Dangaard Brouer <hawk@diku.dk>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Jamal Hadi Salim <hadi@cyberus.ca>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
 include/net/sch_generic.h |   38 ++++++++++++++++++++++++++----------
 net/sched/sch_api.c       |    6 ++---
 net/sched/sch_cbq.c       |    6 ++---
 net/sched/sch_hfsc.c      |    2 -
 net/sched/sch_htb.c       |    4 +--
 net/sched/sch_netem.c     |    2 -
 net/sched/sch_tbf.c       |    2 -
 7 files changed, 39 insertions(+), 21 deletions(-)
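
Two side notes (a summary, not part of the patch): plain |= and &= are
sufficient here because, per the comment above enum qdisc___state_t,
__state bits are only changed while the qdisc lock is held; and the 8
bytes come from shrinking __state from unsigned long to unsigned int on
x86_64 (4 bytes from the field itself, presumably 4 more from alignment
padding). For callers the conversion is mechanical, e.g.:

	/* before: open-coded flag update on sch->flags */
	sch->flags &= ~TCQ_F_THROTTLED;

	/* after: helper hides the __state implementation detail */
	qdisc_unthrottled(sch);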

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e9eee99..f6345f5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -31,7 +31,8 @@ enum qdisc_state_t {
  * following bits are only changed while qdisc lock is held
  */
 enum qdisc___state_t {
-	__QDISC___STATE_RUNNING,
+	__QDISC___STATE_RUNNING = 1,
+	__QDISC___STATE_THROTTLED = 2,
 };
 
 struct qdisc_size_table {
@@ -46,10 +47,9 @@ struct Qdisc {
 	struct sk_buff *	(*dequeue)(struct Qdisc *dev);
 	unsigned		flags;
 #define TCQ_F_BUILTIN		1
-#define TCQ_F_THROTTLED		2
-#define TCQ_F_INGRESS		4
-#define TCQ_F_CAN_BYPASS	8
-#define TCQ_F_MQROOT		16
+#define TCQ_F_INGRESS		2
+#define TCQ_F_CAN_BYPASS	4
+#define TCQ_F_MQROOT		8
 #define TCQ_F_WARN_NONWC	(1 << 16)
 	int			padded;
 	struct Qdisc_ops	*ops;
@@ -78,25 +78,43 @@ struct Qdisc {
 	unsigned long		state;
 	struct sk_buff_head	q;
 	struct gnet_stats_basic_packed bstats;
-	unsigned long		__state;
+	unsigned int		__state;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
 	spinlock_t		busylock;
 };
 
-static inline bool qdisc_is_running(struct Qdisc *qdisc)
+static inline bool qdisc_is_running(const struct Qdisc *qdisc)
 {
-	return test_bit(__QDISC___STATE_RUNNING, &qdisc->__state);
+	return (qdisc->__state & __QDISC___STATE_RUNNING) ? true : false;
 }
 
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
-	return !__test_and_set_bit(__QDISC___STATE_RUNNING, &qdisc->__state);
+	if (qdisc_is_running(qdisc))
+		return false;
+	qdisc->__state |= __QDISC___STATE_RUNNING;
+	return true;
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
-	__clear_bit(__QDISC___STATE_RUNNING, &qdisc->__state);
+	qdisc->__state &= ~__QDISC___STATE_RUNNING;
+}
+
+static inline bool qdisc_is_throttled(const struct Qdisc *qdisc)
+{
+	return (qdisc->__state & __QDISC___STATE_THROTTLED) ? true : false;
+}
+
+static inline void qdisc_throttled(struct Qdisc *qdisc)
+{
+	qdisc->__state |= __QDISC___STATE_THROTTLED;
+}
+
+static inline void qdisc_unthrottled(struct Qdisc *qdisc)
+{
+	qdisc->__state &= ~__QDISC___STATE_THROTTLED;
 }
 
 struct Qdisc_class_ops {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 36ac0ec..374fcbe 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -473,7 +473,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 						 timer);
 
-	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(wd->qdisc);
 	__netif_schedule(qdisc_root(wd->qdisc));
 
 	return HRTIMER_NORESTART;
@@ -495,7 +495,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 		     &qdisc_root_sleeping(wd->qdisc)->state))
 		return;
 
-	wd->qdisc->flags |= TCQ_F_THROTTLED;
+	qdisc_throttled(wd->qdisc);
 	time = ktime_set(0, 0);
 	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
@@ -505,7 +505,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule);
 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 {
 	hrtimer_cancel(&wd->timer);
-	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(wd->qdisc);
 }
 EXPORT_SYMBOL(qdisc_watchdog_cancel);
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 4aaf44c..25ed522 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -351,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 {
 	int toplevel = q->toplevel;
 
-	if (toplevel > cl->level && !(cl->q->flags & TCQ_F_THROTTLED)) {
+	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
 		psched_time_t now;
 		psched_tdiff_t incr;
 
@@ -625,7 +625,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
 	}
 
-	sch->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(sch);
 	__netif_schedule(qdisc_root(sch));
 	return HRTIMER_NORESTART;
 }
@@ -974,7 +974,7 @@ cbq_dequeue(struct Qdisc *sch)
 		skb = cbq_dequeue_1(sch);
 		if (skb) {
 			sch->q.qlen--;
-			sch->flags &= ~TCQ_F_THROTTLED;
+			qdisc_unthrottled(sch);
 			return skb;
 		}
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index dea4009..b632d92 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1664,7 +1664,7 @@ hfsc_dequeue(struct Qdisc *sch)
 		set_passive(cl);
 	}
 
-	sch->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(sch);
 	sch->q.qlen--;
 
 	return skb;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 3e86fd3..39db75c 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -865,7 +865,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
 	skb = __skb_dequeue(&q->direct_queue);
 	if (skb != NULL) {
-		sch->flags &= ~TCQ_F_THROTTLED;
+		qdisc_unthrottled(sch);
 		sch->q.qlen--;
 		return skb;
 	}
@@ -901,7 +901,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 			skb = htb_dequeue_tree(q, prio, level);
 			if (likely(skb != NULL)) {
 				sch->q.qlen--;
-				sch->flags &= ~TCQ_F_THROTTLED;
+				qdisc_unthrottled(sch);
 				goto fin;
 			}
 		}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index c2bbbe6..c26ef36 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -266,7 +266,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 
-	if (sch->flags & TCQ_F_THROTTLED)
+	if (qdisc_is_throttled(sch))
 		return NULL;
 
 	skb = q->qdisc->ops->peek(q->qdisc);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 475edfb..86c0166 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -185,7 +185,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 			q->tokens = toks;
 			q->ptokens = ptoks;
 			sch->q.qlen--;
-			sch->flags &= ~TCQ_F_THROTTLED;
+			qdisc_unthrottled(sch);
 			return skb;
 		}
 




* Re: [PATCH net-next-2.6] net_sched: RCU conversion of stab
  2011-01-20 15:01     ` Patrick McHardy
@ 2011-01-20 16:39       ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2011-01-20 16:39 UTC
  To: Patrick McHardy
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Jarek Poplawski, jamal

On Thursday 20 January 2011 at 16:01 +0100, Patrick McHardy wrote:
> Am 20.01.2011 14:48, schrieb Eric Dumazet:
> > This patch converts stab qdisc management to RCU, so that we can perform
> > the qdisc_calculate_pkt_len() call before taking the qdisc lock.
> > 
> > This shortens the time the lock is held in __dev_xmit_skb().
> > 
> > This permits more qdiscs to get TCQ_F_CAN_BYPASS status, avoiding a lot
> > of cache misses and thus reducing latencies.
> > 
> 
> Looks good to me.
> 

Thanks for reviewing, Patrick!




* Re: [PATCH net-next-2.6] net_sched: RCU conversion of stab
  2011-01-20 13:48   ` [PATCH net-next-2.6] net_sched: RCU conversion of stab Eric Dumazet
  2011-01-20 15:01     ` Patrick McHardy
  2011-01-20 15:27     ` [PATCH net-next-2.6] net_sched: move TCQ_F_THROTTLED flag Eric Dumazet
@ 2011-01-21  0:56     ` David Miller
  2 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2011-01-21  0:56 UTC
  To: eric.dumazet; +Cc: netdev, kaber, hawk, jarkao2, hadi

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Jan 2011 14:48:19 +0100

> This patch converts stab qdisc management to RCU, so that we can perform
> the qdisc_calculate_pkt_len() call before taking the qdisc lock.
> 
> This shortens the time the lock is held in __dev_xmit_skb().
> 
> This permits more qdiscs to get TCQ_F_CAN_BYPASS status, avoiding a lot
> of cache misses and thus reducing latencies.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.


* Re: [PATCH net-next-2.6] net_sched: move TCQ_F_THROTTLED flag
  2011-01-20 15:27     ` [PATCH net-next-2.6] net_sched: move TCQ_F_THROTTLED flag Eric Dumazet
@ 2011-01-21  0:56       ` David Miller
  2011-01-21 11:04       ` [PATCH net-next-2.6] net_sched: TCQ_F_CAN_BYPASS generalization Eric Dumazet
  1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2011-01-21  0:56 UTC
  To: eric.dumazet; +Cc: netdev, kaber, hawk, jarkao2, hadi

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Jan 2011 16:27:16 +0100

> In commit 371121057607e (net: QDISC_STATE_RUNNING dont need atomic bit
> ops) I moved the QDISC_STATE_RUNNING flag to the __state container,
> located in the cache line containing the qdisc lock and often-dirtied
> fields.
> 
> I now move the TCQ_F_THROTTLED bit too, so that the first cache line
> stays read-mostly and shared by all CPUs. This should speed up HTB/CBQ,
> for example.
> 
> Not using test_bit()/__clear_bit()/__test_and_set_bit() allows us to
> use an "unsigned int" for the __state container, reducing Qdisc size
> by 8 bytes.
> 
> Introduce helpers to hide implementation details.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.


* [PATCH net-next-2.6] net_sched: TCQ_F_CAN_BYPASS generalization
  2011-01-20 15:27     ` [PATCH net-next-2.6] net_sched: move TCQ_F_THROTTLED flag Eric Dumazet
  2011-01-21  0:56       ` David Miller
@ 2011-01-21 11:04       ` Eric Dumazet
  2011-01-22  0:27         ` David Miller
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2011-01-21 11:04 UTC
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski, jamal

Now that the qdisc stab is handled before the TCQ_F_CAN_BYPASS test in
__dev_xmit_skb(), we can generalize TCQ_F_CAN_BYPASS to qdiscs other
than pfifo_fast: pfifo, bfifo, pfifo_head_drop and sfq.

SFQ is special because it can have external classifiers, and in these
cases we cannot bypass the queue discipline (a packet could be dropped
by a classifier) without the admin asking for it, or further changes.

It's worth doing this, especially for SFQ, as it avoids dirtying memory
when no packets are already waiting in the queue.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Jesper Dangaard Brouer <hawk@diku.dk>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Jamal Hadi Salim <hadi@cyberus.ca>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
I am not sure RED can use bypass too; feel free to comment on this ;)

 net/sched/sch_fifo.c    |   13 ++++++++++++-
 net/sched/sch_generic.c |    5 ++---
 net/sched/sch_mq.c      |    1 -
 net/sched/sch_mqprio.c  |    1 -
 net/sched/sch_sfq.c     |    6 ++++++
 5 files changed, 20 insertions(+), 6 deletions(-)
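
For context, a condensed sketch of the fast path this enables in
__dev_xmit_skb() (illustrative only; sketch_xmit() is not a real kernel
function, and locking, statistics and the contended case are elided).
Note that the earlier stab/RCU patch is the prerequisite: pkt_len is now
computed before the bypass test, so bypassed packets still get a correct
size-table-adjusted length:

static int sketch_xmit(struct sk_buff *skb, struct Qdisc *q,
		       struct net_device *dev, struct netdev_queue *txq,
		       spinlock_t *root_lock)
{
	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);	/* stab, before bypass test */

	if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
	    qdisc_run_begin(q)) {
		/* no packets waiting and qdisc idle: transmit directly,
		 * skipping enqueue()/dequeue() and their cache misses
		 */
		sch_direct_xmit(skb, q, dev, txq, root_lock);
		qdisc_run_end(q);
		return NET_XMIT_SUCCESS;
	}
	return q->enqueue(skb, q) & NET_XMIT_MASK;
}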

diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index b3075f8..f7290d2 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -64,11 +64,13 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
+	bool bypass;
+	bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
 
 	if (opt == NULL) {
 		u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
 
-		if (sch->ops == &bfifo_qdisc_ops)
+		if (is_bfifo)
 			limit *= psched_mtu(qdisc_dev(sch));
 
 		q->limit = limit;
@@ -81,6 +83,15 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
 		q->limit = ctl->limit;
 	}
 
+	if (is_bfifo)
+		bypass = q->limit >= psched_mtu(qdisc_dev(sch));
+	else
+		bypass = q->limit >= 1;
+
+	if (bypass)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cc17e79..0da09d5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -527,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 		skb_queue_head_init(band2list(priv, prio));
 
+	/* Can by-pass the queue discipline */
+	qdisc->flags |= TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -691,9 +693,6 @@ static void attach_one_default_qdisc(struct net_device *dev,
 			netdev_info(dev, "activation failed\n");
 			return;
 		}
-
-		/* Can by-pass the queue discipline for default qdisc */
-		qdisc->flags |= TCQ_F_CAN_BYPASS;
 	}
 	dev_queue->qdisc_sleeping = qdisc;
 }
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f..ec5cbc8 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
 			goto err;
-		qdisc->flags |= TCQ_F_CAN_BYPASS;
 		priv->qdiscs[ntx] = qdisc;
 	}
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 8620c65..fbc6f53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -130,7 +130,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 			err = -ENOMEM;
 			goto err;
 		}
-		qdisc->flags |= TCQ_F_CAN_BYPASS;
 		priv->qdiscs[i] = qdisc;
 	}
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 156ad30..fdba52a 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -560,6 +560,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 		slot_queue_init(&q->slots[i]);
 		sfq_link(q, i);
 	}
+	if (q->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -611,6 +615,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 			      u32 classid)
 {
+	/* we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 




* Re: [PATCH net-next-2.6] net_sched: TCQ_F_CAN_BYPASS generalization
  2011-01-21 11:04       ` [PATCH net-next-2.6] net_sched: TCQ_F_CAN_BYPASS generalization Eric Dumazet
@ 2011-01-22  0:27         ` David Miller
  0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2011-01-22  0:27 UTC
  To: eric.dumazet; +Cc: netdev, kaber, hawk, jarkao2, hadi

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 21 Jan 2011 12:04:41 +0100

> Now that the qdisc stab is handled before the TCQ_F_CAN_BYPASS test in
> __dev_xmit_skb(), we can generalize TCQ_F_CAN_BYPASS to qdiscs other
> than pfifo_fast: pfifo, bfifo, pfifo_head_drop and sfq.
> 
> SFQ is special because it can have external classifiers, and in these
> cases we cannot bypass the queue discipline (a packet could be dropped
> by a classifier) without the admin asking for it, or further changes.
> 
> It's worth doing this, especially for SFQ, as it avoids dirtying memory
> when no packets are already waiting in the queue.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks.

> I am not sure RED can use bypass too; feel free to comment on this ;)

The only thing that RED would seem to care about would be the
queue size average calculation.
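
(For reference: RED keeps an exponentially weighted moving average of
the queue length, roughly avg <- (1 - W_q) * avg + W_q * qlen, updated
as packets are enqueued; packets that bypass the qdisc would never feed
that average, so enabling bypass for RED would skew it.)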
