* [RFC Patch net-next] net_sched: make classifying lockless on ingress
@ 2013-12-20 23:28 Cong Wang
  2013-12-20 23:49 ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Cong Wang @ 2013-12-20 23:28 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Eric Dumazet, David S. Miller, Jamal Hadi Salim

This patch switches the filter list to struct list_head, so that
on the read side the list can be traversed under the RCU read lock.
I hope classification, on both egress and ingress, is already done
under the RCU read lock; I don't pretend to fully understand qdisc
locking.
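
Concretely, the read/update split I have in mind looks roughly like
this (an illustrative sketch only, not part of the diff below; tp,
chain and root_lock are the names used in cls_api.c):

/* reader, on the data path */
rcu_read_lock();
list_for_each_entry_rcu(tp, chain, head) {
	/* match skb against tp->classify() */
}
rcu_read_unlock();

/* writer, on the control path, serialized by the qdisc root lock */
spin_lock_bh(root_lock);
list_del_rcu(&tp->head);
spin_unlock_bh(root_lock);
synchronize_rcu();	/* let readers finish before freeing */
kfree(tp);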

Also, I am not sure I am using the RCU APIs correctly at all. At
least I don't see any warnings with CONFIG_PROVE_RCU=y.

Without this patch, the spin_lock easily shows up at the top of
perf top with 4 netperf sessions (4 being the number of CPUs) and
1000 u32 filters on ingress.

Comments?

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>


---
 include/net/pkt_sched.h   |  4 ++--
 include/net/sch_generic.h |  9 ++++++---
 net/core/dev.c            |  2 --
 net/sched/cls_api.c       | 36 ++++++++++++++++++++++++------------
 net/sched/sch_api.c       | 35 ++++++++++++++++++++++-------------
 net/sched/sch_atm.c       | 14 +++++++-------
 net/sched/sch_cbq.c       |  9 +++++----
 net/sched/sch_choke.c     | 11 ++++++-----
 net/sched/sch_drr.c       |  7 ++++---
 net/sched/sch_dsmark.c    |  7 ++++---
 net/sched/sch_fq_codel.c  |  9 +++++----
 net/sched/sch_hfsc.c      | 15 +++++++++------
 net/sched/sch_htb.c       | 20 ++++++++++++--------
 net/sched/sch_ingress.c   | 14 +++++++++++---
 net/sched/sch_multiq.c    |  7 ++++---
 net/sched/sch_prio.c      |  9 +++++----
 net/sched/sch_qfq.c       |  7 ++++---
 net/sched/sch_sfb.c       |  9 +++++----
 net/sched/sch_sfq.c       | 11 ++++++-----
 19 files changed, 141 insertions(+), 94 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 891d80d..27a1efa 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -109,9 +109,9 @@ static inline void qdisc_run(struct Qdisc *q)
 		__qdisc_run(q);
 }
 
-int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
+int tc_classify_compat(struct sk_buff *skb, const struct list_head *head,
 		       struct tcf_result *res);
-int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+int tc_classify(struct sk_buff *skb, const struct list_head *head,
 		struct tcf_result *res);
 
 /* Calculate maximal size of packet seen by hard_start_xmit
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 013d96d..97123cc 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -6,6 +6,7 @@
 #include <linux/rcupdate.h>
 #include <linux/pkt_sched.h>
 #include <linux/pkt_cls.h>
+#include <linux/list.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 
@@ -143,7 +144,7 @@ struct Qdisc_class_ops {
 	void			(*walk)(struct Qdisc *, struct qdisc_walker * arg);
 
 	/* Filter manipulation */
-	struct tcf_proto **	(*tcf_chain)(struct Qdisc *, unsigned long);
+	struct list_head *	(*tcf_chain)(struct Qdisc *, unsigned long);
 	unsigned long		(*bind_tcf)(struct Qdisc *, unsigned long,
 					u32 classid);
 	void			(*unbind_tcf)(struct Qdisc *, unsigned long);
@@ -184,6 +185,8 @@ struct tcf_result {
 	u32		classid;
 };
 
+struct tcf_proto;
+
 struct tcf_proto_ops {
 	struct list_head	head;
 	char			kind[IFNAMSIZ];
@@ -212,7 +215,7 @@ struct tcf_proto_ops {
 
 struct tcf_proto {
 	/* Fast access part */
-	struct tcf_proto	*next;
+	struct list_head	head;
 	void			*root;
 	int			(*classify)(struct sk_buff *,
 					    const struct tcf_proto *,
@@ -376,7 +379,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
 void tcf_destroy(struct tcf_proto *tp);
-void tcf_destroy_chain(struct tcf_proto **fl);
+void tcf_destroy_chain(struct list_head *fl);
 
 /* Reset all TX qdiscs greater then index of a device.  */
 static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
diff --git a/net/core/dev.c b/net/core/dev.c
index c482fe8..7cc0d6a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3382,10 +3382,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
-		spin_lock(qdisc_lock(q));
 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
 			result = qdisc_enqueue_root(skb, q);
-		spin_unlock(qdisc_lock(q));
 	}
 
 	return result;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 12e882e..afeda53 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -125,8 +125,9 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	u32 parent;
 	struct net_device *dev;
 	struct Qdisc  *q;
-	struct tcf_proto **back, **chain;
-	struct tcf_proto *tp;
+	struct tcf_proto *back;
+	struct list_head *chain;
+	struct tcf_proto *tp, *res = NULL;
 	const struct tcf_proto_ops *tp_ops;
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
@@ -196,21 +197,27 @@ replay:
 		goto errout;
 
 	/* Check the chain for existence of proto-tcf with this priority */
-	for (back = chain; (tp = *back) != NULL; back = &tp->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(tp, chain, head) {
+		back = list_next_entry(tp, head);
 		if (tp->prio >= prio) {
 			if (tp->prio == prio) {
 				if (!nprio ||
-				    (tp->protocol != protocol && protocol))
+				    (tp->protocol != protocol && protocol)) {
+					rcu_read_unlock();
 					goto errout;
+				}
+				res = tp;
 			} else
-				tp = NULL;
+				res = NULL;
 			break;
 		}
 	}
+	rcu_read_unlock();
 
 	root_lock = qdisc_root_sleeping_lock(q);
 
-	if (tp == NULL) {
+	if ((tp = res) == NULL) {
 		/* Proto-tcf does not exist, create new one */
 
 		if (tca[TCA_KIND] == NULL || !protocol)
@@ -228,6 +235,7 @@ replay:
 		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
 		if (tp == NULL)
 			goto errout;
+		INIT_LIST_HEAD(&tp->head);
 		err = -ENOENT;
 		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
 		if (tp_ops == NULL) {
@@ -258,7 +266,7 @@ replay:
 		}
 		tp->ops = tp_ops;
 		tp->protocol = protocol;
-		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back));
+		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(back));
 		tp->q = q;
 		tp->classify = tp_ops->classify;
 		tp->classid = parent;
@@ -280,7 +288,7 @@ replay:
 	if (fh == 0) {
 		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
 			spin_lock_bh(root_lock);
-			*back = tp->next;
+			list_del_rcu(&tp->head);
 			spin_unlock_bh(root_lock);
 
 			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
@@ -321,8 +329,7 @@ replay:
 	if (err == 0) {
 		if (tp_created) {
 			spin_lock_bh(root_lock);
-			tp->next = *back;
-			*back = tp;
+			list_add_rcu(&tp->head, chain);
 			spin_unlock_bh(root_lock);
 		}
 		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
@@ -417,7 +424,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	int s_t;
 	struct net_device *dev;
 	struct Qdisc *q;
-	struct tcf_proto *tp, **chain;
+	struct tcf_proto *tp;
+	struct list_head *chain;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	unsigned long cl = 0;
 	const struct Qdisc_class_ops *cops;
@@ -451,7 +459,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 
 	s_t = cb->args[0];
 
-	for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+	t = 0;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tp, chain, head) {
 		if (t < s_t)
 			continue;
 		if (TC_H_MAJ(tcm->tcm_info) &&
@@ -482,7 +492,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[1] = arg.w.count + 1;
 		if (arg.w.stop)
 			break;
+		t++;
 	}
+	rcu_read_unlock();
 
 	cb->args[0] = t;
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c31190e..58c79dc 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -29,6 +29,7 @@
 #include <linux/hrtimer.h>
 #include <linux/lockdep.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -1772,13 +1773,15 @@ done:
  * to this qdisc, (optionally) tests for protocol and asks
  * specific classifiers.
  */
-int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
+int tc_classify_compat(struct sk_buff *skb, const struct list_head *head,
 		       struct tcf_result *res)
 {
+	struct tcf_proto *tp;
 	__be16 protocol = skb->protocol;
-	int err;
+	int err = -1;
 
-	for (; tp; tp = tp->next) {
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	list_for_each_entry_rcu(tp, head, head) {
 		if (tp->protocol != protocol &&
 		    tp->protocol != htons(ETH_P_ALL))
 			continue;
@@ -1789,23 +1792,25 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
 #endif
-			return err;
+			break;
 		}
 	}
-	return -1;
+
+	return err;
 }
 EXPORT_SYMBOL(tc_classify_compat);
 
-int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+int tc_classify(struct sk_buff *skb, const struct list_head *head,
 		struct tcf_result *res)
 {
 	int err = 0;
 #ifdef CONFIG_NET_CLS_ACT
-	const struct tcf_proto *otp = tp;
+	const struct tcf_proto *tp;
+	const struct tcf_proto *otp = list_first_entry(head, struct tcf_proto, head);
 reclassify:
 #endif
 
-	err = tc_classify_compat(skb, tp, res);
+	err = tc_classify_compat(skb, head, res);
 #ifdef CONFIG_NET_CLS_ACT
 	if (err == TC_ACT_RECLASSIFY) {
 		u32 verd = G_TC_VERD(skb->tc_verd);
@@ -1830,16 +1835,20 @@ void tcf_destroy(struct tcf_proto *tp)
 {
 	tp->ops->destroy(tp);
 	module_put(tp->ops->owner);
+	synchronize_rcu();
 	kfree(tp);
 }
 
-void tcf_destroy_chain(struct tcf_proto **fl)
+void tcf_destroy_chain(struct list_head *fl)
 {
-	struct tcf_proto *tp;
+	struct tcf_proto *tp, *n;
+	LIST_HEAD(list);
+	list_splice_init_rcu(fl, &list, synchronize_rcu);
 
-	while ((tp = *fl) != NULL) {
-		*fl = tp->next;
-		tcf_destroy(tp);
+	list_for_each_entry_safe(tp, n, &list, head) {
+		tp->ops->destroy(tp);
+		module_put(tp->ops->owner);
+		kfree(tp);
 	}
 }
 EXPORT_SYMBOL(tcf_destroy_chain);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 1f9c314..20a07c2 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -41,7 +41,7 @@
 
 struct atm_flow_data {
 	struct Qdisc		*q;	/* FIFO, TBF, etc. */
-	struct tcf_proto	*filter_list;
+	struct list_head 	filter_list;
 	struct atm_vcc		*vcc;	/* VCC; NULL if VCC is closed */
 	void			(*old_pop)(struct atm_vcc *vcc,
 					   struct sk_buff *skb); /* chaining */
@@ -273,7 +273,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 		error = -ENOBUFS;
 		goto err_out;
 	}
-	flow->filter_list = NULL;
+	INIT_LIST_HEAD(&flow->filter_list);
 	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
 	if (!flow->q)
 		flow->q = &noop_qdisc;
@@ -311,7 +311,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
 	pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
 	if (list_empty(&flow->list))
 		return -EINVAL;
-	if (flow->filter_list || flow == &p->link)
+	if (!list_empty(&flow->filter_list) || flow == &p->link)
 		return -EBUSY;
 	/*
 	 * Reference count must be 2: one for "keepalive" (set at class
@@ -345,7 +345,7 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
@@ -370,9 +370,9 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (TC_H_MAJ(skb->priority) != sch->handle ||
 	    !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
 		list_for_each_entry(flow, &p->flows, list) {
-			if (flow->filter_list) {
+			if (!list_empty(&flow->filter_list)) {
 				result = tc_classify_compat(skb,
-							    flow->filter_list,
+							    &flow->filter_list,
 							    &res);
 				if (result < 0)
 					continue;
@@ -544,7 +544,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!p->link.q)
 		p->link.q = &noop_qdisc;
 	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
-	p->link.filter_list = NULL;
+	INIT_LIST_HEAD(&p->link.filter_list);
 	p->link.vcc = NULL;
 	p->link.sock = NULL;
 	p->link.classid = sch->handle;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index e251833..e6c64c0 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -133,7 +133,7 @@ struct cbq_class {
 	struct gnet_stats_rate_est64 rate_est;
 	struct tc_cbq_xstats	xstats;
 
-	struct tcf_proto	*filter_list;
+	struct list_head	filter_list;
 
 	int			refcnt;
 	int			filters;
@@ -239,8 +239,8 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 		/*
 		 * Step 2+n. Apply classifier.
 		 */
-		if (!head->filter_list ||
-		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
+		if (list_empty(&head->filter_list) ||
+		    (result = tc_classify_compat(skb, &head->filter_list, &res)) < 0)
 			goto fallback;
 
 		cl = (void *)res.class;
@@ -1881,6 +1881,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 		}
 	}
 
+	INIT_LIST_HEAD(&cl->filter_list);
 	cl->R_tab = rtab;
 	rtab = NULL;
 	cl->refcnt = 1;
@@ -1976,7 +1977,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
 	return 0;
 }
 
-static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
+static struct list_head *cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl = (struct cbq_class *)arg;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index ddd73cb..9b36830 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -58,7 +58,7 @@ struct choke_sched_data {
 
 /* Variables */
 	struct red_vars  vars;
-	struct tcf_proto *filter_list;
+	struct list_head filter_list;
 	struct {
 		u32	prob_drop;	/* Early probability drops */
 		u32	prob_mark;	/* Early probability marks */
@@ -202,7 +202,7 @@ static bool choke_classify(struct sk_buff *skb,
 	struct tcf_result res;
 	int result;
 
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, &q->filter_list, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -256,7 +256,7 @@ static bool choke_match_random(const struct choke_sched_data *q,
 		return false;
 
 	oskb = choke_peek_random(q, pidx);
-	if (q->filter_list)
+	if (!list_empty(&q->filter_list))
 		return choke_get_classid(nskb) == choke_get_classid(oskb);
 
 	return choke_match_flow(oskb, nskb);
@@ -268,7 +268,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	const struct red_parms *p = &q->parms;
 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 
-	if (q->filter_list) {
+	if (!list_empty(&q->filter_list)) {
 		/* If using external classifiers, get result and record it. */
 		if (!choke_classify(skb, sch, &ret))
 			goto other_drop;	/* Packet was eaten by filter */
@@ -476,6 +476,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
+	INIT_LIST_HEAD(&q->filter_list);
 
 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
 		      ctl->Plog, ctl->Scell_log,
@@ -566,7 +567,7 @@ static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
 	return 0;
 }
 
-static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *choke_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8302717..62f45ac 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -35,7 +35,7 @@ struct drr_class {
 
 struct drr_sched {
 	struct list_head		active;
-	struct tcf_proto		*filter_list;
+	struct list_head		filter_list;
 	struct Qdisc_class_hash		clhash;
 };
 
@@ -184,7 +184,7 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg)
 		drr_destroy_class(sch, cl);
 }
 
-static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
+static struct list_head *drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 
@@ -328,7 +328,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, &q->filter_list, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -443,6 +443,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 	if (err < 0)
 		return err;
 	INIT_LIST_HEAD(&q->active);
+	INIT_LIST_HEAD(&q->filter_list);
 	return 0;
 }
 
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 0952fd2..1e43b60 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -37,7 +37,7 @@
 
 struct dsmark_qdisc_data {
 	struct Qdisc		*q;
-	struct tcf_proto	*filter_list;
+	struct list_head	filter_list;
 	u8			*mask;	/* "owns" the array */
 	u8			*value;
 	u16			indices;
@@ -185,7 +185,7 @@ ignore:
 	}
 }
 
-static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,
+static inline struct list_head *dsmark_find_tcf(struct Qdisc *sch,
 						 unsigned long cl)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
@@ -228,7 +228,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		skb->tc_index = TC_H_MIN(skb->priority);
 	else {
 		struct tcf_result res;
-		int result = tc_classify(skb, p->filter_list, &res);
+		int result = tc_classify(skb, &p->filter_list, &res);
 
 		pr_debug("result %d class 0x%04x\n", result, res.classid);
 
@@ -379,6 +379,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
 	p->indices = indices;
 	p->default_index = default_index;
 	p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
+	INIT_LIST_HEAD(&p->filter_list);
 
 	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
 	if (p->q == NULL)
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 5578628..3c19917 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -52,7 +52,7 @@ struct fq_codel_flow {
 }; /* please try to keep this structure <= 64 bytes */
 
 struct fq_codel_sched_data {
-	struct tcf_proto *filter_list;	/* optional external classifier */
+	struct list_head filter_list;	/* optional external classifier */
 	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
 	u32		*backlogs;	/* backlog table [flows_cnt] */
 	u32		flows_cnt;	/* number of flows */
@@ -92,11 +92,11 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 	    TC_H_MIN(skb->priority) <= q->flows_cnt)
 		return TC_H_MIN(skb->priority);
 
-	if (!q->filter_list)
+	if (list_empty(&q->filter_list))
 		return fq_codel_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, &q->filter_list, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -393,6 +393,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 	q->perturbation = net_random();
 	INIT_LIST_HEAD(&q->new_flows);
 	INIT_LIST_HEAD(&q->old_flows);
+	INIT_LIST_HEAD(&q->filter_list);
 	codel_params_init(&q->cparams);
 	codel_stats_init(&q->cstats);
 	q->cparams.ecn = true;
@@ -501,7 +502,7 @@ static void fq_codel_put(struct Qdisc *q, unsigned long cl)
 {
 }
 
-static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index c407561..8bb4ade 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -116,7 +116,7 @@ struct hfsc_class {
 	struct gnet_stats_queue qstats;
 	struct gnet_stats_rate_est64 rate_est;
 	unsigned int	level;		/* class level in hierarchy */
-	struct tcf_proto *filter_list;	/* filter list */
+	struct list_head filter_list;	/* filter list */
 	unsigned int	filter_cnt;	/* filter count */
 
 	struct hfsc_sched *sched;	/* scheduler data */
@@ -1083,6 +1083,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->refcnt    = 1;
 	cl->sched     = q;
 	cl->cl_parent = parent;
+	INIT_LIST_HEAD(&cl->filter_list);
 	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
 				      &pfifo_qdisc_ops, classid);
 	if (cl->qdisc == NULL)
@@ -1151,7 +1152,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct hfsc_class *head, *cl;
 	struct tcf_result res;
-	struct tcf_proto *tcf;
+	struct list_head *list;
 	int result;
 
 	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
@@ -1161,8 +1162,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	head = &q->root;
-	tcf = q->root.filter_list;
-	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+	list = &q->root.filter_list;
+	while (list) {
+		if ((result = tc_classify(skb, list, &res)) < 0)
+			break;
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
@@ -1185,7 +1188,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 			return cl; /* hit leaf class */
 
 		/* apply inner filter chain */
-		tcf = cl->filter_list;
+		list = &cl->filter_list;
 		head = cl;
 	}
 
@@ -1285,7 +1288,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
 	cl->filter_cnt--;
 }
 
-static struct tcf_proto **
+static struct list_head *
 hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 6b0e854..b234d3b 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -103,7 +103,7 @@ struct htb_class {
 	u32			prio;		/* these two are used only by leaves... */
 	int			quantum;	/* but stored for parent-to-leaf return */
 
-	struct tcf_proto	*filter_list;	/* class attached filters */
+	struct list_head	filter_list;	/* class attached filters */
 	int			filter_cnt;
 	int			refcnt;		/* usage count of this class */
 
@@ -153,7 +153,7 @@ struct htb_sched {
 	int			rate2quantum;	/* quant = rate / rate2quantum */
 
 	/* filters for qdisc itself */
-	struct tcf_proto	*filter_list;
+	struct list_head	filter_list;
 
 #define HTB_WARN_TOOMANYEVENTS	0x1
 	unsigned int		warned;	/* only one warning */
@@ -209,7 +209,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl;
 	struct tcf_result res;
-	struct tcf_proto *tcf;
+	struct list_head *head;
 	int result;
 
 	/* allow to select class by setting skb->priority to valid classid;
@@ -223,8 +223,10 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return cl;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	tcf = q->filter_list;
-	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+	head = &q->filter_list;
+	while (!list_empty(head)) {
+		if ((result = tc_classify(skb, head, &res)) < 0)
+			break;
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
@@ -246,7 +248,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 			return cl;	/* we hit leaf; return it */
 
 		/* we have got inner class; apply inner filter chain */
-		tcf = cl->filter_list;
+		head = &cl->filter_list;
 	}
 	/* classification failed; try to use default class */
 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
@@ -1040,6 +1042,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
 	qdisc_watchdog_init(&q->watchdog, sch);
 	INIT_WORK(&q->work, htb_work_func);
 	skb_queue_head_init(&q->direct_queue);
+	INIT_LIST_HEAD(&q->filter_list);
 
 	if (tb[TCA_HTB_DIRECT_QLEN])
 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
@@ -1412,6 +1415,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		cl->refcnt = 1;
 		cl->children = 0;
 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
+		INIT_LIST_HEAD(&cl->filter_list);
 		RB_CLEAR_NODE(&cl->pq_node);
 
 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
@@ -1519,11 +1523,11 @@ failure:
 	return err;
 }
 
-static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
+static struct list_head *htb_find_tcf(struct Qdisc *sch, unsigned long arg)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)arg;
-	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
+	struct list_head *fl = cl ? &cl->filter_list : &q->filter_list;
 
 	return fl;
 }
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index bce1665..b182e0c 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -17,7 +17,7 @@
 
 
 struct ingress_qdisc_data {
-	struct tcf_proto	*filter_list;
+	struct list_head	filter_list;
 };
 
 /* ------------------------- Class/flow operations ------------------------- */
@@ -46,7 +46,7 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 {
 }
 
-static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *ingress_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct ingress_qdisc_data *p = qdisc_priv(sch);
 
@@ -61,7 +61,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	struct tcf_result res;
 	int result;
 
-	result = tc_classify(skb, p->filter_list, &res);
+	result = tc_classify(skb, &p->filter_list, &res);
 
 	qdisc_bstats_update(sch, skb);
 	switch (result) {
@@ -108,6 +108,13 @@ nla_put_failure:
 	return -1;
 }
 
+static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct ingress_qdisc_data *q = qdisc_priv(sch);
+	INIT_LIST_HEAD(&q->filter_list);
+	return 0;
+}
+
 static const struct Qdisc_class_ops ingress_class_ops = {
 	.leaf		=	ingress_leaf,
 	.get		=	ingress_get,
@@ -119,6 +126,7 @@ static const struct Qdisc_class_ops ingress_class_ops = {
 };
 
 static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
+	.init		=	ingress_init,
 	.cl_ops		=	&ingress_class_ops,
 	.id		=	"ingress",
 	.priv_size	=	sizeof(struct ingress_qdisc_data),
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index afb050a..e4be093 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -31,7 +31,7 @@ struct multiq_sched_data {
 	u16 bands;
 	u16 max_bands;
 	u16 curband;
-	struct tcf_proto *filter_list;
+	struct list_head filter_list;
 	struct Qdisc **queues;
 };
 
@@ -45,7 +45,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	err = tc_classify(skb, q->filter_list, &res);
+	err = tc_classify(skb, &q->filter_list, &res);
 #ifdef CONFIG_NET_CLS_ACT
 	switch (err) {
 	case TC_ACT_STOLEN:
@@ -258,6 +258,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
 	if (opt == NULL)
 		return -EINVAL;
 
+	INIT_LIST_HEAD(&q->filter_list);
 	q->max_bands = qdisc_dev(sch)->num_tx_queues;
 
 	q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
@@ -388,7 +389,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	}
 }
 
-static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 79359b6..f4ee09f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -24,7 +24,7 @@
 
 struct prio_sched_data {
 	int bands;
-	struct tcf_proto *filter_list;
+	struct list_head filter_list;
 	u8  prio2band[TC_PRIO_MAX+1];
 	struct Qdisc *queues[TCQ_PRIO_BANDS];
 };
@@ -40,7 +40,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
-		err = tc_classify(skb, q->filter_list, &res);
+		err = tc_classify(skb, &q->filter_list, &res);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
@@ -50,7 +50,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 			return NULL;
 		}
 #endif
-		if (!q->filter_list || err < 0) {
+		if (list_empty(&q->filter_list) || err < 0) {
 			if (TC_H_MAJ(band))
 				band = 0;
 			return q->queues[q->prio2band[band & TC_PRIO_MAX]];
@@ -235,6 +235,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
 		if ((err = prio_tune(sch, opt)) != 0)
 			return err;
 	}
+	INIT_LIST_HEAD(&q->filter_list);
 	return 0;
 }
 
@@ -351,7 +352,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	}
 }
 
-static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *prio_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8056fb4..c43f85a 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -181,7 +181,7 @@ struct qfq_group {
 };
 
 struct qfq_sched {
-	struct tcf_proto *filter_list;
+	struct list_head	filter_list;
 	struct Qdisc_class_hash clhash;
 
 	u64			oldV, V;	/* Precise virtual times. */
@@ -576,7 +576,7 @@ static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
 		qfq_destroy_class(sch, cl);
 }
 
-static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
+static struct list_head *qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 
@@ -714,7 +714,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, &q->filter_list, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -1498,6 +1498,7 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 	}
 
 	INIT_HLIST_HEAD(&q->nonfull_aggs);
+	INIT_LIST_HEAD(&q->filter_list);
 
 	return 0;
 }
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 30ea467..20fcc6d 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -55,7 +55,7 @@ struct sfb_bins {
 
 struct sfb_sched_data {
 	struct Qdisc	*qdisc;
-	struct tcf_proto *filter_list;
+	struct list_head filter_list;
 	unsigned long	rehash_interval;
 	unsigned long	warmup_time;	/* double buffering warmup time in jiffies */
 	u32		max;
@@ -259,7 +259,7 @@ static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
 	struct tcf_result res;
 	int result;
 
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, &q->filter_list, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -306,7 +306,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		}
 	}
 
-	if (q->filter_list) {
+	if (!list_empty(&q->filter_list)) {
 		/* If using external classifiers, get result and record it. */
 		if (!sfb_classify(skb, q, &ret, &salt))
 			goto other_drop;
@@ -533,6 +533,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
 	q->tokens_avail = ctl->penalty_burst;
 	q->token_time = jiffies;
 
+	INIT_LIST_HEAD(&q->filter_list);
 	q->slot = 0;
 	q->double_buffering = false;
 	sfb_zero_all_buckets(q);
@@ -660,7 +661,7 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 76f01e0..2a5b2a4 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -125,7 +125,7 @@ struct sfq_sched_data {
 	u8		cur_depth;	/* depth of longest slot */
 	u8		flags;
 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
-	struct tcf_proto *filter_list;
+	struct list_head filter_list;
 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
 
@@ -194,13 +194,13 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	    TC_H_MIN(skb->priority) <= q->divisor)
 		return TC_H_MIN(skb->priority);
 
-	if (!q->filter_list) {
+	if (list_empty(&q->filter_list)) {
 		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
 		return sfq_hash(q, skb) + 1;
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, &q->filter_list, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -630,7 +630,7 @@ static void sfq_perturbation(unsigned long arg)
 
 	spin_lock(root_lock);
 	q->perturbation = net_random();
-	if (!q->filter_list && q->tail)
+	if (list_empty(&q->filter_list) && q->tail)
 		sfq_rehash(sch);
 	spin_unlock(root_lock);
 
@@ -760,6 +760,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 	q->perturb_period = 0;
 	q->perturbation = net_random();
+	INIT_LIST_HEAD(&q->filter_list);
 
 	if (opt) {
 		int err = sfq_change(sch, opt);
@@ -846,7 +847,7 @@ static void sfq_put(struct Qdisc *q, unsigned long cl)
 {
 }
 
-static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct list_head *sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-20 23:28 [RFC Patch net-next] net_sched: make classifying lockless on ingress Cong Wang
@ 2013-12-20 23:49 ` Eric Dumazet
  2013-12-20 23:57   ` Cong Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2013-12-20 23:49 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David S. Miller, Jamal Hadi Salim, John Fastabend

On Fri, 2013-12-20 at 15:28 -0800, Cong Wang wrote:

> diff --git a/net/core/dev.c b/net/core/dev.c
> index c482fe8..7cc0d6a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3382,10 +3382,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
>  
>  	q = rxq->qdisc;
>  	if (q != &noop_qdisc) {
> -		spin_lock(qdisc_lock(q));
>  		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
>  			result = qdisc_enqueue_root(skb, q);
> -		spin_unlock(qdisc_lock(q));
>  	}
>  

Well... That would be too easy.

Really, a lot more work would be needed.

Qdisc ->enqueue()/dequeue()/reset()/... all assume the qdisc lock is
held.

John Fastabend did some work on this area.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-20 23:49 ` Eric Dumazet
@ 2013-12-20 23:57   ` Cong Wang
  2013-12-21  0:08     ` Eric Dumazet
  2013-12-21  1:09     ` John Fastabend
  0 siblings, 2 replies; 13+ messages in thread
From: Cong Wang @ 2013-12-20 23:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linux Kernel Network Developers, David S. Miller,
	Jamal Hadi Salim, John Fastabend

On Fri, Dec 20, 2013 at 3:49 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-12-20 at 15:28 -0800, Cong Wang wrote:
>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index c482fe8..7cc0d6a 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3382,10 +3382,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
>>
>>       q = rxq->qdisc;
>>       if (q != &noop_qdisc) {
>> -             spin_lock(qdisc_lock(q));
>>               if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
>>                       result = qdisc_enqueue_root(skb, q);
>> -             spin_unlock(qdisc_lock(q));
>>       }
>>
>
> Well... That would be too easy.
>
> Really, a lot more work would be needed.
>
> Qdisc ->enqueue()/dequeue()/reset()/... all assume the qdisc lock is
> held.

Just push the lock down to ->enqueue() ? Since ingress qdisc is classless
and its ->dequeue() is nop.
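
Something along these lines, perhaps (a rough sketch only, not a
tested patch; the exact locking split below is just my assumption of
where the qdisc lock would end up):

static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct ingress_qdisc_data *p = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	/* the filter walk itself stays lockless; the caller already
	 * holds rcu_read_lock() on the receive path */
	result = tc_classify(skb, &p->filter_list, &res);

	/* only the shared qdisc state (stats) still takes the lock */
	spin_lock(qdisc_lock(sch));
	qdisc_bstats_update(sch, skb);
	if (result == TC_ACT_SHOT)
		sch->qstats.drops++;
	spin_unlock(qdisc_lock(sch));

	return result == TC_ACT_SHOT ? NET_XMIT_DROP : NET_XMIT_SUCCESS;
}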


>
> John Fastabend did some work on this area.
>


Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-20 23:57   ` Cong Wang
@ 2013-12-21  0:08     ` Eric Dumazet
  2013-12-21  0:24       ` Cong Wang
  2013-12-21  1:09     ` John Fastabend
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2013-12-21  0:08 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, David S. Miller,
	Jamal Hadi Salim, John Fastabend

On Fri, 2013-12-20 at 15:57 -0800, Cong Wang wrote:

> Just push the lock down to ->enqueue() ? Since ingress qdisc is classless
> and its ->dequeue() is nop.


https://github.com/jrfastab/Linux-Kernel-QOS

https://github.com/jrfastab/Linux-Kernel-QOS/commit/67746f95acd77cf15d7ce34f644b76058ce19813

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-21  0:08     ` Eric Dumazet
@ 2013-12-21  0:24       ` Cong Wang
  2013-12-21  2:32         ` John Fastabend
  0 siblings, 1 reply; 13+ messages in thread
From: Cong Wang @ 2013-12-21  0:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linux Kernel Network Developers, David S. Miller,
	Jamal Hadi Salim, John Fastabend

On Fri, Dec 20, 2013 at 4:08 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-12-20 at 15:57 -0800, Cong Wang wrote:
>
>> Just push the lock down to ->enqueue() ? Since ingress qdisc is classless
>> and its ->dequeue() is nop.
>
>
> https://github.com/jrfastab/Linux-Kernel-QOS
>
> https://github.com/jrfastab/Linux-Kernel-QOS/commit/67746f95acd77cf15d7ce34f644b76058ce19813
>

It's a nice idea to introduce a ->prequeue().

And it's brave to play with RCU on a singly linked list... maybe I can
just steal the ->prequeue() piece. John? :-D

BTW, the last commit is about 5 months ago...

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-20 23:57   ` Cong Wang
  2013-12-21  0:08     ` Eric Dumazet
@ 2013-12-21  1:09     ` John Fastabend
  1 sibling, 0 replies; 13+ messages in thread
From: John Fastabend @ 2013-12-21  1:09 UTC (permalink / raw)
  To: Cong Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, David S. Miller,
	Jamal Hadi Salim

On 12/20/2013 3:57 PM, Cong Wang wrote:
> On Fri, Dec 20, 2013 at 3:49 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Fri, 2013-12-20 at 15:28 -0800, Cong Wang wrote:
>>
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index c482fe8..7cc0d6a 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3382,10 +3382,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
>>>
>>>        q = rxq->qdisc;
>>>        if (q != &noop_qdisc) {
>>> -             spin_lock(qdisc_lock(q));
>>>                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
>>>                        result = qdisc_enqueue_root(skb, q);
>>> -             spin_unlock(qdisc_lock(q));
>>>        }
>>>
>>
>> Well... That would be too easy.
>>
>> Really, a lot more work would be needed.
>>
>> Qdisc ->enqueue()/dequeue()/reset()/... all assume the qdisc lock is
>> held.
>
> Just push the lock down to ->enqueue() ? Since ingress qdisc is classless
> and its ->dequeue() is nop.
>

Huh? You need to make all the classifiers lockless, probably via RCU,
and then make the actions safe as well. If you want to take a look at
what I started doing, it's here: https://github.com/jrfastab/Linux-Kernel-QOS/
complete with a nasty bug in the u32 classifier that I haven't had time
to track down yet. Probably others, for that matter.

I have primarily been concerned with xmit qdiscs so far, and using mqprio
allows using the skb->priority field for qdisc selection in
multiqueue nics, which has been good enough. I suppose I should _finally_
get around to completing this, seeing it's come up twice now in a few
weeks and I've been thinking about it for longer than I care to
remember.

.John

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-21  0:24       ` Cong Wang
@ 2013-12-21  2:32         ` John Fastabend
  2013-12-21 22:11           ` Jamal Hadi Salim
  0 siblings, 1 reply; 13+ messages in thread
From: John Fastabend @ 2013-12-21  2:32 UTC (permalink / raw)
  To: Cong Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, David S. Miller,
	Jamal Hadi Salim

On 12/20/2013 4:24 PM, Cong Wang wrote:
> On Fri, Dec 20, 2013 at 4:08 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Fri, 2013-12-20 at 15:57 -0800, Cong Wang wrote:
>>
>>> Just push the lock down to ->enqueue() ? Since ingress qdisc is classless
>>> and its ->dequeue() is nop.
>>
>>
>> https://github.com/jrfastab/Linux-Kernel-QOS
>>
>> https://github.com/jrfastab/Linux-Kernel-QOS/commit/67746f95acd77cf15d7ce34f644b76058ce19813
>>
>
> It's a nice idea to introduce a ->prequeue().
>
> And so brave to play with RCU on singly linked list... maybe I can
> just steal the ->prequeue() piece. John? :-D

If you only steal the prequeue piece then you don't solve the lock
contention part so I don't think it helps. At which point I suspect
you might as well use one of the existing qdiscs not designed for
multiqueue nics.

>
> BTW, the last commit is about 5 months ago...

Yeah, well, I imagined I would write a rate-limiting qdisc to use
this infrastructure. Jamal hinted at using a systolic process
for this. But I work on this when I have time, and I have been
busy the last few months with other things, unfortunately.

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-21  2:32         ` John Fastabend
@ 2013-12-21 22:11           ` Jamal Hadi Salim
  2013-12-21 23:09             ` John Fastabend
  0 siblings, 1 reply; 13+ messages in thread
From: Jamal Hadi Salim @ 2013-12-21 22:11 UTC (permalink / raw)
  To: John Fastabend, Cong Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, David S. Miller

On 12/20/13 21:32, John Fastabend wrote:

> If you only steal the prequeue piece then you don't solve the lock
> contention part so I don't think it helps. At which point I suspect
> you might as well use one of the existing qdiscs not designed for
> multiqueue nics.
>

Indeed.


> Yeah well I imagined I would write a rate limiting qdisc to use
> this infrastructure. Jamal hinted at using a systolic processes
> for this. But I work on this when I have time and have been
> busy the last few months with other things unfortunately.

The main problem is you can't avoid locks once you have sharing across
multiple processors. You could try to improve certain things, but
you'll be doing that at the expense of certain use cases; and for
a general purpose OS, it gets hard.
a) netdev: All qdiscs are attached to a netdev. netdevs are shared
across cpus, that is, if you want the goodies they come with.
If we can ease that, then we may improve the parallelization.
At one point, in a discussion with Eric, it seemed he was heading
towards a per-netdev-ingress-per-cpu (sort of what multiqueue does for
transmit). Then you can make certain things like netdev stats loosely
synchronous and RCU would make a lot of sense.
b) graphs of flows and actions are shareable across netdevs and
cpus. Just choose not to share and you can optimize your use case
(at the expense of missing out on the sharing features). IOW, this
becomes a config option.

cheers,
jamal

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-21 22:11           ` Jamal Hadi Salim
@ 2013-12-21 23:09             ` John Fastabend
  2013-12-22 16:01               ` Jamal Hadi Salim
  2013-12-24  0:56               ` Cong Wang
  0 siblings, 2 replies; 13+ messages in thread
From: John Fastabend @ 2013-12-21 23:09 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: John Fastabend, Cong Wang, Eric Dumazet,
	Linux Kernel Network Developers, David S. Miller

On 12/21/2013 02:11 PM, Jamal Hadi Salim wrote:
> On 12/20/13 21:32, John Fastabend wrote:
>
>> If you only steal the prequeue piece then you don't solve the lock
>> contention part so I don't think it helps. At which point I suspect
>> you might as well use one of the existing qdiscs not designed for
>> multiqueue nics.
>>
>
> Indeed.
>
>
>> Yeah well I imagined I would write a rate limiting qdisc to use
>> this infrastructure. Jamal hinted at using a systolic processes
>> for this. But I work on this when I have time and have been
>> busy the last few months with other things unfortunately.
>
> The main problem is you cant avoid locks once you have sharing across
> multiple processors. You could try to improve certain things, but
> you'll be doing that at the expense of certain use cases; and for
> a general purpose OS, it gets hard.
> a) netdev: All qdiscs are attached to a netdev. netdevs are shared
> across cpus that is if you want the goodies they come with.
> If we can ease that, then we may improve the parallelization.
> At one point, in a discussion with Eric, it seemed he was heading
> towards a per-netdev-ingress-per-cpu (sort of what  multiqueu does for
> transmit). Then you can make certain things like netdev stats loosely
> synchronous and rcu would make a lot of sense.

I solved this by making them per CPU and synchronizing when I hit
an operation that required sync'ing them. Going forward, if folks
have the time to write SMP-aware qdiscs that work with eventually
consistent counters, that would be great.

You could make this fully generic by having a classifier match
the cpu id and then forward the skb to a qdisc based on the
cpu_id.

Then per-netdev-ingress-per-cpu is really just a configured policy.
If we wanted to make it the default configuration, that would be
fine.
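
Roughly, the per-CPU counter idea looks like this (a minimal sketch
with made-up names such as ingress_pcpu_stats, not code from my tree;
a real version would also need u64_stats_sync on 32-bit):

struct ingress_pcpu_stats {
	u64	packets;
	u64	bytes;
};

/* fast path: touch only this CPU's counters, no lock needed */
static void ingress_stats_update(struct ingress_pcpu_stats __percpu *stats,
				 const struct sk_buff *skb)
{
	struct ingress_pcpu_stats *s = this_cpu_ptr(stats);

	s->packets++;
	s->bytes += skb->len;
}

/* slow path: fold all CPUs together only when a dump asks for it */
static void ingress_stats_fold(struct ingress_pcpu_stats __percpu *stats,
			       struct gnet_stats_basic_packed *b)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		const struct ingress_pcpu_stats *s = per_cpu_ptr(stats, cpu);

		b->packets += s->packets;
		b->bytes   += s->bytes;
	}
}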

> b) graphs of flows and actions are shareable across netdevs and
> cpus. Just choose not to share and you can optimize your use case
> (at the expense of missing out the sharing features). IOW, this becomes
> a config option.
>
> cheers,
> jamal
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
John Fastabend         Intel Corporation

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-21 23:09             ` John Fastabend
@ 2013-12-22 16:01               ` Jamal Hadi Salim
  2013-12-24  0:56               ` Cong Wang
  1 sibling, 0 replies; 13+ messages in thread
From: Jamal Hadi Salim @ 2013-12-22 16:01 UTC (permalink / raw)
  To: John Fastabend
  Cc: John Fastabend, Cong Wang, Eric Dumazet,
	Linux Kernel Network Developers, David S. Miller

On 12/21/13 18:09, John Fastabend wrote:

>
> I solved this by making them per CPU and synchronizing when I hit
> an operation that required sync'ing them. Going forward if folks
> have the time to write SMP aware qdisc's that work with eventually
> consistent counters that would be great.
>

I think what you describe is reasonable as well. We need to weigh the
pros and cons of both.

> You could make this fully generic by having a classifer to match
> the cpu id and then forwarding the skb to a qdisc based on the
> cpu_id.
>

Indeed. More a "generic stateless metadata" classifier which may
look at more than just the cpu id to make the systolic decision.
Probably at the pre-enqueue point you had.
I would assume the amount of config update on such a table would be
very minimal - so RCU would do well.
If someone wants a deeper lookup, then a tc classifier would make
more sense. But by default the generic stateless classifier may be
sufficient.
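
For the dispatch itself, something as simple as this would do (purely
illustrative; cpuq_sched_data and cpuq_select are made-up names, not
anything posted):

struct cpuq_sched_data {
	unsigned int	num_queues;	/* one child qdisc per possible cpu */
	struct Qdisc	**queues;
};

static struct Qdisc *cpuq_select(struct Qdisc *sch)
{
	struct cpuq_sched_data *q = qdisc_priv(sch);

	/* pure metadata decision: no packet data is inspected, so no
	 * lock and no shared state on the fast path */
	return q->queues[smp_processor_id() % q->num_queues];
}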

> Then per-netdev-ingress-per-cpu is really just a configured policy.
> If we wanted to make it the default configuration that would be
> fine.
>

Perhaps that's just defensive talk on my part when people say "qdiscs
are slow". No - netdevs are slow. Rephrase: netdevs are shared across
CPUs, you MUST lock. Locks create cache misses, etc.
As an example, I don't know why something like RPS could not have
benefited from the rich classifier-action if someone wanted it to.
So there may still be a need for per-netdev-ingress-per-cpu.

cheers,
jamal

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-21 23:09             ` John Fastabend
  2013-12-22 16:01               ` Jamal Hadi Salim
@ 2013-12-24  0:56               ` Cong Wang
  2013-12-24  6:08                 ` John Fastabend
  1 sibling, 1 reply; 13+ messages in thread
From: Cong Wang @ 2013-12-24  0:56 UTC (permalink / raw)
  To: John Fastabend
  Cc: Jamal Hadi Salim, John Fastabend, Eric Dumazet,
	Linux Kernel Network Developers, David S. Miller

On Sat, Dec 21, 2013 at 3:09 PM, John Fastabend
<john.fastabend@gmail.com> wrote:
>
> I solved this by making them per CPU and synchronizing when I hit
> an operation that required sync'ing them. Going forward if folks
> have the time to write SMP aware qdisc's that work with eventually
> consistent counters that would be great.
>

Interesting, then you have to copy the same filters and actions
to every per-cpu ingress qdisc, right? Also you need to handle
CPU online/offline events.

The number of CPUs grows fast today, so the total size
of such an ingress qdisc would be huge if I install lots
of filters and actions.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-24  0:56               ` Cong Wang
@ 2013-12-24  6:08                 ` John Fastabend
  2013-12-26 12:02                   ` Jamal Hadi Salim
  0 siblings, 1 reply; 13+ messages in thread
From: John Fastabend @ 2013-12-24  6:08 UTC (permalink / raw)
  To: Cong Wang
  Cc: Jamal Hadi Salim, John Fastabend, Eric Dumazet,
	Linux Kernel Network Developers, David S. Miller

On 12/23/2013 04:56 PM, Cong Wang wrote:
> On Sat, Dec 21, 2013 at 3:09 PM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>>
>> I solved this by making them per CPU and synchronizing when I hit
>> an operation that required sync'ing them. Going forward if folks
>> have the time to write SMP aware qdisc's that work with eventually
>> consistent counters that would be great.
>>
>
> Interesting, then you have to copy the same filters and actions
> to all per-cpu-ingress-qdisc, right? Also you need to handle
> CPU online/offline event.
>
> The number of CPU's grows fast today, so the total size
> of such ingress qdisc would be huge if I install lots
> of filters and action.
>

In this case I was specifically talking about statistics, i.e. the
bstats and qstats.

As long as the qdiscs do not require global state this works well
enough. However, as Jamal keeps pointing out, the problem is that any
qdisc which requires global state requires locking (I paraphrase, but I
think I replicate the spirit correctly), and this doesn't work well with
many CPUs. So you either replicate the qdiscs one per queue, like we do
in the mq and mqprio case, effectively removing any global state, or you
develop qdiscs that don't require global state, or at least work with
eventually consistent data, to avoid the constant syncing of data.

I think though a qdisc per nic queue is really not as bad as you think.
For example, we do this on the tx side and it works OK. Note it's per
RX queue and not per CPU.

.John



-- 
John Fastabend         Intel Corporation

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC Patch net-next] net_sched: make classifying lockless on ingress
  2013-12-24  6:08                 ` John Fastabend
@ 2013-12-26 12:02                   ` Jamal Hadi Salim
  0 siblings, 0 replies; 13+ messages in thread
From: Jamal Hadi Salim @ 2013-12-26 12:02 UTC (permalink / raw)
  To: John Fastabend, Cong Wang
  Cc: John Fastabend, Eric Dumazet, Linux Kernel Network Developers,
	David S. Miller

On 12/24/13 01:08, John Fastabend wrote:

>
> In this case I was specifically talking about statistics so the
> bstats and qstats.
>

That's my thinking as well. Mostly control plane requests would
drive it. Assuming it is real world and not a benchmark test, you
can afford to be loosely synchronized.

> As long as the qdisc's do not require global state this works well
> enough. However as Jamal keeps pointing out the problem is any qdisc
> which requires global state requires locking (I paraphrase but I think
> replicate the spirit correctly) and this doesn't work well with many
> CPUs. So you either replicate the qdiscs one per queue like we do in
> the mq and mqprio case effectively removing any global state or you
> develop qdiscs that don't require global state or at least work with
> eventually consistent data to avoid the constant syncing of data.
>
> I think though a qdisc per nic queue is really not as bad as you think.
> For example we do this on the tx side and it works OK. Note its per
> RX queue and not per CPU.
>

And it is, as you said earlier, a configuration issue.

cheers,
jamal

^ permalink raw reply	[flat|nested] 13+ messages in thread


Thread overview: 13+ messages
2013-12-20 23:28 [RFC Patch net-next] net_sched: make classifying lockless on ingress Cong Wang
2013-12-20 23:49 ` Eric Dumazet
2013-12-20 23:57   ` Cong Wang
2013-12-21  0:08     ` Eric Dumazet
2013-12-21  0:24       ` Cong Wang
2013-12-21  2:32         ` John Fastabend
2013-12-21 22:11           ` Jamal Hadi Salim
2013-12-21 23:09             ` John Fastabend
2013-12-22 16:01               ` Jamal Hadi Salim
2013-12-24  0:56               ` Cong Wang
2013-12-24  6:08                 ` John Fastabend
2013-12-26 12:02                   ` Jamal Hadi Salim
2013-12-21  1:09     ` John Fastabend
