All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
@ 2016-11-24  1:58 Cong Wang
  2016-11-24  8:29 ` Roi Dayan
  2016-11-28  2:57 ` John Fastabend
  0 siblings, 2 replies; 16+ messages in thread
From: Cong Wang @ 2016-11-24  1:58 UTC (permalink / raw)
  To: netdev; +Cc: roid, jiri, Cong Wang, Daniel Borkmann, John Fastabend

Roi reported we could have a race condition where in ->classify() path
we dereference tp->root and meanwhile a parallel ->destroy() makes it
a NULL.

This is possible because ->destroy() could be called when deleting
a filter to check if we are the last one in tp, this tp is still
linked and visible at that time.

The root cause of this problem is the semantic of ->destroy(), it
does two things (for non-force case):

1) check if tp is empty
2) if tp is empty we could really destroy it

and its caller, if cares, needs to check its return value to see if
it is really destroyed. Therefore we can't unlink tp unless we know
it is empty.

As suggested by Daniel, we could actually move the test logic to ->delete()
so that we can safely unlink tp after ->delete() tells us the last one is
just deleted and before ->destroy().

What's more, even we unlink it before ->destroy(), it could still have
readers since we don't wait for a grace period here, we should not modify
tp->root in ->destroy() either.

Fixes: 1e052be69d04 ("net_sched: destroy proto tp when all filters are gone")
Reported-by: Roi Dayan <roid@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 include/net/sch_generic.h |  6 ++--
 net/sched/cls_api.c       | 18 +++++++-----
 net/sched/cls_basic.c     | 11 +++-----
 net/sched/cls_bpf.c       | 11 +++-----
 net/sched/cls_cgroup.c    | 12 ++------
 net/sched/cls_flow.c      | 11 +++-----
 net/sched/cls_flower.c    | 10 ++-----
 net/sched/cls_fw.c        | 30 +++++++++++---------
 net/sched/cls_matchall.c  | 10 ++-----
 net/sched/cls_route.c     | 30 ++++++++++----------
 net/sched/cls_rsvp.h      | 34 +++++++++++------------
 net/sched/cls_tcindex.c   | 15 +++++-----
 net/sched/cls_u32.c       | 71 +++++++++++++++++++++++++++--------------------
 net/sched/sch_api.c       | 14 ++++------
 14 files changed, 137 insertions(+), 146 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a2..27cd1bd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -203,14 +203,14 @@ struct tcf_proto_ops {
 					    const struct tcf_proto *,
 					    struct tcf_result *);
 	int			(*init)(struct tcf_proto*);
-	bool			(*destroy)(struct tcf_proto*, bool);
+	void			(*destroy)(struct tcf_proto*);
 
 	unsigned long		(*get)(struct tcf_proto*, u32 handle);
 	int			(*change)(struct net *net, struct sk_buff *,
 					struct tcf_proto*, unsigned long,
 					u32 handle, struct nlattr **,
 					unsigned long *, bool);
-	int			(*delete)(struct tcf_proto*, unsigned long);
+	int			(*delete)(struct tcf_proto*, unsigned long, bool*);
 	void			(*walk)(struct tcf_proto*, struct tcf_walker *arg);
 
 	/* rtnetlink specific */
@@ -405,7 +405,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 				const struct Qdisc_ops *ops, u32 parentid);
 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
-bool tcf_destroy(struct tcf_proto *tp, bool force);
+void tcf_destroy(struct tcf_proto *tp);
 void tcf_destroy_chain(struct tcf_proto __rcu **fl);
 int skb_do_redirect(struct sk_buff *);
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8e93d4a..f159aeb 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -321,7 +321,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 
 			tfilter_notify(net, skb, n, tp, fh,
 				       RTM_DELTFILTER, false);
-			tcf_destroy(tp, true);
+			tcf_destroy(tp);
 			err = 0;
 			goto errout;
 		}
@@ -331,25 +331,29 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 		    !(n->nlmsg_flags & NLM_F_CREATE))
 			goto errout;
 	} else {
+		bool last;
+
 		switch (n->nlmsg_type) {
 		case RTM_NEWTFILTER:
 			err = -EEXIST;
 			if (n->nlmsg_flags & NLM_F_EXCL) {
 				if (tp_created)
-					tcf_destroy(tp, true);
+					tcf_destroy(tp);
 				goto errout;
 			}
 			break;
 		case RTM_DELTFILTER:
-			err = tp->ops->delete(tp, fh);
+			err = tp->ops->delete(tp, fh, &last);
 			if (err == 0) {
-				struct tcf_proto *next = rtnl_dereference(tp->next);
-
 				tfilter_notify(net, skb, n, tp,
 					       t->tcm_handle,
 					       RTM_DELTFILTER, false);
-				if (tcf_destroy(tp, false))
+				if (last) {
+					struct tcf_proto *next = rtnl_dereference(tp->next);
+
 					RCU_INIT_POINTER(*back, next);
+					tcf_destroy(tp);
+				}
 			}
 			goto errout;
 		case RTM_GETTFILTER:
@@ -372,7 +376,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
-			tcf_destroy(tp, true);
+			tcf_destroy(tp);
 	}
 
 errout:
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index eb219b7..dd63230 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -96,31 +96,28 @@ static void basic_delete_filter(struct rcu_head *head)
 	kfree(f);
 }
 
-static bool basic_destroy(struct tcf_proto *tp, bool force)
+static void basic_destroy(struct tcf_proto *tp)
 {
 	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *f, *n;
 
-	if (!force && !list_empty(&head->flist))
-		return false;
-
 	list_for_each_entry_safe(f, n, &head->flist, link) {
 		list_del_rcu(&f->link);
 		tcf_unbind_filter(tp, &f->res);
 		call_rcu(&f->rcu, basic_delete_filter);
 	}
-	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
-static int basic_delete(struct tcf_proto *tp, unsigned long arg)
+static int basic_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
+	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *f = (struct basic_filter *) arg;
 
 	list_del_rcu(&f->link);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, basic_delete_filter);
+	*last = list_empty(&head->flist);
 	return 0;
 }
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 52dc85a..770984c0 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -265,26 +265,25 @@ static void __cls_bpf_delete_prog(struct rcu_head *rcu)
 	cls_bpf_delete_prog(prog->tp, prog);
 }
 
-static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
+	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 
 	cls_bpf_stop_offload(tp, prog);
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
 	call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+	*last = list_empty(&head->plist);
 
 	return 0;
 }
 
-static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
+static void cls_bpf_destroy(struct tcf_proto *tp)
 {
 	struct cls_bpf_head *head = rtnl_dereference(tp->root);
 	struct cls_bpf_prog *prog, *tmp;
 
-	if (!force && !list_empty(&head->plist))
-		return false;
-
 	list_for_each_entry_safe(prog, tmp, &head->plist, link) {
 		cls_bpf_stop_offload(tp, prog);
 		list_del_rcu(&prog->link);
@@ -292,9 +291,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
 		call_rcu(&prog->rcu, __cls_bpf_delete_prog);
 	}
 
-	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
 static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 85233c47..fa9405e 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -131,21 +131,15 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	return err;
 }
 
-static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force)
+static void cls_cgroup_destroy(struct tcf_proto *tp)
 {
 	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
 
-	if (!force)
-		return false;
-
-	if (head) {
-		RCU_INIT_POINTER(tp->root, NULL);
+	if (head)
 		call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
-	}
-	return true;
 }
 
-static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
+static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index e396723..ea2be75 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -563,12 +563,14 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	return err;
 }
 
-static int flow_delete(struct tcf_proto *tp, unsigned long arg)
+static int flow_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
+	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f = (struct flow_filter *)arg;
 
 	list_del_rcu(&f->list);
 	call_rcu(&f->rcu, flow_destroy_filter);
+	*last = list_empty(&head->filters);
 	return 0;
 }
 
@@ -584,21 +586,16 @@ static int flow_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static bool flow_destroy(struct tcf_proto *tp, bool force)
+static void flow_destroy(struct tcf_proto *tp)
 {
 	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f, *next;
 
-	if (!force && !list_empty(&head->filters))
-		return false;
-
 	list_for_each_entry_safe(f, next, &head->filters, list) {
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, flow_destroy_filter);
 	}
-	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
 static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e8dd09a..495d63224 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -280,21 +280,16 @@ static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 	call_rcu(&f->rcu, fl_destroy_filter);
 }
 
-static bool fl_destroy(struct tcf_proto *tp, bool force)
+static void fl_destroy(struct tcf_proto *tp)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *f, *next;
 
-	if (!force && !list_empty(&head->filters))
-		return false;
-
 	list_for_each_entry_safe(f, next, &head->filters, list)
 		__fl_delete(tp, f);
-	RCU_INIT_POINTER(tp->root, NULL);
 	if (head->mask_assigned)
 		rhashtable_destroy(&head->ht);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
 static unsigned long fl_get(struct tcf_proto *tp, u32 handle)
@@ -777,7 +772,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	return err;
 }
 
-static int fl_delete(struct tcf_proto *tp, unsigned long arg)
+static int fl_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
@@ -785,6 +780,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	__fl_delete(tp, f);
+	*last = list_empty(&head->filters);
 	return 0;
 }
 
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 9dc63d5..bc8ceb7 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -127,20 +127,14 @@ static void fw_delete_filter(struct rcu_head *head)
 	kfree(f);
 }
 
-static bool fw_destroy(struct tcf_proto *tp, bool force)
+static void fw_destroy(struct tcf_proto *tp)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f;
 	int h;
 
 	if (head == NULL)
-		return true;
-
-	if (!force) {
-		for (h = 0; h < HTSIZE; h++)
-			if (rcu_access_pointer(head->ht[h]))
-				return false;
-	}
+		return;
 
 	for (h = 0; h < HTSIZE; h++) {
 		while ((f = rtnl_dereference(head->ht[h])) != NULL) {
@@ -150,17 +144,17 @@ static bool fw_destroy(struct tcf_proto *tp, bool force)
 			call_rcu(&f->rcu, fw_delete_filter);
 		}
 	}
-	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
-static int fw_delete(struct tcf_proto *tp, unsigned long arg)
+static int fw_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f = (struct fw_filter *)arg;
 	struct fw_filter __rcu **fp;
 	struct fw_filter *pfp;
+	int ret = -EINVAL;
+	int h;
 
 	if (head == NULL || f == NULL)
 		goto out;
@@ -173,11 +167,21 @@ static int fw_delete(struct tcf_proto *tp, unsigned long arg)
 			RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
 			tcf_unbind_filter(tp, &f->res);
 			call_rcu(&f->rcu, fw_delete_filter);
-			return 0;
+			ret = 0;
+			break;
 		}
 	}
+
+	*last = true;
+	for (h = 0; h < HTSIZE; h++) {
+		if (rcu_access_pointer(head->ht[h])) {
+			*last = false;
+			break;
+		}
+	}
+
 out:
-	return -EINVAL;
+	return ret;
 }
 
 static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 25927b6..7d54805 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -99,24 +99,19 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 					     &offload);
 }
 
-static bool mall_destroy(struct tcf_proto *tp, bool force)
+static void mall_destroy(struct tcf_proto *tp)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct cls_mall_filter *f = head->filter;
 
-	if (!force && f)
-		return false;
-
 	if (f) {
 		if (tc_should_offload(dev, tp, f->flags))
 			mall_destroy_hw_filter(tp, f, (unsigned long) f);
 
 		call_rcu(&f->rcu, mall_destroy_filter);
 	}
-	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
 static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
@@ -225,7 +220,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	return err;
 }
 
-static int mall_delete(struct tcf_proto *tp, unsigned long arg)
+static int mall_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct cls_mall_head *head = rtnl_dereference(tp->root);
 	struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
@@ -237,6 +232,7 @@ static int mall_delete(struct tcf_proto *tp, unsigned long arg)
 	RCU_INIT_POINTER(head->filter, NULL);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, mall_destroy_filter);
+	*last = true;
 	return 0;
 }
 
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 455fc8f..1a38e41 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -276,20 +276,13 @@ static void route4_delete_filter(struct rcu_head *head)
 	kfree(f);
 }
 
-static bool route4_destroy(struct tcf_proto *tp, bool force)
+static void route4_destroy(struct tcf_proto *tp)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	int h1, h2;
 
 	if (head == NULL)
-		return true;
-
-	if (!force) {
-		for (h1 = 0; h1 <= 256; h1++) {
-			if (rcu_access_pointer(head->table[h1]))
-				return false;
-		}
-	}
+		return;
 
 	for (h1 = 0; h1 <= 256; h1++) {
 		struct route4_bucket *b;
@@ -312,12 +305,10 @@ static bool route4_destroy(struct tcf_proto *tp, bool force)
 			kfree_rcu(b, rcu);
 		}
 	}
-	RCU_INIT_POINTER(tp->root, NULL);
 	kfree_rcu(head, rcu);
-	return true;
 }
 
-static int route4_delete(struct tcf_proto *tp, unsigned long arg)
+static int route4_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	struct route4_filter *f = (struct route4_filter *)arg;
@@ -325,7 +316,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
 	struct route4_filter *nf;
 	struct route4_bucket *b;
 	unsigned int h = 0;
-	int i;
+	int i, h1;
 
 	if (!head || !f)
 		return -EINVAL;
@@ -356,16 +347,25 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
 
 				rt = rtnl_dereference(b->ht[i]);
 				if (rt)
-					return 0;
+					goto out;
 			}
 
 			/* OK, session has no flows */
 			RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
 			kfree_rcu(b, rcu);
+			break;
+		}
+	}
 
-			return 0;
+out:
+	*last = true;
+	for (h1 = 0; h1 <= 256; h1++) {
+		if (rcu_access_pointer(head->table[h1])) {
+			*last = false;
+			break;
 		}
 	}
+
 	return 0;
 }
 
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f05a19..e8ba81a 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -301,22 +301,13 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
 	call_rcu(&f->rcu, rsvp_delete_filter_rcu);
 }
 
-static bool rsvp_destroy(struct tcf_proto *tp, bool force)
+static void rsvp_destroy(struct tcf_proto *tp)
 {
 	struct rsvp_head *data = rtnl_dereference(tp->root);
 	int h1, h2;
 
 	if (data == NULL)
-		return true;
-
-	if (!force) {
-		for (h1 = 0; h1 < 256; h1++) {
-			if (rcu_access_pointer(data->ht[h1]))
-				return false;
-		}
-	}
-
-	RCU_INIT_POINTER(tp->root, NULL);
+		return;
 
 	for (h1 = 0; h1 < 256; h1++) {
 		struct rsvp_session *s;
@@ -336,10 +327,9 @@ static bool rsvp_destroy(struct tcf_proto *tp, bool force)
 		}
 	}
 	kfree_rcu(data, rcu);
-	return true;
 }
 
-static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
+static int rsvp_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct rsvp_head *head = rtnl_dereference(tp->root);
 	struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg;
@@ -347,7 +337,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
 	unsigned int h = f->handle;
 	struct rsvp_session __rcu **sp;
 	struct rsvp_session *nsp, *s = f->sess;
-	int i;
+	int i, h1;
 
 	fp = &s->ht[(h >> 8) & 0xFF];
 	for (nfp = rtnl_dereference(*fp); nfp;
@@ -360,7 +350,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
 
 			for (i = 0; i <= 16; i++)
 				if (s->ht[i])
-					return 0;
+					goto out;
 
 			/* OK, session has no flows */
 			sp = &head->ht[h & 0xFF];
@@ -369,13 +359,23 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
 				if (nsp == s) {
 					RCU_INIT_POINTER(*sp, s->next);
 					kfree_rcu(s, rcu);
-					return 0;
+					goto out;
 				}
 			}
 
-			return 0;
+			break;
 		}
 	}
+
+out:
+	*last = true;
+	for (h1 = 0; h1 < 256; h1++) {
+		if (rcu_access_pointer(head->ht[h1])) {
+			*last = false;
+			break;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 96144bd..9149a03 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -150,7 +150,7 @@ static void tcindex_destroy_fexts(struct rcu_head *head)
 	kfree(f);
 }
 
-static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
+static int tcindex_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct tcindex_data *p = rtnl_dereference(tp->root);
 	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
@@ -186,6 +186,8 @@ static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
 		call_rcu(&f->rcu, tcindex_destroy_fexts);
 	else
 		call_rcu(&r->rcu, tcindex_destroy_rexts);
+
+	*last = false;
 	return 0;
 }
 
@@ -193,7 +195,9 @@ static int tcindex_destroy_element(struct tcf_proto *tp,
 				   unsigned long arg,
 				   struct tcf_walker *walker)
 {
-	return tcindex_delete(tp, arg);
+	bool last;
+
+	return tcindex_delete(tp, arg, &last);
 }
 
 static void __tcindex_destroy(struct rcu_head *head)
@@ -529,23 +533,18 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
 	}
 }
 
-static bool tcindex_destroy(struct tcf_proto *tp, bool force)
+static void tcindex_destroy(struct tcf_proto *tp)
 {
 	struct tcindex_data *p = rtnl_dereference(tp->root);
 	struct tcf_walker walker;
 
-	if (!force)
-		return false;
-
 	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
 	walker.count = 0;
 	walker.skip = 0;
 	walker.fn = tcindex_destroy_element;
 	tcindex_walk(tp, &walker);
 
-	RCU_INIT_POINTER(tp->root, NULL);
 	call_rcu(&p->rcu, __tcindex_destroy);
-	return true;
 }
 
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ae83c3ae..787573b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -582,37 +582,13 @@ static bool ht_empty(struct tc_u_hnode *ht)
 	return true;
 }
 
-static bool u32_destroy(struct tcf_proto *tp, bool force)
+static void u32_destroy(struct tcf_proto *tp)
 {
 	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
 
 	WARN_ON(root_ht == NULL);
 
-	if (!force) {
-		if (root_ht) {
-			if (root_ht->refcnt > 1)
-				return false;
-			if (root_ht->refcnt == 1) {
-				if (!ht_empty(root_ht))
-					return false;
-			}
-		}
-
-		if (tp_c->refcnt > 1)
-			return false;
-
-		if (tp_c->refcnt == 1) {
-			struct tc_u_hnode *ht;
-
-			for (ht = rtnl_dereference(tp_c->hlist);
-			     ht;
-			     ht = rtnl_dereference(ht->next))
-				if (!ht_empty(ht))
-					return false;
-		}
-	}
-
 	if (root_ht && --root_ht->refcnt == 0)
 		u32_destroy_hnode(tp, root_ht);
 
@@ -637,20 +613,22 @@ static bool u32_destroy(struct tcf_proto *tp, bool force)
 	}
 
 	tp->data = NULL;
-	return true;
 }
 
-static int u32_delete(struct tcf_proto *tp, unsigned long arg)
+static int u32_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
 {
 	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
 	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
+	struct tc_u_common *tp_c = tp->data;
+	int ret = 0;
 
 	if (ht == NULL)
-		return 0;
+		goto out;
 
 	if (TC_U32_KEY(ht->handle)) {
 		u32_remove_hw_knode(tp, ht->handle);
-		return u32_delete_key(tp, (struct tc_u_knode *)ht);
+		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
+		goto out;
 	}
 
 	if (root_ht == ht)
@@ -663,7 +641,40 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
 		return -EBUSY;
 	}
 
-	return 0;
+out:
+	*last = true;
+	if (root_ht) {
+		if (root_ht->refcnt > 1) {
+			*last = false;
+			goto ret;
+		}
+		if (root_ht->refcnt == 1) {
+			if (!ht_empty(root_ht)) {
+				*last = false;
+				goto ret;
+			}
+		}
+	}
+
+	if (tp_c->refcnt > 1) {
+		*last = false;
+		goto ret;
+	}
+
+	if (tp_c->refcnt == 1) {
+		struct tc_u_hnode *ht;
+
+		for (ht = rtnl_dereference(tp_c->hlist);
+		     ht;
+		     ht = rtnl_dereference(ht->next))
+			if (!ht_empty(ht)) {
+				*last = false;
+				break;
+			}
+	}
+
+ret:
+	return ret;
 }
 
 #define NR_U32_NODE (1<<12)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f337f1b..7fc48b3 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1899,15 +1899,11 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 }
 EXPORT_SYMBOL(tc_classify);
 
-bool tcf_destroy(struct tcf_proto *tp, bool force)
+void tcf_destroy(struct tcf_proto *tp)
 {
-	if (tp->ops->destroy(tp, force)) {
-		module_put(tp->ops->owner);
-		kfree_rcu(tp, rcu);
-		return true;
-	}
-
-	return false;
+	tp->ops->destroy(tp);
+	module_put(tp->ops->owner);
+	kfree_rcu(tp, rcu);
 }
 
 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
@@ -1916,7 +1912,7 @@ void tcf_destroy_chain(struct tcf_proto __rcu **fl)
 
 	while ((tp = rtnl_dereference(*fl)) != NULL) {
 		RCU_INIT_POINTER(*fl, tp->next);
-		tcf_destroy(tp, true);
+		tcf_destroy(tp);
 	}
 }
 EXPORT_SYMBOL(tcf_destroy_chain);
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24  1:58 [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete() Cong Wang
@ 2016-11-24  8:29 ` Roi Dayan
  2016-11-24 10:14   ` Daniel Borkmann
  2016-11-28  2:57 ` John Fastabend
  1 sibling, 1 reply; 16+ messages in thread
From: Roi Dayan @ 2016-11-24  8:29 UTC (permalink / raw)
  To: Cong Wang, netdev; +Cc: roid, jiri, Daniel Borkmann, John Fastabend

Hi,

I'm testing this patch with KASAN enabled and got into a new kernel 
crash I didn't hit before.


[ 1860.725065] 
==================================================================
[ 1860.733893] BUG: KASAN: use-after-free in 
__netif_receive_skb_core+0x1ebe/0x29a0 at addr ffff880a68b04028
[ 1860.745415] Read of size 8 by task CPU 0/KVM/5334
[ 1860.751368] CPU: 8 PID: 5334 Comm: CPU 0/KVM Tainted: G O    
4.9.0-rc3+ #18
[ 1860.760547] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
[ 1860.768036] Call Trace:
[ 1860.771307]  [<ffffffffa9b6dc42>] dump_stack+0x63/0x81
[ 1860.777167]  [<ffffffffa95fb751>] kasan_object_err+0x21/0x70
[ 1860.783826]  [<ffffffffa95fb9dd>] kasan_report_error+0x1ed/0x4e0
[ 1860.790640]  [<ffffffffa9b9b841>] ? csum_partial+0x11/0x20
[ 1860.796871]  [<ffffffffaa44a6b9>] ? csum_partial_ext+0x9/0x10
[ 1860.803571]  [<ffffffffaa453155>] ? __skb_checksum+0x115/0x8d0
[ 1860.810370]  [<ffffffffa95fbe81>] __asan_report_load8_noabort+0x61/0x70
[ 1860.818263]  [<ffffffffaa49c3fe>] ? 
__netif_receive_skb_core+0x1ebe/0x29a0
[ 1860.826215]  [<ffffffffaa49c3fe>] __netif_receive_skb_core+0x1ebe/0x29a0
[ 1860.833991]  [<ffffffffaa49a540>] ? netdev_info+0x100/0x100
[ 1860.840529]  [<ffffffffaa671792>] ? udp4_gro_receive+0x802/0x1090
[ 1860.847783]  [<ffffffffa9bb9a08>] ? find_next_bit+0x18/0x20
[ 1860.854126]  [<ffffffffaa49cf04>] __netif_receive_skb+0x24/0x150
[ 1860.861695]  [<ffffffffaa49d0d1>] netif_receive_skb_internal+0xa1/0x1d0
[ 1860.869366]  [<ffffffffaa49d030>] ? __netif_receive_skb+0x150/0x150
[ 1860.876464]  [<ffffffffaa49f7e9>] ? dev_gro_receive+0x969/0x1660
[ 1860.883924]  [<ffffffffaa4a0e1f>] napi_gro_receive+0x1df/0x300
[ 1860.890744]  [<ffffffffc02e885d>] mlx5e_handle_rx_cqe_rep+0x83d/0xd30 
[mlx5_core]

checking with gdb

(gdb) l *(__netif_receive_skb_core+0x1ebe)
0xffffffff8249c3fe is in __netif_receive_skb_core (net/core/dev.c:3937).
3932                    *pt_prev = NULL;
3933            }
3934
3935            qdisc_skb_cb(skb)->pkt_len = skb->len;
3936            skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3937            qdisc_bstats_cpu_update(cl->q, skb);
3938
3939            switch (tc_classify(skb, cl, &cl_res, false)) {
3940            case TC_ACT_OK:
3941            case TC_ACT_RECLASSIFY:


Thanks,
Roi



On 24/11/2016 03:58, Cong Wang wrote:
> Roi reported we could have a race condition where in ->classify() path
> we dereference tp->root and meanwhile a parallel ->destroy() makes it
> a NULL.
>
> This is possible because ->destroy() could be called when deleting
> a filter to check if we are the last one in tp, this tp is still
> linked and visible at that time.
>
> The root cause of this problem is the semantic of ->destroy(), it
> does two things (for non-force case):
>
> 1) check if tp is empty
> 2) if tp is empty we could really destroy it
>
> and its caller, if cares, needs to check its return value to see if
> it is really destroyed. Therefore we can't unlink tp unless we know
> it is empty.
>
> As suggested by Daniel, we could actually move the test logic to ->delete()
> so that we can safely unlink tp after ->delete() tells us the last one is
> just deleted and before ->destroy().
>
> What's more, even we unlink it before ->destroy(), it could still have
> readers since we don't wait for a grace period here, we should not modify
> tp->root in ->destroy() either.
>
> Fixes: 1e052be69d04 ("net_sched: destroy proto tp when all filters are gone")
> Reported-by: Roi Dayan <roid@mellanox.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
>   include/net/sch_generic.h |  6 ++--
>   net/sched/cls_api.c       | 18 +++++++-----
>   net/sched/cls_basic.c     | 11 +++-----
>   net/sched/cls_bpf.c       | 11 +++-----
>   net/sched/cls_cgroup.c    | 12 ++------
>   net/sched/cls_flow.c      | 11 +++-----
>   net/sched/cls_flower.c    | 10 ++-----
>   net/sched/cls_fw.c        | 30 +++++++++++---------
>   net/sched/cls_matchall.c  | 10 ++-----
>   net/sched/cls_route.c     | 30 ++++++++++----------
>   net/sched/cls_rsvp.h      | 34 +++++++++++------------
>   net/sched/cls_tcindex.c   | 15 +++++-----
>   net/sched/cls_u32.c       | 71 +++++++++++++++++++++++++++--------------------
>   net/sched/sch_api.c       | 14 ++++------
>   14 files changed, 137 insertions(+), 146 deletions(-)
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index e6aa0a2..27cd1bd 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -203,14 +203,14 @@ struct tcf_proto_ops {
>   					    const struct tcf_proto *,
>   					    struct tcf_result *);
>   	int			(*init)(struct tcf_proto*);
> -	bool			(*destroy)(struct tcf_proto*, bool);
> +	void			(*destroy)(struct tcf_proto*);
>   
>   	unsigned long		(*get)(struct tcf_proto*, u32 handle);
>   	int			(*change)(struct net *net, struct sk_buff *,
>   					struct tcf_proto*, unsigned long,
>   					u32 handle, struct nlattr **,
>   					unsigned long *, bool);
> -	int			(*delete)(struct tcf_proto*, unsigned long);
> +	int			(*delete)(struct tcf_proto*, unsigned long, bool*);
>   	void			(*walk)(struct tcf_proto*, struct tcf_walker *arg);
>   
>   	/* rtnetlink specific */
> @@ -405,7 +405,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
>   				const struct Qdisc_ops *ops, u32 parentid);
>   void __qdisc_calculate_pkt_len(struct sk_buff *skb,
>   			       const struct qdisc_size_table *stab);
> -bool tcf_destroy(struct tcf_proto *tp, bool force);
> +void tcf_destroy(struct tcf_proto *tp);
>   void tcf_destroy_chain(struct tcf_proto __rcu **fl);
>   int skb_do_redirect(struct sk_buff *);
>   
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 8e93d4a..f159aeb 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -321,7 +321,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
>   
>   			tfilter_notify(net, skb, n, tp, fh,
>   				       RTM_DELTFILTER, false);
> -			tcf_destroy(tp, true);
> +			tcf_destroy(tp);
>   			err = 0;
>   			goto errout;
>   		}
> @@ -331,25 +331,29 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
>   		    !(n->nlmsg_flags & NLM_F_CREATE))
>   			goto errout;
>   	} else {
> +		bool last;
> +
>   		switch (n->nlmsg_type) {
>   		case RTM_NEWTFILTER:
>   			err = -EEXIST;
>   			if (n->nlmsg_flags & NLM_F_EXCL) {
>   				if (tp_created)
> -					tcf_destroy(tp, true);
> +					tcf_destroy(tp);
>   				goto errout;
>   			}
>   			break;
>   		case RTM_DELTFILTER:
> -			err = tp->ops->delete(tp, fh);
> +			err = tp->ops->delete(tp, fh, &last);
>   			if (err == 0) {
> -				struct tcf_proto *next = rtnl_dereference(tp->next);
> -
>   				tfilter_notify(net, skb, n, tp,
>   					       t->tcm_handle,
>   					       RTM_DELTFILTER, false);
> -				if (tcf_destroy(tp, false))
> +				if (last) {
> +					struct tcf_proto *next = rtnl_dereference(tp->next);
> +
>   					RCU_INIT_POINTER(*back, next);
> +					tcf_destroy(tp);
> +				}
>   			}
>   			goto errout;
>   		case RTM_GETTFILTER:
> @@ -372,7 +376,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
>   		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
>   	} else {
>   		if (tp_created)
> -			tcf_destroy(tp, true);
> +			tcf_destroy(tp);
>   	}
>   
>   errout:
> diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
> index eb219b7..dd63230 100644
> --- a/net/sched/cls_basic.c
> +++ b/net/sched/cls_basic.c
> @@ -96,31 +96,28 @@ static void basic_delete_filter(struct rcu_head *head)
>   	kfree(f);
>   }
>   
> -static bool basic_destroy(struct tcf_proto *tp, bool force)
> +static void basic_destroy(struct tcf_proto *tp)
>   {
>   	struct basic_head *head = rtnl_dereference(tp->root);
>   	struct basic_filter *f, *n;
>   
> -	if (!force && !list_empty(&head->flist))
> -		return false;
> -
>   	list_for_each_entry_safe(f, n, &head->flist, link) {
>   		list_del_rcu(&f->link);
>   		tcf_unbind_filter(tp, &f->res);
>   		call_rcu(&f->rcu, basic_delete_filter);
>   	}
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
> -static int basic_delete(struct tcf_proto *tp, unsigned long arg)
> +static int basic_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
> +	struct basic_head *head = rtnl_dereference(tp->root);
>   	struct basic_filter *f = (struct basic_filter *) arg;
>   
>   	list_del_rcu(&f->link);
>   	tcf_unbind_filter(tp, &f->res);
>   	call_rcu(&f->rcu, basic_delete_filter);
> +	*last = list_empty(&head->flist);
>   	return 0;
>   }
>   
> diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
> index 52dc85a..770984c0 100644
> --- a/net/sched/cls_bpf.c
> +++ b/net/sched/cls_bpf.c
> @@ -265,26 +265,25 @@ static void __cls_bpf_delete_prog(struct rcu_head *rcu)
>   	cls_bpf_delete_prog(prog->tp, prog);
>   }
>   
> -static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
> +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
> +	struct cls_bpf_head *head = rtnl_dereference(tp->root);
>   
>   	cls_bpf_stop_offload(tp, prog);
>   	list_del_rcu(&prog->link);
>   	tcf_unbind_filter(tp, &prog->res);
>   	call_rcu(&prog->rcu, __cls_bpf_delete_prog);
> +	*last = list_empty(&head->plist);
>   
>   	return 0;
>   }
>   
> -static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
> +static void cls_bpf_destroy(struct tcf_proto *tp)
>   {
>   	struct cls_bpf_head *head = rtnl_dereference(tp->root);
>   	struct cls_bpf_prog *prog, *tmp;
>   
> -	if (!force && !list_empty(&head->plist))
> -		return false;
> -
>   	list_for_each_entry_safe(prog, tmp, &head->plist, link) {
>   		cls_bpf_stop_offload(tp, prog);
>   		list_del_rcu(&prog->link);
> @@ -292,9 +291,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
>   		call_rcu(&prog->rcu, __cls_bpf_delete_prog);
>   	}
>   
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
>   static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
> diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
> index 85233c47..fa9405e 100644
> --- a/net/sched/cls_cgroup.c
> +++ b/net/sched/cls_cgroup.c
> @@ -131,21 +131,15 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
>   	return err;
>   }
>   
> -static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force)
> +static void cls_cgroup_destroy(struct tcf_proto *tp)
>   {
>   	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
>   
> -	if (!force)
> -		return false;
> -
> -	if (head) {
> -		RCU_INIT_POINTER(tp->root, NULL);
> +	if (head)
>   		call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
> -	}
> -	return true;
>   }
>   
> -static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
> +static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	return -EOPNOTSUPP;
>   }
> diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
> index e396723..ea2be75 100644
> --- a/net/sched/cls_flow.c
> +++ b/net/sched/cls_flow.c
> @@ -563,12 +563,14 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
>   	return err;
>   }
>   
> -static int flow_delete(struct tcf_proto *tp, unsigned long arg)
> +static int flow_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
> +	struct flow_head *head = rtnl_dereference(tp->root);
>   	struct flow_filter *f = (struct flow_filter *)arg;
>   
>   	list_del_rcu(&f->list);
>   	call_rcu(&f->rcu, flow_destroy_filter);
> +	*last = list_empty(&head->filters);
>   	return 0;
>   }
>   
> @@ -584,21 +586,16 @@ static int flow_init(struct tcf_proto *tp)
>   	return 0;
>   }
>   
> -static bool flow_destroy(struct tcf_proto *tp, bool force)
> +static void flow_destroy(struct tcf_proto *tp)
>   {
>   	struct flow_head *head = rtnl_dereference(tp->root);
>   	struct flow_filter *f, *next;
>   
> -	if (!force && !list_empty(&head->filters))
> -		return false;
> -
>   	list_for_each_entry_safe(f, next, &head->filters, list) {
>   		list_del_rcu(&f->list);
>   		call_rcu(&f->rcu, flow_destroy_filter);
>   	}
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
>   static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index e8dd09a..495d63224 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -280,21 +280,16 @@ static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
>   	call_rcu(&f->rcu, fl_destroy_filter);
>   }
>   
> -static bool fl_destroy(struct tcf_proto *tp, bool force)
> +static void fl_destroy(struct tcf_proto *tp)
>   {
>   	struct cls_fl_head *head = rtnl_dereference(tp->root);
>   	struct cls_fl_filter *f, *next;
>   
> -	if (!force && !list_empty(&head->filters))
> -		return false;
> -
>   	list_for_each_entry_safe(f, next, &head->filters, list)
>   		__fl_delete(tp, f);
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	if (head->mask_assigned)
>   		rhashtable_destroy(&head->ht);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
>   static unsigned long fl_get(struct tcf_proto *tp, u32 handle)
> @@ -777,7 +772,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>   	return err;
>   }
>   
> -static int fl_delete(struct tcf_proto *tp, unsigned long arg)
> +static int fl_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct cls_fl_head *head = rtnl_dereference(tp->root);
>   	struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
> @@ -785,6 +780,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
>   	rhashtable_remove_fast(&head->ht, &f->ht_node,
>   			       head->ht_params);
>   	__fl_delete(tp, f);
> +	*last = list_empty(&head->filters);
>   	return 0;
>   }
>   
> diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
> index 9dc63d5..bc8ceb7 100644
> --- a/net/sched/cls_fw.c
> +++ b/net/sched/cls_fw.c
> @@ -127,20 +127,14 @@ static void fw_delete_filter(struct rcu_head *head)
>   	kfree(f);
>   }
>   
> -static bool fw_destroy(struct tcf_proto *tp, bool force)
> +static void fw_destroy(struct tcf_proto *tp)
>   {
>   	struct fw_head *head = rtnl_dereference(tp->root);
>   	struct fw_filter *f;
>   	int h;
>   
>   	if (head == NULL)
> -		return true;
> -
> -	if (!force) {
> -		for (h = 0; h < HTSIZE; h++)
> -			if (rcu_access_pointer(head->ht[h]))
> -				return false;
> -	}
> +		return;
>   
>   	for (h = 0; h < HTSIZE; h++) {
>   		while ((f = rtnl_dereference(head->ht[h])) != NULL) {
> @@ -150,17 +144,17 @@ static bool fw_destroy(struct tcf_proto *tp, bool force)
>   			call_rcu(&f->rcu, fw_delete_filter);
>   		}
>   	}
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
> -static int fw_delete(struct tcf_proto *tp, unsigned long arg)
> +static int fw_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct fw_head *head = rtnl_dereference(tp->root);
>   	struct fw_filter *f = (struct fw_filter *)arg;
>   	struct fw_filter __rcu **fp;
>   	struct fw_filter *pfp;
> +	int ret = -EINVAL;
> +	int h;
>   
>   	if (head == NULL || f == NULL)
>   		goto out;
> @@ -173,11 +167,21 @@ static int fw_delete(struct tcf_proto *tp, unsigned long arg)
>   			RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
>   			tcf_unbind_filter(tp, &f->res);
>   			call_rcu(&f->rcu, fw_delete_filter);
> -			return 0;
> +			ret = 0;
> +			break;
>   		}
>   	}
> +
> +	*last = true;
> +	for (h = 0; h < HTSIZE; h++) {
> +		if (rcu_access_pointer(head->ht[h])) {
> +			*last = false;
> +			break;
> +		}
> +	}
> +
>   out:
> -	return -EINVAL;
> +	return ret;
>   }
>   
>   static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
> diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
> index 25927b6..7d54805 100644
> --- a/net/sched/cls_matchall.c
> +++ b/net/sched/cls_matchall.c
> @@ -99,24 +99,19 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
>   					     &offload);
>   }
>   
> -static bool mall_destroy(struct tcf_proto *tp, bool force)
> +static void mall_destroy(struct tcf_proto *tp)
>   {
>   	struct cls_mall_head *head = rtnl_dereference(tp->root);
>   	struct net_device *dev = tp->q->dev_queue->dev;
>   	struct cls_mall_filter *f = head->filter;
>   
> -	if (!force && f)
> -		return false;
> -
>   	if (f) {
>   		if (tc_should_offload(dev, tp, f->flags))
>   			mall_destroy_hw_filter(tp, f, (unsigned long) f);
>   
>   		call_rcu(&f->rcu, mall_destroy_filter);
>   	}
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
>   static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
> @@ -225,7 +220,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
>   	return err;
>   }
>   
> -static int mall_delete(struct tcf_proto *tp, unsigned long arg)
> +static int mall_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct cls_mall_head *head = rtnl_dereference(tp->root);
>   	struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
> @@ -237,6 +232,7 @@ static int mall_delete(struct tcf_proto *tp, unsigned long arg)
>   	RCU_INIT_POINTER(head->filter, NULL);
>   	tcf_unbind_filter(tp, &f->res);
>   	call_rcu(&f->rcu, mall_destroy_filter);
> +	*last = true;
>   	return 0;
>   }
>   
> diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
> index 455fc8f..1a38e41 100644
> --- a/net/sched/cls_route.c
> +++ b/net/sched/cls_route.c
> @@ -276,20 +276,13 @@ static void route4_delete_filter(struct rcu_head *head)
>   	kfree(f);
>   }
>   
> -static bool route4_destroy(struct tcf_proto *tp, bool force)
> +static void route4_destroy(struct tcf_proto *tp)
>   {
>   	struct route4_head *head = rtnl_dereference(tp->root);
>   	int h1, h2;
>   
>   	if (head == NULL)
> -		return true;
> -
> -	if (!force) {
> -		for (h1 = 0; h1 <= 256; h1++) {
> -			if (rcu_access_pointer(head->table[h1]))
> -				return false;
> -		}
> -	}
> +		return;
>   
>   	for (h1 = 0; h1 <= 256; h1++) {
>   		struct route4_bucket *b;
> @@ -312,12 +305,10 @@ static bool route4_destroy(struct tcf_proto *tp, bool force)
>   			kfree_rcu(b, rcu);
>   		}
>   	}
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	kfree_rcu(head, rcu);
> -	return true;
>   }
>   
> -static int route4_delete(struct tcf_proto *tp, unsigned long arg)
> +static int route4_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct route4_head *head = rtnl_dereference(tp->root);
>   	struct route4_filter *f = (struct route4_filter *)arg;
> @@ -325,7 +316,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
>   	struct route4_filter *nf;
>   	struct route4_bucket *b;
>   	unsigned int h = 0;
> -	int i;
> +	int i, h1;
>   
>   	if (!head || !f)
>   		return -EINVAL;
> @@ -356,16 +347,25 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
>   
>   				rt = rtnl_dereference(b->ht[i]);
>   				if (rt)
> -					return 0;
> +					goto out;
>   			}
>   
>   			/* OK, session has no flows */
>   			RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
>   			kfree_rcu(b, rcu);
> +			break;
> +		}
> +	}
>   
> -			return 0;
> +out:
> +	*last = true;
> +	for (h1 = 0; h1 <= 256; h1++) {
> +		if (rcu_access_pointer(head->table[h1])) {
> +			*last = false;
> +			break;
>   		}
>   	}
> +
>   	return 0;
>   }
>   
> diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
> index 4f05a19..e8ba81a 100644
> --- a/net/sched/cls_rsvp.h
> +++ b/net/sched/cls_rsvp.h
> @@ -301,22 +301,13 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
>   	call_rcu(&f->rcu, rsvp_delete_filter_rcu);
>   }
>   
> -static bool rsvp_destroy(struct tcf_proto *tp, bool force)
> +static void rsvp_destroy(struct tcf_proto *tp)
>   {
>   	struct rsvp_head *data = rtnl_dereference(tp->root);
>   	int h1, h2;
>   
>   	if (data == NULL)
> -		return true;
> -
> -	if (!force) {
> -		for (h1 = 0; h1 < 256; h1++) {
> -			if (rcu_access_pointer(data->ht[h1]))
> -				return false;
> -		}
> -	}
> -
> -	RCU_INIT_POINTER(tp->root, NULL);
> +		return;
>   
>   	for (h1 = 0; h1 < 256; h1++) {
>   		struct rsvp_session *s;
> @@ -336,10 +327,9 @@ static bool rsvp_destroy(struct tcf_proto *tp, bool force)
>   		}
>   	}
>   	kfree_rcu(data, rcu);
> -	return true;
>   }
>   
> -static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
> +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct rsvp_head *head = rtnl_dereference(tp->root);
>   	struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg;
> @@ -347,7 +337,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
>   	unsigned int h = f->handle;
>   	struct rsvp_session __rcu **sp;
>   	struct rsvp_session *nsp, *s = f->sess;
> -	int i;
> +	int i, h1;
>   
>   	fp = &s->ht[(h >> 8) & 0xFF];
>   	for (nfp = rtnl_dereference(*fp); nfp;
> @@ -360,7 +350,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
>   
>   			for (i = 0; i <= 16; i++)
>   				if (s->ht[i])
> -					return 0;
> +					goto out;
>   
>   			/* OK, session has no flows */
>   			sp = &head->ht[h & 0xFF];
> @@ -369,13 +359,23 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
>   				if (nsp == s) {
>   					RCU_INIT_POINTER(*sp, s->next);
>   					kfree_rcu(s, rcu);
> -					return 0;
> +					goto out;
>   				}
>   			}
>   
> -			return 0;
> +			break;
>   		}
>   	}
> +
> +out:
> +	*last = true;
> +	for (h1 = 0; h1 < 256; h1++) {
> +		if (rcu_access_pointer(head->ht[h1])) {
> +			*last = false;
> +			break;
> +		}
> +	}
> +
>   	return 0;
>   }
>   
> diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
> index 96144bd..9149a03 100644
> --- a/net/sched/cls_tcindex.c
> +++ b/net/sched/cls_tcindex.c
> @@ -150,7 +150,7 @@ static void tcindex_destroy_fexts(struct rcu_head *head)
>   	kfree(f);
>   }
>   
> -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
> +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct tcindex_data *p = rtnl_dereference(tp->root);
>   	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
> @@ -186,6 +186,8 @@ static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
>   		call_rcu(&f->rcu, tcindex_destroy_fexts);
>   	else
>   		call_rcu(&r->rcu, tcindex_destroy_rexts);
> +
> +	*last = false;
>   	return 0;
>   }
>   
> @@ -193,7 +195,9 @@ static int tcindex_destroy_element(struct tcf_proto *tp,
>   				   unsigned long arg,
>   				   struct tcf_walker *walker)
>   {
> -	return tcindex_delete(tp, arg);
> +	bool last;
> +
> +	return tcindex_delete(tp, arg, &last);
>   }
>   
>   static void __tcindex_destroy(struct rcu_head *head)
> @@ -529,23 +533,18 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
>   	}
>   }
>   
> -static bool tcindex_destroy(struct tcf_proto *tp, bool force)
> +static void tcindex_destroy(struct tcf_proto *tp)
>   {
>   	struct tcindex_data *p = rtnl_dereference(tp->root);
>   	struct tcf_walker walker;
>   
> -	if (!force)
> -		return false;
> -
>   	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
>   	walker.count = 0;
>   	walker.skip = 0;
>   	walker.fn = tcindex_destroy_element;
>   	tcindex_walk(tp, &walker);
>   
> -	RCU_INIT_POINTER(tp->root, NULL);
>   	call_rcu(&p->rcu, __tcindex_destroy);
> -	return true;
>   }
>   
>   
> diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
> index ae83c3ae..787573b 100644
> --- a/net/sched/cls_u32.c
> +++ b/net/sched/cls_u32.c
> @@ -582,37 +582,13 @@ static bool ht_empty(struct tc_u_hnode *ht)
>   	return true;
>   }
>   
> -static bool u32_destroy(struct tcf_proto *tp, bool force)
> +static void u32_destroy(struct tcf_proto *tp)
>   {
>   	struct tc_u_common *tp_c = tp->data;
>   	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
>   
>   	WARN_ON(root_ht == NULL);
>   
> -	if (!force) {
> -		if (root_ht) {
> -			if (root_ht->refcnt > 1)
> -				return false;
> -			if (root_ht->refcnt == 1) {
> -				if (!ht_empty(root_ht))
> -					return false;
> -			}
> -		}
> -
> -		if (tp_c->refcnt > 1)
> -			return false;
> -
> -		if (tp_c->refcnt == 1) {
> -			struct tc_u_hnode *ht;
> -
> -			for (ht = rtnl_dereference(tp_c->hlist);
> -			     ht;
> -			     ht = rtnl_dereference(ht->next))
> -				if (!ht_empty(ht))
> -					return false;
> -		}
> -	}
> -
>   	if (root_ht && --root_ht->refcnt == 0)
>   		u32_destroy_hnode(tp, root_ht);
>   
> @@ -637,20 +613,22 @@ static bool u32_destroy(struct tcf_proto *tp, bool force)
>   	}
>   
>   	tp->data = NULL;
> -	return true;
>   }
>   
> -static int u32_delete(struct tcf_proto *tp, unsigned long arg)
> +static int u32_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
>   {
>   	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
>   	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
> +	struct tc_u_common *tp_c = tp->data;
> +	int ret = 0;
>   
>   	if (ht == NULL)
> -		return 0;
> +		goto out;
>   
>   	if (TC_U32_KEY(ht->handle)) {
>   		u32_remove_hw_knode(tp, ht->handle);
> -		return u32_delete_key(tp, (struct tc_u_knode *)ht);
> +		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
> +		goto out;
>   	}
>   
>   	if (root_ht == ht)
> @@ -663,7 +641,40 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
>   		return -EBUSY;
>   	}
>   
> -	return 0;
> +out:
> +	*last = true;
> +	if (root_ht) {
> +		if (root_ht->refcnt > 1) {
> +			*last = false;
> +			goto ret;
> +		}
> +		if (root_ht->refcnt == 1) {
> +			if (!ht_empty(root_ht)) {
> +				*last = false;
> +				goto ret;
> +			}
> +		}
> +	}
> +
> +	if (tp_c->refcnt > 1) {
> +		*last = false;
> +		goto ret;
> +	}
> +
> +	if (tp_c->refcnt == 1) {
> +		struct tc_u_hnode *ht;
> +
> +		for (ht = rtnl_dereference(tp_c->hlist);
> +		     ht;
> +		     ht = rtnl_dereference(ht->next))
> +			if (!ht_empty(ht)) {
> +				*last = false;
> +				break;
> +			}
> +	}
> +
> +ret:
> +	return ret;
>   }
>   
>   #define NR_U32_NODE (1<<12)
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index f337f1b..7fc48b3 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -1899,15 +1899,11 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
>   }
>   EXPORT_SYMBOL(tc_classify);
>   
> -bool tcf_destroy(struct tcf_proto *tp, bool force)
> +void tcf_destroy(struct tcf_proto *tp)
>   {
> -	if (tp->ops->destroy(tp, force)) {
> -		module_put(tp->ops->owner);
> -		kfree_rcu(tp, rcu);
> -		return true;
> -	}
> -
> -	return false;
> +	tp->ops->destroy(tp);
> +	module_put(tp->ops->owner);
> +	kfree_rcu(tp, rcu);
>   }
>   
>   void tcf_destroy_chain(struct tcf_proto __rcu **fl)
> @@ -1916,7 +1912,7 @@ void tcf_destroy_chain(struct tcf_proto __rcu **fl)
>   
>   	while ((tp = rtnl_dereference(*fl)) != NULL) {
>   		RCU_INIT_POINTER(*fl, tp->next);
> -		tcf_destroy(tp, true);
> +		tcf_destroy(tp);
>   	}
>   }
>   EXPORT_SYMBOL(tcf_destroy_chain);

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24  8:29 ` Roi Dayan
@ 2016-11-24 10:14   ` Daniel Borkmann
  2016-11-24 11:01     ` Roi Dayan
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Borkmann @ 2016-11-24 10:14 UTC (permalink / raw)
  To: Roi Dayan, Cong Wang, netdev; +Cc: jiri, John Fastabend

On 11/24/2016 09:29 AM, Roi Dayan wrote:
> Hi,
>
> I'm testing this patch with KASAN enabled and got into a new kernel crash I didn't hit before.
>
> [ 1860.725065] ==================================================================
> [ 1860.733893] BUG: KASAN: use-after-free in __netif_receive_skb_core+0x1ebe/0x29a0 at addr ffff880a68b04028
> [ 1860.745415] Read of size 8 by task CPU 0/KVM/5334
> [ 1860.751368] CPU: 8 PID: 5334 Comm: CPU 0/KVM Tainted: G O 4.9.0-rc3+ #18
> [ 1860.760547] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
> [ 1860.768036] Call Trace:
> [ 1860.771307]  [<ffffffffa9b6dc42>] dump_stack+0x63/0x81
> [ 1860.777167]  [<ffffffffa95fb751>] kasan_object_err+0x21/0x70
> [ 1860.783826]  [<ffffffffa95fb9dd>] kasan_report_error+0x1ed/0x4e0
> [ 1860.790640]  [<ffffffffa9b9b841>] ? csum_partial+0x11/0x20
> [ 1860.796871]  [<ffffffffaa44a6b9>] ? csum_partial_ext+0x9/0x10
> [ 1860.803571]  [<ffffffffaa453155>] ? __skb_checksum+0x115/0x8d0
> [ 1860.810370]  [<ffffffffa95fbe81>] __asan_report_load8_noabort+0x61/0x70
> [ 1860.818263]  [<ffffffffaa49c3fe>] ? __netif_receive_skb_core+0x1ebe/0x29a0
> [ 1860.826215]  [<ffffffffaa49c3fe>] __netif_receive_skb_core+0x1ebe/0x29a0
> [ 1860.833991]  [<ffffffffaa49a540>] ? netdev_info+0x100/0x100
> [ 1860.840529]  [<ffffffffaa671792>] ? udp4_gro_receive+0x802/0x1090
> [ 1860.847783]  [<ffffffffa9bb9a08>] ? find_next_bit+0x18/0x20
> [ 1860.854126]  [<ffffffffaa49cf04>] __netif_receive_skb+0x24/0x150
> [ 1860.861695]  [<ffffffffaa49d0d1>] netif_receive_skb_internal+0xa1/0x1d0
> [ 1860.869366]  [<ffffffffaa49d030>] ? __netif_receive_skb+0x150/0x150
> [ 1860.876464]  [<ffffffffaa49f7e9>] ? dev_gro_receive+0x969/0x1660
> [ 1860.883924]  [<ffffffffaa4a0e1f>] napi_gro_receive+0x1df/0x300
> [ 1860.890744]  [<ffffffffc02e885d>] mlx5e_handle_rx_cqe_rep+0x83d/0xd30 [mlx5_core]
>
> checking with gdb
>
> (gdb) l *(__netif_receive_skb_core+0x1ebe)
> 0xffffffff8249c3fe is in __netif_receive_skb_core (net/core/dev.c:3937).
> 3932                    *pt_prev = NULL;
> 3933            }
> 3934
> 3935            qdisc_skb_cb(skb)->pkt_len = skb->len;
> 3936            skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
> 3937            qdisc_bstats_cpu_update(cl->q, skb);
> 3938
> 3939            switch (tc_classify(skb, cl, &cl_res, false)) {
> 3940            case TC_ACT_OK:
> 3941            case TC_ACT_RECLASSIFY:

Can you elaborate some more on your test-case? Adding/dropping ingress qdisc with
some classifier on it in a loop while traffic goes through?

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24 10:14   ` Daniel Borkmann
@ 2016-11-24 11:01     ` Roi Dayan
  2016-11-24 15:20       ` Daniel Borkmann
  0 siblings, 1 reply; 16+ messages in thread
From: Roi Dayan @ 2016-11-24 11:01 UTC (permalink / raw)
  To: Daniel Borkmann, Cong Wang, netdev; +Cc: roid, jiri, John Fastabend


On 24/11/2016 12:14, Daniel Borkmann wrote:
> On 11/24/2016 09:29 AM, Roi Dayan wrote:
>> Hi,
>>
>> I'm testing this patch with KASAN enabled and got into a new kernel 
>> crash I didn't hit before.
>>
>> [ 1860.725065] 
>> ==================================================================
>> [ 1860.733893] BUG: KASAN: use-after-free in 
>> __netif_receive_skb_core+0x1ebe/0x29a0 at addr ffff880a68b04028
>> [ 1860.745415] Read of size 8 by task CPU 0/KVM/5334
>> [ 1860.751368] CPU: 8 PID: 5334 Comm: CPU 0/KVM Tainted: G O 
>> 4.9.0-rc3+ #18
>> [ 1860.760547] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 
>> 07/01/2015
>> [ 1860.768036] Call Trace:
>> [ 1860.771307]  [<ffffffffa9b6dc42>] dump_stack+0x63/0x81
>> [ 1860.777167]  [<ffffffffa95fb751>] kasan_object_err+0x21/0x70
>> [ 1860.783826]  [<ffffffffa95fb9dd>] kasan_report_error+0x1ed/0x4e0
>> [ 1860.790640]  [<ffffffffa9b9b841>] ? csum_partial+0x11/0x20
>> [ 1860.796871]  [<ffffffffaa44a6b9>] ? csum_partial_ext+0x9/0x10
>> [ 1860.803571]  [<ffffffffaa453155>] ? __skb_checksum+0x115/0x8d0
>> [ 1860.810370]  [<ffffffffa95fbe81>] 
>> __asan_report_load8_noabort+0x61/0x70
>> [ 1860.818263]  [<ffffffffaa49c3fe>] ? 
>> __netif_receive_skb_core+0x1ebe/0x29a0
>> [ 1860.826215]  [<ffffffffaa49c3fe>] 
>> __netif_receive_skb_core+0x1ebe/0x29a0
>> [ 1860.833991]  [<ffffffffaa49a540>] ? netdev_info+0x100/0x100
>> [ 1860.840529]  [<ffffffffaa671792>] ? udp4_gro_receive+0x802/0x1090
>> [ 1860.847783]  [<ffffffffa9bb9a08>] ? find_next_bit+0x18/0x20
>> [ 1860.854126]  [<ffffffffaa49cf04>] __netif_receive_skb+0x24/0x150
>> [ 1860.861695]  [<ffffffffaa49d0d1>] 
>> netif_receive_skb_internal+0xa1/0x1d0
>> [ 1860.869366]  [<ffffffffaa49d030>] ? __netif_receive_skb+0x150/0x150
>> [ 1860.876464]  [<ffffffffaa49f7e9>] ? dev_gro_receive+0x969/0x1660
>> [ 1860.883924]  [<ffffffffaa4a0e1f>] napi_gro_receive+0x1df/0x300
>> [ 1860.890744]  [<ffffffffc02e885d>] 
>> mlx5e_handle_rx_cqe_rep+0x83d/0xd30 [mlx5_core]
>>
>> checking with gdb
>>
>> (gdb) l *(__netif_receive_skb_core+0x1ebe)
>> 0xffffffff8249c3fe is in __netif_receive_skb_core (net/core/dev.c:3937).
>> 3932                    *pt_prev = NULL;
>> 3933            }
>> 3934
>> 3935            qdisc_skb_cb(skb)->pkt_len = skb->len;
>> 3936            skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
>> 3937            qdisc_bstats_cpu_update(cl->q, skb);
>> 3938
>> 3939            switch (tc_classify(skb, cl, &cl_res, false)) {
>> 3940            case TC_ACT_OK:
>> 3941            case TC_ACT_RECLASSIFY:
>
> Can you elaborate some more on your test-case? Adding/dropping ingress 
> qdisc with
> some classifier on it in a loop while traffic goes through?

I first delete the qdisc ingress from the relevant interface
I start traffic on it then I add the qdisc ingress to the relevant 
interface and start adding tc flower rules to match the traffic.


>
> Thanks,
> Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24 11:01     ` Roi Dayan
@ 2016-11-24 15:20       ` Daniel Borkmann
  2016-11-24 17:18         ` Daniel Borkmann
  2016-11-26  6:46         ` Cong Wang
  0 siblings, 2 replies; 16+ messages in thread
From: Daniel Borkmann @ 2016-11-24 15:20 UTC (permalink / raw)
  To: Roi Dayan, Cong Wang, netdev; +Cc: jiri, John Fastabend

On 11/24/2016 12:01 PM, Roi Dayan wrote:
> On 24/11/2016 12:14, Daniel Borkmann wrote:
>> On 11/24/2016 09:29 AM, Roi Dayan wrote:
>>> Hi,
>>>
>>> I'm testing this patch with KASAN enabled and got into a new kernel crash I didn't hit before.
>>>
>>> [ 1860.725065] ==================================================================
>>> [ 1860.733893] BUG: KASAN: use-after-free in __netif_receive_skb_core+0x1ebe/0x29a0 at addr ffff880a68b04028
>>> [ 1860.745415] Read of size 8 by task CPU 0/KVM/5334
>>> [ 1860.751368] CPU: 8 PID: 5334 Comm: CPU 0/KVM Tainted: G O 4.9.0-rc3+ #18

(Btw, your kernel is tainted with o-o-tree module? Anything relevant?)

>>> [ 1860.760547] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
>>> [ 1860.768036] Call Trace:
>>> [ 1860.771307]  [<ffffffffa9b6dc42>] dump_stack+0x63/0x81
>>> [ 1860.777167]  [<ffffffffa95fb751>] kasan_object_err+0x21/0x70
>>> [ 1860.783826]  [<ffffffffa95fb9dd>] kasan_report_error+0x1ed/0x4e0
>>> [ 1860.790640]  [<ffffffffa9b9b841>] ? csum_partial+0x11/0x20
>>> [ 1860.796871]  [<ffffffffaa44a6b9>] ? csum_partial_ext+0x9/0x10
>>> [ 1860.803571]  [<ffffffffaa453155>] ? __skb_checksum+0x115/0x8d0
>>> [ 1860.810370]  [<ffffffffa95fbe81>] __asan_report_load8_noabort+0x61/0x70
>>> [ 1860.818263]  [<ffffffffaa49c3fe>] ? __netif_receive_skb_core+0x1ebe/0x29a0
>>> [ 1860.826215]  [<ffffffffaa49c3fe>] __netif_receive_skb_core+0x1ebe/0x29a0
>>> [ 1860.833991]  [<ffffffffaa49a540>] ? netdev_info+0x100/0x100
>>> [ 1860.840529]  [<ffffffffaa671792>] ? udp4_gro_receive+0x802/0x1090
>>> [ 1860.847783]  [<ffffffffa9bb9a08>] ? find_next_bit+0x18/0x20
>>> [ 1860.854126]  [<ffffffffaa49cf04>] __netif_receive_skb+0x24/0x150
>>> [ 1860.861695]  [<ffffffffaa49d0d1>] netif_receive_skb_internal+0xa1/0x1d0
>>> [ 1860.869366]  [<ffffffffaa49d030>] ? __netif_receive_skb+0x150/0x150
>>> [ 1860.876464]  [<ffffffffaa49f7e9>] ? dev_gro_receive+0x969/0x1660
>>> [ 1860.883924]  [<ffffffffaa4a0e1f>] napi_gro_receive+0x1df/0x300
>>> [ 1860.890744]  [<ffffffffc02e885d>] mlx5e_handle_rx_cqe_rep+0x83d/0xd30 [mlx5_core]
>>>
>>> checking with gdb
>>>
>>> (gdb) l *(__netif_receive_skb_core+0x1ebe)
>>> 0xffffffff8249c3fe is in __netif_receive_skb_core (net/core/dev.c:3937).
>>> 3932                    *pt_prev = NULL;
>>> 3933            }
>>> 3934
>>> 3935            qdisc_skb_cb(skb)->pkt_len = skb->len;
>>> 3936            skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
>>> 3937            qdisc_bstats_cpu_update(cl->q, skb);
>>> 3938
>>> 3939            switch (tc_classify(skb, cl, &cl_res, false)) {
>>> 3940            case TC_ACT_OK:
>>> 3941            case TC_ACT_RECLASSIFY:
>>
>> Can you elaborate some more on your test-case? Adding/dropping ingress qdisc with
>> some classifier on it in a loop while traffic goes through?
>
> I first delete the qdisc ingress from the relevant interface
> I start traffic on it then I add the qdisc ingress to the relevant interface and start adding tc flower rules to match the traffic.

Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
drops its entire chain via tcf_destroy_chain(), so that will be NULL
eventually. The tps are freed by call_rcu() as well as qdisc itself
later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
Outstanding readers should either bail out due to if (!cl) or can still
process the chain until read section ends, but during that time, cl->q
resp. bstats should be good. Do you happen to know what's at address
ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), but
at least on ingress (netif_receive_skb_internal()) we hold rcu_read_lock()
here. The KASAN report is reliably happening at this location, right?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24 15:20       ` Daniel Borkmann
@ 2016-11-24 17:18         ` Daniel Borkmann
  2016-11-26  6:46         ` Cong Wang
  1 sibling, 0 replies; 16+ messages in thread
From: Daniel Borkmann @ 2016-11-24 17:18 UTC (permalink / raw)
  To: Roi Dayan, Cong Wang, netdev; +Cc: jiri, John Fastabend

On 11/24/2016 04:20 PM, Daniel Borkmann wrote:
> On 11/24/2016 12:01 PM, Roi Dayan wrote:
>> On 24/11/2016 12:14, Daniel Borkmann wrote:
>>> On 11/24/2016 09:29 AM, Roi Dayan wrote:
>>>> Hi,
>>>>
>>>> I'm testing this patch with KASAN enabled and got into a new kernel crash I didn't hit before.
>>>>
>>>> [ 1860.725065] ==================================================================
>>>> [ 1860.733893] BUG: KASAN: use-after-free in __netif_receive_skb_core+0x1ebe/0x29a0 at addr ffff880a68b04028
>>>> [ 1860.745415] Read of size 8 by task CPU 0/KVM/5334
>>>> [ 1860.751368] CPU: 8 PID: 5334 Comm: CPU 0/KVM Tainted: G O 4.9.0-rc3+ #18
>
> (Btw, your kernel is tainted with o-o-tree module? Anything relevant?)
>
>>>> [ 1860.760547] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
>>>> [ 1860.768036] Call Trace:
>>>> [ 1860.771307]  [<ffffffffa9b6dc42>] dump_stack+0x63/0x81
>>>> [ 1860.777167]  [<ffffffffa95fb751>] kasan_object_err+0x21/0x70
>>>> [ 1860.783826]  [<ffffffffa95fb9dd>] kasan_report_error+0x1ed/0x4e0
>>>> [ 1860.790640]  [<ffffffffa9b9b841>] ? csum_partial+0x11/0x20
>>>> [ 1860.796871]  [<ffffffffaa44a6b9>] ? csum_partial_ext+0x9/0x10
>>>> [ 1860.803571]  [<ffffffffaa453155>] ? __skb_checksum+0x115/0x8d0
>>>> [ 1860.810370]  [<ffffffffa95fbe81>] __asan_report_load8_noabort+0x61/0x70
>>>> [ 1860.818263]  [<ffffffffaa49c3fe>] ? __netif_receive_skb_core+0x1ebe/0x29a0
>>>> [ 1860.826215]  [<ffffffffaa49c3fe>] __netif_receive_skb_core+0x1ebe/0x29a0
>>>> [ 1860.833991]  [<ffffffffaa49a540>] ? netdev_info+0x100/0x100
>>>> [ 1860.840529]  [<ffffffffaa671792>] ? udp4_gro_receive+0x802/0x1090
>>>> [ 1860.847783]  [<ffffffffa9bb9a08>] ? find_next_bit+0x18/0x20
>>>> [ 1860.854126]  [<ffffffffaa49cf04>] __netif_receive_skb+0x24/0x150
>>>> [ 1860.861695]  [<ffffffffaa49d0d1>] netif_receive_skb_internal+0xa1/0x1d0
>>>> [ 1860.869366]  [<ffffffffaa49d030>] ? __netif_receive_skb+0x150/0x150
>>>> [ 1860.876464]  [<ffffffffaa49f7e9>] ? dev_gro_receive+0x969/0x1660
>>>> [ 1860.883924]  [<ffffffffaa4a0e1f>] napi_gro_receive+0x1df/0x300
>>>> [ 1860.890744]  [<ffffffffc02e885d>] mlx5e_handle_rx_cqe_rep+0x83d/0xd30 [mlx5_core]
>>>>
>>>> checking with gdb
>>>>
>>>> (gdb) l *(__netif_receive_skb_core+0x1ebe)
>>>> 0xffffffff8249c3fe is in __netif_receive_skb_core (net/core/dev.c:3937).
>>>> 3932                    *pt_prev = NULL;
>>>> 3933            }
>>>> 3934
>>>> 3935            qdisc_skb_cb(skb)->pkt_len = skb->len;
>>>> 3936            skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
>>>> 3937            qdisc_bstats_cpu_update(cl->q, skb);
>>>> 3938
>>>> 3939            switch (tc_classify(skb, cl, &cl_res, false)) {
>>>> 3940            case TC_ACT_OK:
>>>> 3941            case TC_ACT_RECLASSIFY:
>>>
>>> Can you elaborate some more on your test-case? Adding/dropping ingress qdisc with
>>> some classifier on it in a loop while traffic goes through?
>>
>> I first delete the qdisc ingress from the relevant interface
>> I start traffic on it then I add the qdisc ingress to the relevant interface and start adding tc flower rules to match the traffic.
>
> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
> drops its entire chain via tcf_destroy_chain(), so that will be NULL
> eventually. The tps are freed by call_rcu() as well as qdisc itself
> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
> Outstanding readers should either bail out due to if (!cl) or can still
> process the chain until read section ends, but during that time, cl->q
> resp. bstats should be good. Do you happen to know what's at address
> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), but
> at least on ingress (netif_receive_skb_internal()) we hold rcu_read_lock()
> here. The KASAN report is reliably happening at this location, right?

Tried to reproduce this on my phys machine on top of Cong's patch and no
luck hitting above so far. I have a KASAN compiled kernel with pktgen
hitting ingress and ingress qdisc + flower filter rules added/destroyed
in a loop. Hmm, do you have a kernel config (particular RCU settings)?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24 15:20       ` Daniel Borkmann
  2016-11-24 17:18         ` Daniel Borkmann
@ 2016-11-26  6:46         ` Cong Wang
  2016-11-26 11:09           ` Daniel Borkmann
  1 sibling, 1 reply; 16+ messages in thread
From: Cong Wang @ 2016-11-26  6:46 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Roi Dayan, Linux Kernel Network Developers, Jiri Pirko, John Fastabend

On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
> drops its entire chain via tcf_destroy_chain(), so that will be NULL
> eventually. The tps are freed by call_rcu() as well as qdisc itself
> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
> Outstanding readers should either bail out due to if (!cl) or can still
> process the chain until read section ends, but during that time, cl->q
> resp. bstats should be good. Do you happen to know what's at address
> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), but
> at least on ingress (netif_receive_skb_internal()) we hold rcu_read_lock()
> here. The KASAN report is reliably happening at this location, right?

I am confused as well, I don't see how it could be related to my patch yet.
I will take a deep look in the weekend.

Thanks!

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-26  6:46         ` Cong Wang
@ 2016-11-26 11:09           ` Daniel Borkmann
  2016-11-27  0:33             ` Daniel Borkmann
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Borkmann @ 2016-11-26 11:09 UTC (permalink / raw)
  To: Cong Wang
  Cc: Roi Dayan, Linux Kernel Network Developers, Jiri Pirko, John Fastabend

On 11/26/2016 07:46 AM, Cong Wang wrote:
> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>> Outstanding readers should either bail out due to if (!cl) or can still
>> process the chain until read section ends, but during that time, cl->q
>> resp. bstats should be good. Do you happen to know what's at address
>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), but
>> at least on ingress (netif_receive_skb_internal()) we hold rcu_read_lock()
>> here. The KASAN report is reliably happening at this location, right?
>
> I am confused as well, I don't see how it could be related to my patch yet.
> I will take a deep look in the weekend.

Ok, I'm currently on the run. Got too late yesterday night, but I'll
write what I found in the evening today, not related to ingress though.

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-26 11:09           ` Daniel Borkmann
@ 2016-11-27  0:33             ` Daniel Borkmann
  2016-11-27  4:47               ` Roi Dayan
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Borkmann @ 2016-11-27  0:33 UTC (permalink / raw)
  To: Cong Wang
  Cc: Roi Dayan, Linux Kernel Network Developers, Jiri Pirko, John Fastabend

On 11/26/2016 12:09 PM, Daniel Borkmann wrote:
> On 11/26/2016 07:46 AM, Cong Wang wrote:
>> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
[...]
>>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>>> Outstanding readers should either bail out due to if (!cl) or can still
>>> process the chain until read section ends, but during that time, cl->q
>>> resp. bstats should be good. Do you happen to know what's at address
>>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), but
>>> at least on ingress (netif_receive_skb_internal()) we hold rcu_read_lock()
>>> here. The KASAN report is reliably happening at this location, right?
>>
>> I am confused as well, I don't see how it could be related to my patch yet.
>> I will take a deep look in the weekend.
>
> Ok, I'm currently on the run. Got too late yesterday night, but I'll
> write what I found in the evening today, not related to ingress though.

Just pushed out my analysis to netdev under "[PATCH net] net, sched: respect
rcu grace period on cls destruction". My conclusion is that both issues are
actually separate, and that one is small enough where we could route it via
net actually. Perhaps this at the same time shrinks your "[PATCH net-next]
net_sched: move the empty tp check from ->destroy() to ->delete()" to a
reasonable size that it's suitable to net as well. Your ->delete()/->destroy()
one is definitely needed, too. The tp->root one is independant of ->delete()/
->destroy() as they are different races and tp->root could also happen when
you just destroy the whole tp directly. I think that seems like a good path
forward to me.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-27  0:33             ` Daniel Borkmann
@ 2016-11-27  4:47               ` Roi Dayan
  2016-11-27  6:29                 ` Roi Dayan
  0 siblings, 1 reply; 16+ messages in thread
From: Roi Dayan @ 2016-11-27  4:47 UTC (permalink / raw)
  To: Daniel Borkmann, Cong Wang
  Cc: roid, Linux Kernel Network Developers, Jiri Pirko, John Fastabend



On 27/11/2016 02:33, Daniel Borkmann wrote:
> On 11/26/2016 12:09 PM, Daniel Borkmann wrote:
>> On 11/26/2016 07:46 AM, Cong Wang wrote:
>>> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann 
>>> <daniel@iogearbox.net> wrote:
> [...]
>>>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>>>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>>>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>>>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>>>> Outstanding readers should either bail out due to if (!cl) or can 
>>>> still
>>>> process the chain until read section ends, but during that time, cl->q
>>>> resp. bstats should be good. Do you happen to know what's at address
>>>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), but
>>>> at least on ingress (netif_receive_skb_internal()) we hold 
>>>> rcu_read_lock()
>>>> here. The KASAN report is reliably happening at this location, right?
>>>
>>> I am confused as well, I don't see how it could be related to my 
>>> patch yet.
>>> I will take a deep look in the weekend.



Hi Cong,

When reported the new trace I didn't mean it's related to your patch, I 
just wanted to point it out it exposed something. I should have been 
clear about it.


>>
>> Ok, I'm currently on the run. Got too late yesterday night, but I'll
>> write what I found in the evening today, not related to ingress though.
>
> Just pushed out my analysis to netdev under "[PATCH net] net, sched: 
> respect
> rcu grace period on cls destruction". My conclusion is that both 
> issues are
> actually separate, and that one is small enough where we could route 
> it via
> net actually. Perhaps this at the same time shrinks your "[PATCH 
> net-next]
> net_sched: move the empty tp check from ->destroy() to ->delete()" to a
> reasonable size that it's suitable to net as well. Your 
> ->delete()/->destroy()
> one is definitely needed, too. The tp->root one is independant of 
> ->delete()/
> ->destroy() as they are different races and tp->root could also happen 
> when
> you just destroy the whole tp directly. I think that seems like a good 
> path
> forward to me.
>
> Thanks,
> Daniel



Hi Daniel,

As for the tainted kernel. I was in old (week or two) net-next tree and 
only cherry-picked from latest net-next related patches to Mellanox HCA, 
cls_api, cls_flower, devlink. so those are the tainted modules.
I have the issue reproducing in that tree so wanted it to check it with 
Cong's patch instead of latest net-next.
I'll try running reproducing the issue with your new patch and later try 
latest net-next as well.

Thanks,
Roi

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-27  4:47               ` Roi Dayan
@ 2016-11-27  6:29                 ` Roi Dayan
  2016-11-28  2:26                   ` John Fastabend
  2016-11-29  6:59                   ` Cong Wang
  0 siblings, 2 replies; 16+ messages in thread
From: Roi Dayan @ 2016-11-27  6:29 UTC (permalink / raw)
  To: Daniel Borkmann, Cong Wang
  Cc: roid, Linux Kernel Network Developers, Jiri Pirko, John Fastabend



On 27/11/2016 06:47, Roi Dayan wrote:
>
>
> On 27/11/2016 02:33, Daniel Borkmann wrote:
>> On 11/26/2016 12:09 PM, Daniel Borkmann wrote:
>>> On 11/26/2016 07:46 AM, Cong Wang wrote:
>>>> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann 
>>>> <daniel@iogearbox.net> wrote:
>> [...]
>>>>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>>>>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>>>>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>>>>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>>>>> Outstanding readers should either bail out due to if (!cl) or can 
>>>>> still
>>>>> process the chain until read section ends, but during that time, 
>>>>> cl->q
>>>>> resp. bstats should be good. Do you happen to know what's at address
>>>>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(), 
>>>>> but
>>>>> at least on ingress (netif_receive_skb_internal()) we hold 
>>>>> rcu_read_lock()
>>>>> here. The KASAN report is reliably happening at this location, right?
>>>>
>>>> I am confused as well, I don't see how it could be related to my 
>>>> patch yet.
>>>> I will take a deep look in the weekend.
>
>
>
> Hi Cong,
>
> When reported the new trace I didn't mean it's related to your patch, 
> I just wanted to point it out it exposed something. I should have been 
> clear about it.
>
>
>>>
>>> Ok, I'm currently on the run. Got too late yesterday night, but I'll
>>> write what I found in the evening today, not related to ingress though.
>>
>> Just pushed out my analysis to netdev under "[PATCH net] net, sched: 
>> respect
>> rcu grace period on cls destruction". My conclusion is that both 
>> issues are
>> actually separate, and that one is small enough where we could route 
>> it via
>> net actually. Perhaps this at the same time shrinks your "[PATCH 
>> net-next]
>> net_sched: move the empty tp check from ->destroy() to ->delete()" to a
>> reasonable size that it's suitable to net as well. Your 
>> ->delete()/->destroy()
>> one is definitely needed, too. The tp->root one is independant of 
>> ->delete()/
>> ->destroy() as they are different races and tp->root could also 
>> happen when
>> you just destroy the whole tp directly. I think that seems like a 
>> good path
>> forward to me.
>>
>> Thanks,
>> Daniel
>
>
>
> Hi Daniel,
>
> As for the tainted kernel. I was in old (week or two) net-next tree 
> and only cherry-picked from latest net-next related patches to 
> Mellanox HCA, cls_api, cls_flower, devlink. so those are the tainted 
> modules.
> I have the issue reproducing in that tree so wanted it to check it 
> with Cong's patch instead of latest net-next.
> I'll try running reproducing the issue with your new patch and later 
> try latest net-next as well.
>
> Thanks,
> Roi
>

Hi,

I tested "[PATCH net] net, sched: respect rcu grace period on cls 
destruction" and could not reproduce my original issue.
I rebased "[Patch net-next] net_sched: move the empty tp check from 
->destroy() to ->delete()" over to test it in the same tree and got into 
a new trace in fl_delete.

[35659.012123] BUG: KASAN: wild-memory-access on address 1ffffffff803ca31
[35659.020042] Write of size 1 by task ovs-vswitchd/20135
[35659.025878] CPU: 19 PID: 20135 Comm: ovs-vswitchd Tainted: 
G           O    4.9.0-rc3+ #18
[35659.035948] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
[35659.043730] Call Trace:
[35659.046619]  [<ffffffff95b6dc42>] dump_stack+0x63/0x81
[35659.052456]  [<ffffffff955fbbf8>] kasan_report_error+0x408/0x4e0
[35659.059402]  [<ffffffff955fc2e8>] kasan_report+0x58/0x60
[35659.065428]  [<ffffffff952d5e8d>] ? call_rcu_sched+0x1d/0x20
[35659.072119]  [<ffffffffc01e0701>] ? fl_destroy_filter+0x21/0x30 
[cls_flower]
[35659.080217]  [<ffffffffc01e1ccf>] ? fl_delete+0x1df/0x2e0 [cls_flower]
[35659.087580]  [<ffffffff955fa4ca>] __asan_store1+0x4a/0x50
[35659.093697]  [<ffffffffc01e1ccf>] fl_delete+0x1df/0x2e0 [cls_flower]
[35659.100870]  [<ffffffff9653ecba>] tc_ctl_tfilter+0x10da/0x1b90


0x1d02 is in fl_delete (net/sched/cls_flower.c:805).
800             struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
801
802             rhashtable_remove_fast(&head->ht, &f->ht_node,
803                                    head->ht_params);
804             __fl_delete(tp, f);
805             *last = list_empty(&head->filters);
806             return 0;
807     }


Thanks,
Roi

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-27  6:29                 ` Roi Dayan
@ 2016-11-28  2:26                   ` John Fastabend
  2016-11-28  2:51                     ` John Fastabend
  2016-11-29  6:59                   ` Cong Wang
  1 sibling, 1 reply; 16+ messages in thread
From: John Fastabend @ 2016-11-28  2:26 UTC (permalink / raw)
  To: Roi Dayan, Daniel Borkmann, Cong Wang
  Cc: Linux Kernel Network Developers, Jiri Pirko

On 16-11-26 10:29 PM, Roi Dayan wrote:
> 
> 
> On 27/11/2016 06:47, Roi Dayan wrote:
>>
>>
>> On 27/11/2016 02:33, Daniel Borkmann wrote:
>>> On 11/26/2016 12:09 PM, Daniel Borkmann wrote:
>>>> On 11/26/2016 07:46 AM, Cong Wang wrote:
>>>>> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann
>>>>> <daniel@iogearbox.net> wrote:
>>> [...]
>>>>>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>>>>>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>>>>>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>>>>>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>>>>>> Outstanding readers should either bail out due to if (!cl) or can
>>>>>> still
>>>>>> process the chain until read section ends, but during that time,
>>>>>> cl->q
>>>>>> resp. bstats should be good. Do you happen to know what's at address
>>>>>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(),
>>>>>> but
>>>>>> at least on ingress (netif_receive_skb_internal()) we hold
>>>>>> rcu_read_lock()
>>>>>> here. The KASAN report is reliably happening at this location, right?
>>>>>
>>>>> I am confused as well, I don't see how it could be related to my
>>>>> patch yet.
>>>>> I will take a deep look in the weekend.
>>
>>
>>
>> Hi Cong,
>>
>> When reported the new trace I didn't mean it's related to your patch,
>> I just wanted to point it out it exposed something. I should have been
>> clear about it.
>>
>>
>>>>
>>>> Ok, I'm currently on the run. Got too late yesterday night, but I'll
>>>> write what I found in the evening today, not related to ingress though.
>>>
>>> Just pushed out my analysis to netdev under "[PATCH net] net, sched:
>>> respect
>>> rcu grace period on cls destruction". My conclusion is that both
>>> issues are
>>> actually separate, and that one is small enough where we could route
>>> it via
>>> net actually. Perhaps this at the same time shrinks your "[PATCH
>>> net-next]
>>> net_sched: move the empty tp check from ->destroy() to ->delete()" to a
>>> reasonable size that it's suitable to net as well. Your
>>> ->delete()/->destroy()
>>> one is definitely needed, too. The tp->root one is independant of
>>> ->delete()/
>>> ->destroy() as they are different races and tp->root could also
>>> happen when
>>> you just destroy the whole tp directly. I think that seems like a
>>> good path
>>> forward to me.
>>>
>>> Thanks,
>>> Daniel
>>
>>
>>
>> Hi Daniel,
>>
>> As for the tainted kernel. I was in old (week or two) net-next tree
>> and only cherry-picked from latest net-next related patches to
>> Mellanox HCA, cls_api, cls_flower, devlink. so those are the tainted
>> modules.
>> I have the issue reproducing in that tree so wanted it to check it
>> with Cong's patch instead of latest net-next.
>> I'll try running reproducing the issue with your new patch and later
>> try latest net-next as well.
>>
>> Thanks,
>> Roi
>>
> 
> Hi,
> 
> I tested "[PATCH net] net, sched: respect rcu grace period on cls
> destruction" and could not reproduce my original issue.

Hi Roi,

Just so I'm 100% clear. No issue with just the above "respect rcu grace
period on cls destruction" per above statement.

> I rebased "[Patch net-next] net_sched: move the empty tp check from
> ->destroy() to ->delete()" over to test it in the same tree and got into
> a new trace in fl_delete.

In this case did you test with "net_sched: move the empty tp check from
->destroy() to ->delete()" _only_ or did this include both patches when
you see the error below.

>From my inspection we really need both patches to get correct behavior.

Thanks!
John

> 
> [35659.012123] BUG: KASAN: wild-memory-access on address 1ffffffff803ca31
> [35659.020042] Write of size 1 by task ovs-vswitchd/20135
> [35659.025878] CPU: 19 PID: 20135 Comm: ovs-vswitchd Tainted:
> G           O    4.9.0-rc3+ #18
> [35659.035948] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
> [35659.043730] Call Trace:
> [35659.046619]  [<ffffffff95b6dc42>] dump_stack+0x63/0x81
> [35659.052456]  [<ffffffff955fbbf8>] kasan_report_error+0x408/0x4e0
> [35659.059402]  [<ffffffff955fc2e8>] kasan_report+0x58/0x60
> [35659.065428]  [<ffffffff952d5e8d>] ? call_rcu_sched+0x1d/0x20
> [35659.072119]  [<ffffffffc01e0701>] ? fl_destroy_filter+0x21/0x30
> [cls_flower]
> [35659.080217]  [<ffffffffc01e1ccf>] ? fl_delete+0x1df/0x2e0 [cls_flower]
> [35659.087580]  [<ffffffff955fa4ca>] __asan_store1+0x4a/0x50
> [35659.093697]  [<ffffffffc01e1ccf>] fl_delete+0x1df/0x2e0 [cls_flower]
> [35659.100870]  [<ffffffff9653ecba>] tc_ctl_tfilter+0x10da/0x1b90
> 
> 
> 0x1d02 is in fl_delete (net/sched/cls_flower.c:805).
> 800             struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
> 801
> 802             rhashtable_remove_fast(&head->ht, &f->ht_node,
> 803                                    head->ht_params);
> 804             __fl_delete(tp, f);
> 805             *last = list_empty(&head->filters);
> 806             return 0;
> 807     }
> 
> 
> Thanks,
> Roi

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-28  2:26                   ` John Fastabend
@ 2016-11-28  2:51                     ` John Fastabend
  0 siblings, 0 replies; 16+ messages in thread
From: John Fastabend @ 2016-11-28  2:51 UTC (permalink / raw)
  To: Roi Dayan, Daniel Borkmann, Cong Wang
  Cc: Linux Kernel Network Developers, Jiri Pirko

On 16-11-27 06:26 PM, John Fastabend wrote:
> On 16-11-26 10:29 PM, Roi Dayan wrote:
>>
>>
>> On 27/11/2016 06:47, Roi Dayan wrote:
>>>
>>>
>>> On 27/11/2016 02:33, Daniel Borkmann wrote:
>>>> On 11/26/2016 12:09 PM, Daniel Borkmann wrote:
>>>>> On 11/26/2016 07:46 AM, Cong Wang wrote:
>>>>>> On Thu, Nov 24, 2016 at 7:20 AM, Daniel Borkmann
>>>>>> <daniel@iogearbox.net> wrote:
>>>> [...]
>>>>>>> Ok, strange, qdisc_destroy() calls into ops->destroy(), where ingress
>>>>>>> drops its entire chain via tcf_destroy_chain(), so that will be NULL
>>>>>>> eventually. The tps are freed by call_rcu() as well as qdisc itself
>>>>>>> later on via qdisc_rcu_free(), where it frees per-cpu bstats as well.
>>>>>>> Outstanding readers should either bail out due to if (!cl) or can
>>>>>>> still
>>>>>>> process the chain until read section ends, but during that time,
>>>>>>> cl->q
>>>>>>> resp. bstats should be good. Do you happen to know what's at address
>>>>>>> ffff880a68b04028? I was wondering wrt call_rcu() vs call_rcu_bh(),
>>>>>>> but
>>>>>>> at least on ingress (netif_receive_skb_internal()) we hold
>>>>>>> rcu_read_lock()
>>>>>>> here. The KASAN report is reliably happening at this location, right?
>>>>>>
>>>>>> I am confused as well, I don't see how it could be related to my
>>>>>> patch yet.
>>>>>> I will take a deep look in the weekend.
>>>
>>>
>>>
>>> Hi Cong,
>>>
>>> When reported the new trace I didn't mean it's related to your patch,
>>> I just wanted to point it out it exposed something. I should have been
>>> clear about it.
>>>
>>>
>>>>>
>>>>> Ok, I'm currently on the run. Got too late yesterday night, but I'll
>>>>> write what I found in the evening today, not related to ingress though.
>>>>
>>>> Just pushed out my analysis to netdev under "[PATCH net] net, sched:
>>>> respect
>>>> rcu grace period on cls destruction". My conclusion is that both
>>>> issues are
>>>> actually separate, and that one is small enough where we could route
>>>> it via
>>>> net actually. Perhaps this at the same time shrinks your "[PATCH
>>>> net-next]
>>>> net_sched: move the empty tp check from ->destroy() to ->delete()" to a
>>>> reasonable size that it's suitable to net as well. Your
>>>> ->delete()/->destroy()
>>>> one is definitely needed, too. The tp->root one is independant of
>>>> ->delete()/
>>>> ->destroy() as they are different races and tp->root could also
>>>> happen when
>>>> you just destroy the whole tp directly. I think that seems like a
>>>> good path
>>>> forward to me.
>>>>
>>>> Thanks,
>>>> Daniel
>>>
>>>
>>>
>>> Hi Daniel,
>>>
>>> As for the tainted kernel. I was in old (week or two) net-next tree
>>> and only cherry-picked from latest net-next related patches to
>>> Mellanox HCA, cls_api, cls_flower, devlink. so those are the tainted
>>> modules.
>>> I have the issue reproducing in that tree so wanted it to check it
>>> with Cong's patch instead of latest net-next.
>>> I'll try running reproducing the issue with your new patch and later
>>> try latest net-next as well.
>>>
>>> Thanks,
>>> Roi
>>>
>>
>> Hi,
>>
>> I tested "[PATCH net] net, sched: respect rcu grace period on cls
>> destruction" and could not reproduce my original issue.
> 
> Hi Roi,
> 
> Just so I'm 100% clear. No issue with just the above "respect rcu grace
> period on cls destruction" per above statement.
> 
>> I rebased "[Patch net-next] net_sched: move the empty tp check from
>> ->destroy() to ->delete()" over to test it in the same tree and got into
>> a new trace in fl_delete.
> 
> In this case did you test with "net_sched: move the empty tp check from
> ->destroy() to ->delete()" _only_ or did this include both patches when
> you see the error below.
> 
> From my inspection we really need both patches to get correct behavior.
> 
> Thanks!
> John

Ah dang nevermind I just read both patches in detail and applying them
both at the same time is nonsense. Let me reply with comments directly
to the patches.

Thanks. sorry for the noise.

> 
>>
>> [35659.012123] BUG: KASAN: wild-memory-access on address 1ffffffff803ca31
>> [35659.020042] Write of size 1 by task ovs-vswitchd/20135
>> [35659.025878] CPU: 19 PID: 20135 Comm: ovs-vswitchd Tainted:
>> G           O    4.9.0-rc3+ #18
>> [35659.035948] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 07/01/2015
>> [35659.043730] Call Trace:
>> [35659.046619]  [<ffffffff95b6dc42>] dump_stack+0x63/0x81
>> [35659.052456]  [<ffffffff955fbbf8>] kasan_report_error+0x408/0x4e0
>> [35659.059402]  [<ffffffff955fc2e8>] kasan_report+0x58/0x60
>> [35659.065428]  [<ffffffff952d5e8d>] ? call_rcu_sched+0x1d/0x20
>> [35659.072119]  [<ffffffffc01e0701>] ? fl_destroy_filter+0x21/0x30
>> [cls_flower]
>> [35659.080217]  [<ffffffffc01e1ccf>] ? fl_delete+0x1df/0x2e0 [cls_flower]
>> [35659.087580]  [<ffffffff955fa4ca>] __asan_store1+0x4a/0x50
>> [35659.093697]  [<ffffffffc01e1ccf>] fl_delete+0x1df/0x2e0 [cls_flower]
>> [35659.100870]  [<ffffffff9653ecba>] tc_ctl_tfilter+0x10da/0x1b90
>>
>>
>> 0x1d02 is in fl_delete (net/sched/cls_flower.c:805).
>> 800             struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
>> 801
>> 802             rhashtable_remove_fast(&head->ht, &f->ht_node,
>> 803                                    head->ht_params);
>> 804             __fl_delete(tp, f);
>> 805             *last = list_empty(&head->filters);
>> 806             return 0;
>> 807     }
>>
>>
>> Thanks,
>> Roi
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-24  1:58 [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete() Cong Wang
  2016-11-24  8:29 ` Roi Dayan
@ 2016-11-28  2:57 ` John Fastabend
  2016-11-29  6:57   ` Cong Wang
  1 sibling, 1 reply; 16+ messages in thread
From: John Fastabend @ 2016-11-28  2:57 UTC (permalink / raw)
  To: Cong Wang, netdev; +Cc: roid, jiri, Daniel Borkmann

On 16-11-23 05:58 PM, Cong Wang wrote:
> Roi reported we could have a race condition where in ->classify() path
> we dereference tp->root and meanwhile a parallel ->destroy() makes it
> a NULL.
> 
> This is possible because ->destroy() could be called when deleting
> a filter to check if we are the last one in tp, this tp is still
> linked and visible at that time.
> 
> The root cause of this problem is the semantic of ->destroy(), it
> does two things (for non-force case):
> 
> 1) check if tp is empty
> 2) if tp is empty we could really destroy it
> 
> and its caller, if cares, needs to check its return value to see if
> it is really destroyed. Therefore we can't unlink tp unless we know
> it is empty.
> 
> As suggested by Daniel, we could actually move the test logic to ->delete()
> so that we can safely unlink tp after ->delete() tells us the last one is
> just deleted and before ->destroy().
> 
> What's more, even we unlink it before ->destroy(), it could still have
> readers since we don't wait for a grace period here, we should not modify
> tp->root in ->destroy() either.
> 
> Fixes: 1e052be69d04 ("net_sched: destroy proto tp when all filters are gone")
> Reported-by: Roi Dayan <roid@mellanox.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---

Hi Cong,

Thanks a lot for doing this. Can you rebase it on top of Daniel's patch
though,

 [PATCH net] net, sched: respect rcu grace period on cls destruction

And then push the NULL pointer work for the cls_fw and cls_route
classifiers into another patch.

Then I believe the last thing to make this correct is to convert the
call_rcu() paths to call_rcu_bh().

.John

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-28  2:57 ` John Fastabend
@ 2016-11-29  6:57   ` Cong Wang
  0 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2016-11-29  6:57 UTC (permalink / raw)
  To: John Fastabend
  Cc: Linux Kernel Network Developers, Roi Dayan, Jiri Pirko, Daniel Borkmann

On Sun, Nov 27, 2016 at 6:57 PM, John Fastabend
<john.fastabend@gmail.com> wrote:
> Hi Cong,
>
> Thanks a lot for doing this. Can you rebase it on top of Daniel's patch
> though,
>
>  [PATCH net] net, sched: respect rcu grace period on cls destruction
>
> And then push the NULL pointer work for the cls_fw and cls_route
> classifiers into another patch.
>
> Then I believe the last thing to make this correct is to convert the
> call_rcu() paths to call_rcu_bh().

Sure, will rebase my patch once DaveM merges net into net-next.

Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete()
  2016-11-27  6:29                 ` Roi Dayan
  2016-11-28  2:26                   ` John Fastabend
@ 2016-11-29  6:59                   ` Cong Wang
  1 sibling, 0 replies; 16+ messages in thread
From: Cong Wang @ 2016-11-29  6:59 UTC (permalink / raw)
  To: Roi Dayan
  Cc: Daniel Borkmann, Linux Kernel Network Developers, Jiri Pirko,
	John Fastabend

On Sat, Nov 26, 2016 at 10:29 PM, Roi Dayan <roid@mellanox.com> wrote:
> Hi,
>
> I tested "[PATCH net] net, sched: respect rcu grace period on cls
> destruction" and could not reproduce my original issue.
> I rebased "[Patch net-next] net_sched: move the empty tp check from
> ->destroy() to ->delete()" over to test it in the same tree and got into a
> new trace in fl_delete.

I will take care of this when I rebase my patch.

Thanks for testing anyway.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2016-11-29  6:59 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-24  1:58 [Patch net-next] net_sched: move the empty tp check from ->destroy() to ->delete() Cong Wang
2016-11-24  8:29 ` Roi Dayan
2016-11-24 10:14   ` Daniel Borkmann
2016-11-24 11:01     ` Roi Dayan
2016-11-24 15:20       ` Daniel Borkmann
2016-11-24 17:18         ` Daniel Borkmann
2016-11-26  6:46         ` Cong Wang
2016-11-26 11:09           ` Daniel Borkmann
2016-11-27  0:33             ` Daniel Borkmann
2016-11-27  4:47               ` Roi Dayan
2016-11-27  6:29                 ` Roi Dayan
2016-11-28  2:26                   ` John Fastabend
2016-11-28  2:51                     ` John Fastabend
2016-11-29  6:59                   ` Cong Wang
2016-11-28  2:57 ` John Fastabend
2016-11-29  6:57   ` Cong Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.