linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] Convert qdisc linked list into a hashtable
@ 2016-08-10  9:00 Jiri Kosina
  2016-08-10  9:03 ` [PATCH 1/2] net: resolve symbol conflicts with generic hashtable.h Jiri Kosina
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Jiri Kosina @ 2016-08-10  9:00 UTC (permalink / raw)
  To: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang, David Miller
  Cc: linux-kernel, netdev

This is a respin of the v6 of the original patch [1], split into two-patch 
series as requested by davem; first patch fixes all symbol conflicts 
that'd happen once netdevice.h starts to include hashtable.h, the second 
one performs the actual switch to hashtable.

I've preserved Cong's Reviewed-by:, as code-wise this series is identical 
to the original v6 of the patch.

[1] lkml.kernel.org/r/alpine.LNX.2.00.1608011220580.22028@cbobk.fhfr.pm

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/2] net: resolve symbol conflicts with generic hashtable.h
  2016-08-10  9:00 [PATCH 0/2] Convert qdisc linked list into a hashtable Jiri Kosina
@ 2016-08-10  9:03 ` Jiri Kosina
  2016-08-10  9:05 ` [PATCH 2/2] net: sched: convert qdisc linked list to hashtable Jiri Kosina
  2016-08-11  0:22 ` [PATCH 0/2] Convert qdisc linked list into a hashtable David Miller
  2 siblings, 0 replies; 11+ messages in thread
From: Jiri Kosina @ 2016-08-10  9:03 UTC (permalink / raw)
  To: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang, David Miller
  Cc: linux-kernel, netdev

From: Jiri Kosina <jkosina@suse.cz>

This is a preparatory patch for converting qdisc linked list into a 
hashtable. As we'll need to include hashtable.h in netdevice.h, we first 
have to make sure that this will not introduce symbol conflicts for any of 
the netdevice.h users.

Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/net/ethernet/ti/davinci_emac.c | 14 +++++++-------
 net/ipv6/ip6_gre.c                     | 12 ++++++------
 net/ipv6/ip6_tunnel.c                  | 10 +++++-----
 net/ipv6/ip6_vti.c                     | 10 +++++-----
 net/ipv6/sit.c                         | 10 +++++-----
 5 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index f56d66e..91ca2b2 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -726,14 +726,14 @@ static u32 hash_get(u8 *addr)
 }
 
 /**
- * hash_add - Hash function to add mac addr from hash table
+ * emac_hash_add - Hash function to add mac addr from hash table
  * @priv: The DaVinci EMAC private adapter structure
  * @mac_addr: mac address to delete from hash table
  *
  * Adds mac address to the internal hash table
  *
  */
-static int hash_add(struct emac_priv *priv, u8 *mac_addr)
+static int emac_hash_add(struct emac_priv *priv, u8 *mac_addr)
 {
 	struct device *emac_dev = &priv->ndev->dev;
 	u32 rc = 0;
@@ -742,7 +742,7 @@ static int hash_add(struct emac_priv *priv, u8 *mac_addr)
 
 	if (hash_value >= EMAC_NUM_MULTICAST_BITS) {
 		if (netif_msg_drv(priv)) {
-			dev_err(emac_dev, "DaVinci EMAC: hash_add(): Invalid "\
+			dev_err(emac_dev, "DaVinci EMAC: emac_hash_add(): Invalid "\
 				"Hash %08x, should not be greater than %08x",
 				hash_value, (EMAC_NUM_MULTICAST_BITS - 1));
 		}
@@ -768,14 +768,14 @@ static int hash_add(struct emac_priv *priv, u8 *mac_addr)
 }
 
 /**
- * hash_del - Hash function to delete mac addr from hash table
+ * emac_hash_del - Hash function to delete mac addr from hash table
  * @priv: The DaVinci EMAC private adapter structure
  * @mac_addr: mac address to delete from hash table
  *
  * Removes mac address from the internal hash table
  *
  */
-static int hash_del(struct emac_priv *priv, u8 *mac_addr)
+static int emac_hash_del(struct emac_priv *priv, u8 *mac_addr)
 {
 	u32 hash_value;
 	u32 hash_bit;
@@ -825,10 +825,10 @@ static void emac_add_mcast(struct emac_priv *priv, u32 action, u8 *mac_addr)
 
 	switch (action) {
 	case EMAC_MULTICAST_ADD:
-		update = hash_add(priv, mac_addr);
+		update = emac_hash_add(priv, mac_addr);
 		break;
 	case EMAC_MULTICAST_DEL:
-		update = hash_del(priv, mac_addr);
+		update = emac_hash_del(priv, mac_addr);
 		break;
 	case EMAC_ALL_MULTI_SET:
 		update = 1;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index fdc9de2..d3697a4 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -61,12 +61,12 @@ static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
-#define HASH_SIZE_SHIFT  5
-#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+#define IP6_GRE_HASH_SIZE_SHIFT  5
+#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
 
 static int ip6gre_net_id __read_mostly;
 struct ip6gre_net {
-	struct ip6_tnl __rcu *tunnels[4][HASH_SIZE];
+	struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
 
 	struct net_device *fb_tunnel_dev;
 };
@@ -96,12 +96,12 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
    will match fallback tunnel.
  */
 
-#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1))
+#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1))
 static u32 HASH_ADDR(const struct in6_addr *addr)
 {
 	u32 hash = ipv6_addr_hash(addr);
 
-	return hash_32(hash, HASH_SIZE_SHIFT);
+	return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT);
 }
 
 #define tunnels_r_l	tunnels[3]
@@ -1089,7 +1089,7 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
 
 	for (prio = 0; prio < 4; prio++) {
 		int h;
-		for (h = 0; h < HASH_SIZE; h++) {
+		for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
 			struct ip6_tnl *t;
 
 			t = rtnl_dereference(ign->tunnels[prio][h]);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 7b0481e..2050217 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -64,8 +64,8 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS_RTNL_LINK("ip6tnl");
 MODULE_ALIAS_NETDEV("ip6tnl0");
 
-#define HASH_SIZE_SHIFT  5
-#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+#define IP6_TUNNEL_HASH_SIZE_SHIFT  5
+#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT)
 
 static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
@@ -75,7 +75,7 @@ static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
 {
 	u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
 
-	return hash_32(hash, HASH_SIZE_SHIFT);
+	return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT);
 }
 
 static int ip6_tnl_dev_init(struct net_device *dev);
@@ -87,7 +87,7 @@ struct ip6_tnl_net {
 	/* the IPv6 tunnel fallback device */
 	struct net_device *fb_tnl_dev;
 	/* lists for storing tunnels in use */
-	struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+	struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
 	struct ip6_tnl __rcu *tnls_wc[1];
 	struct ip6_tnl __rcu **tnls[2];
 };
@@ -2031,7 +2031,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net)
 		if (dev->rtnl_link_ops == &ip6_link_ops)
 			unregister_netdevice_queue(dev, &list);
 
-	for (h = 0; h < HASH_SIZE; h++) {
+	for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
 		t = rtnl_dereference(ip6n->tnls_r_l[h]);
 		while (t) {
 			/* If dev is in the same netns, it has already
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d90a11f..cc7e058 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -50,14 +50,14 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
-#define HASH_SIZE_SHIFT  5
-#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+#define IP6_VTI_HASH_SIZE_SHIFT  5
+#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
 
 static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
 {
 	u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
 
-	return hash_32(hash, HASH_SIZE_SHIFT);
+	return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT);
 }
 
 static int vti6_dev_init(struct net_device *dev);
@@ -69,7 +69,7 @@ struct vti6_net {
 	/* the vti6 tunnel fallback device */
 	struct net_device *fb_tnl_dev;
 	/* lists for storing tunnels in use */
-	struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+	struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE];
 	struct ip6_tnl __rcu *tnls_wc[1];
 	struct ip6_tnl __rcu **tnls[2];
 };
@@ -1040,7 +1040,7 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
 	struct ip6_tnl *t;
 	LIST_HEAD(list);
 
-	for (h = 0; h < HASH_SIZE; h++) {
+	for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
 		t = rtnl_dereference(ip6n->tnls_r_l[h]);
 		while (t) {
 			unregister_netdevice_queue(t->dev, &list);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0a5a255..94dd0f0 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -62,7 +62,7 @@
    For comments look at net/ipv4/ip_gre.c --ANK
  */
 
-#define HASH_SIZE  16
+#define IP6_SIT_HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
 static bool log_ecn_error = true;
@@ -78,9 +78,9 @@ static struct rtnl_link_ops sit_link_ops __read_mostly;
 
 static int sit_net_id __read_mostly;
 struct sit_net {
-	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE];
 	struct ip_tunnel __rcu *tunnels_wc[1];
 	struct ip_tunnel __rcu **tunnels[4];
 
@@ -1773,7 +1773,7 @@ static void __net_exit sit_destroy_tunnels(struct net *net,
 
 	for (prio = 1; prio < 4; prio++) {
 		int h;
-		for (h = 0; h < HASH_SIZE; h++) {
+		for (h = 0; h < IP6_SIT_HASH_SIZE; h++) {
 			struct ip_tunnel *t;
 
 			t = rtnl_dereference(sitn->tunnels[prio][h]);
-- 
1.9.2

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-10  9:00 [PATCH 0/2] Convert qdisc linked list into a hashtable Jiri Kosina
  2016-08-10  9:03 ` [PATCH 1/2] net: resolve symbol conflicts with generic hashtable.h Jiri Kosina
@ 2016-08-10  9:05 ` Jiri Kosina
  2016-08-12 12:52   ` Daniel Borkmann
  2016-08-11  0:22 ` [PATCH 0/2] Convert qdisc linked list into a hashtable David Miller
  2 siblings, 1 reply; 11+ messages in thread
From: Jiri Kosina @ 2016-08-10  9:05 UTC (permalink / raw)
  To: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang, David Miller
  Cc: linux-kernel, netdev

From: Jiri Kosina <jkosina@suse.cz>

Convert the per-device linked list into a hashtable. The primary 
motivation for this change is that currently, we're not tracking all the 
qdiscs in hierarchy (e.g. excluding default qdiscs), as the lookup 
performed over the linked list by qdisc_match_from_root() is rather 
expensive.

The ultimate goal is to get rid of hidden qdiscs completely, which will 
bring much more determinism in user experience.

Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/netdevice.h |  4 ++++
 include/net/pkt_sched.h   |  4 ++--
 include/net/sch_generic.h |  2 +-
 net/core/dev.c            |  3 +++
 net/sched/sch_api.c       | 23 +++++++++++++----------
 net/sched/sch_generic.c   |  8 +++++---
 net/sched/sch_mq.c        |  2 +-
 net/sched/sch_mqprio.c    |  2 +-
 8 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f45929c..17c6499 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,7 @@
 #include <uapi/linux/netdevice.h>
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
+#include <linux/hashtable.h>
 
 struct netpoll_info;
 struct device;
@@ -1778,6 +1779,9 @@ struct net_device {
 	unsigned int		num_tx_queues;
 	unsigned int		real_num_tx_queues;
 	struct Qdisc		*qdisc;
+#ifdef CONFIG_NET_SCHED
+	DECLARE_HASHTABLE	(qdisc_hash, 4);
+#endif
 	unsigned long		tx_queue_len;
 	spinlock_t		tx_global_lock;
 	int			watchdog_timeo;
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index fea53f4..8ba11b4 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -90,8 +90,8 @@ int unregister_qdisc(struct Qdisc_ops *qops);
 void qdisc_get_default(char *id, size_t len);
 int qdisc_set_default(const char *id);
 
-void qdisc_list_add(struct Qdisc *q);
-void qdisc_list_del(struct Qdisc *q);
+void qdisc_hash_add(struct Qdisc *q);
+void qdisc_hash_del(struct Qdisc *q);
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
 struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle);
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 62d5531..26f5cb3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -67,7 +67,7 @@ struct Qdisc {
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
-	struct list_head	list;
+	struct hlist_node       hash;
 	u32			handle;
 	u32			parent;
 	int			(*reshape_fail)(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 904ff43..d3736d5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7511,6 +7511,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
 	INIT_LIST_HEAD(&dev->ptype_all);
 	INIT_LIST_HEAD(&dev->ptype_specific);
+#ifdef CONFIG_NET_SCHED
+	hash_init(dev->qdisc_hash);
+#endif
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ddf047d..c093d32 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -29,6 +29,7 @@
 #include <linux/hrtimer.h>
 #include <linux/lockdep.h>
 #include <linux/slab.h>
+#include <linux/hashtable.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -265,33 +266,33 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 	    root->handle == handle)
 		return root;
 
-	list_for_each_entry_rcu(q, &root->list, list) {
+	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
 		if (q->handle == handle)
 			return q;
 	}
 	return NULL;
 }
 
-void qdisc_list_add(struct Qdisc *q)
+void qdisc_hash_add(struct Qdisc *q)
 {
 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 		struct Qdisc *root = qdisc_dev(q)->qdisc;
 
 		WARN_ON_ONCE(root == &noop_qdisc);
 		ASSERT_RTNL();
-		list_add_tail_rcu(&q->list, &root->list);
+		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
 	}
 }
-EXPORT_SYMBOL(qdisc_list_add);
+EXPORT_SYMBOL(qdisc_hash_add);
 
-void qdisc_list_del(struct Qdisc *q)
+void qdisc_hash_del(struct Qdisc *q)
 {
 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 		ASSERT_RTNL();
-		list_del_rcu(&q->list);
+		hash_del_rcu(&q->hash);
 	}
 }
-EXPORT_SYMBOL(qdisc_list_del);
+EXPORT_SYMBOL(qdisc_hash_del);
 
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
@@ -1004,7 +1005,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 				goto err_out4;
 		}
 
-		qdisc_list_add(sch);
+		qdisc_hash_add(sch);
 
 		return sch;
 	}
@@ -1440,6 +1441,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
 {
 	int ret = 0, q_idx = *q_idx_p;
 	struct Qdisc *q;
+	int b;
 
 	if (!root)
 		return 0;
@@ -1454,7 +1456,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
 			goto done;
 		q_idx++;
 	}
-	list_for_each_entry(q, &root->list, list) {
+	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
 		if (q_idx < s_q_idx) {
 			q_idx++;
 			continue;
@@ -1771,6 +1773,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
 			       int *t_p, int s_t)
 {
 	struct Qdisc *q;
+	int b;
 
 	if (!root)
 		return 0;
@@ -1778,7 +1781,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
 		return -1;
 
-	list_for_each_entry(q, &root->list, list) {
+	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
 			return -1;
 	}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f9e0e9c..94d5999 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -378,7 +378,6 @@ struct Qdisc noop_qdisc = {
 	.dequeue	=	noop_dequeue,
 	.flags		=	TCQ_F_BUILTIN,
 	.ops		=	&noop_qdisc_ops,
-	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
 	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
@@ -565,7 +564,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
 		sch->padded = (char *) sch - (char *) p;
 	}
-	INIT_LIST_HEAD(&sch->list);
 	skb_queue_head_init(&sch->q);
 
 	spin_lock_init(&sch->busylock);
@@ -645,7 +643,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 		return;
 
 #ifdef CONFIG_NET_SCHED
-	qdisc_list_del(qdisc);
+	qdisc_hash_del(qdisc);
 
 	qdisc_put_stab(rtnl_dereference(qdisc->stab));
 #endif
@@ -732,6 +730,10 @@ static void attach_default_qdiscs(struct net_device *dev)
 			qdisc->ops->attach(qdisc);
 		}
 	}
+#ifdef CONFIG_NET_SCHED
+	if (dev->qdisc)
+		qdisc_hash_add(dev->qdisc);
+#endif
 }
 
 static void transition_one_qdisc(struct net_device *dev,
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 56a77b8..3bee15d 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -88,7 +88,7 @@ static void mq_attach(struct Qdisc *sch)
 			qdisc_destroy(old);
 #ifdef CONFIG_NET_SCHED
 		if (ntx < dev->real_num_tx_queues)
-			qdisc_list_add(qdisc);
+			qdisc_hash_add(qdisc);
 #endif
 
 	}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b8002ce..dbfb3a5 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -182,7 +182,7 @@ static void mqprio_attach(struct Qdisc *sch)
 		if (old)
 			qdisc_destroy(old);
 		if (ntx < dev->real_num_tx_queues)
-			qdisc_list_add(qdisc);
+			qdisc_hash_add(qdisc);
 	}
 	kfree(priv->qdiscs);
 	priv->qdiscs = NULL;
-- 
1.9.2

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/2] Convert qdisc linked list into a hashtable
  2016-08-10  9:00 [PATCH 0/2] Convert qdisc linked list into a hashtable Jiri Kosina
  2016-08-10  9:03 ` [PATCH 1/2] net: resolve symbol conflicts with generic hashtable.h Jiri Kosina
  2016-08-10  9:05 ` [PATCH 2/2] net: sched: convert qdisc linked list to hashtable Jiri Kosina
@ 2016-08-11  0:22 ` David Miller
  2 siblings, 0 replies; 11+ messages in thread
From: David Miller @ 2016-08-11  0:22 UTC (permalink / raw)
  To: jikos; +Cc: eric.dumazet, jhs, phil, xiyou.wangcong, linux-kernel, netdev

From: Jiri Kosina <jikos@kernel.org>
Date: Wed, 10 Aug 2016 11:00:42 +0200 (CEST)

> This is a respin of the v6 of the original patch [1], split into two-patch 
> series as requested by davem; first patch fixes all symbol conflicts 
> that'd happen once netdevice.h starts to include hashtable.h, the second 
> one performs the actual switch to hashtable.
> 
> I've preserved Cong's Reviewed-by:, as code-wise this series is identical 
> to the original v6 of the patch.
> 
> [1] lkml.kernel.org/r/alpine.LNX.2.00.1608011220580.22028@cbobk.fhfr.pm

Series applied, thank you.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-10  9:05 ` [PATCH 2/2] net: sched: convert qdisc linked list to hashtable Jiri Kosina
@ 2016-08-12 12:52   ` Daniel Borkmann
  2016-08-12 13:53     ` Jiri Kosina
  0 siblings, 1 reply; 11+ messages in thread
From: Daniel Borkmann @ 2016-08-12 12:52 UTC (permalink / raw)
  To: Jiri Kosina, Eric Dumazet, Jamal Hadi Salim, Phil Sutter,
	Cong Wang, David Miller
  Cc: linux-kernel, netdev

Hi Jiri,

On 08/10/2016 11:05 AM, Jiri Kosina wrote:
> From: Jiri Kosina <jkosina@suse.cz>
>
> Convert the per-device linked list into a hashtable. The primary
> motivation for this change is that currently, we're not tracking all the
> qdiscs in hierarchy (e.g. excluding default qdiscs), as the lookup
> performed over the linked list by qdisc_match_from_root() is rather
> expensive.
>
> The ultimate goal is to get rid of hidden qdiscs completely, which will
> bring much more determinism in user experience.
>
> Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
> Signed-off-by: Jiri Kosina <jkosina@suse.cz>

This results in below panic. Tested reverting this patch and it fixes
the panic.

Did you test this also with ingress or clsact qdisc (just try adding
it to lo dev for example) ?

What happens is the following in qdisc_match_from_root():

[  995.422187] XXX qdisc:ffff88025e4fc800 queue:ffff880262759000 dev:ffff880261cc2000 handle:ffff0000
[  995.422200] XXX qdisc:ffffffff81cf8100 queue:ffffffff81cf8240 dev:          (null) handle:ffff0000

I believe this is due to dev_ingress_queue_create() assigning the
global noop_qdisc instance as qdisc_sleeping, which later qdisc_lookup()
uses for qdisc_match_from_root().

But everything that uses things like noop_qdisc cannot work with the
new qdisc_match_from_root(), because qdisc_dev(root) will always trigger
NULL pointer dereference there. Reason is because the dev is always
NULL for noop, it's a singleton, see noop_qdisc and noop_netdev_queue
in sch_generic.c.

Now how to fix it? Creating separate noop instances each time it's set
would be quite a waste of memory. Even fuglier would be to hack a static
net device struct into sch_generic.c and let noop_netdev_queue point there
to get to the hash table. Or we just not use qdisc_dev().

I've tried the below to hand in dev pointer instead of qdisc_dev(), but
I think this is not sound yet. Despite fixing the panic, I get something
weird like:

# tc qdisc show dev wlp2s0b1
qdisc mq 0: root
qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc clsact ffff: parent ffff:fff1
qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1

Not yet sure whether this below, really quickly hacked patch is just buggy
(I guess so) or it's another side effect of the original patch.

If you have some cycles to take a look into fixing the panic, would be great.

Thanks

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 25aada7..c2c9799 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -256,7 +256,8 @@ int qdisc_set_default(const char *name)
   * Note: caller either uses rtnl or rcu_read_lock()
   */

-static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
+static struct Qdisc *qdisc_match_from_root(struct net_device *dev,
+					   struct Qdisc *root, u32 handle)
  {
  	struct Qdisc *q;

@@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
  	    root->handle == handle)
  		return root;

-	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
+	hash_for_each_possible_rcu(dev->qdisc_hash, q, hash, handle) {
  		if (q->handle == handle)
  			return q;
  	}
@@ -296,12 +297,12 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
  {
  	struct Qdisc *q;

-	q = qdisc_match_from_root(dev->qdisc, handle);
+	q = qdisc_match_from_root(dev, dev->qdisc, handle);
  	if (q)
  		goto out;

  	if (dev_ingress_queue(dev))
-		q = qdisc_match_from_root(
+		q = qdisc_match_from_root(dev,
  			dev_ingress_queue(dev)->qdisc_sleeping,
  			handle);
  out:
@@ -1430,8 +1431,8 @@ err_out:
  	return -EINVAL;
  }

-static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
-			      struct netlink_callback *cb,
+static int tc_dump_qdisc_root(struct net_device *dev, struct Qdisc *root,
+			      struct sk_buff *skb, struct netlink_callback *cb,
  			      int *q_idx_p, int s_q_idx)
  {
  	int ret = 0, q_idx = *q_idx_p;
@@ -1451,7 +1452,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
  			goto done;
  		q_idx++;
  	}
-	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
+	hash_for_each(dev->qdisc_hash, b, q, hash) {
  		if (q_idx < s_q_idx) {
  			q_idx++;
  			continue;
@@ -1492,12 +1493,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
  			s_q_idx = 0;
  		q_idx = 0;

-		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
+		if (tc_dump_qdisc_root(dev, dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
  			goto done;

  		dev_queue = dev_ingress_queue(dev);
  		if (dev_queue &&
-		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+		    tc_dump_qdisc_root(dev, dev_queue->qdisc_sleeping, skb, cb,
  				       &q_idx, s_q_idx) < 0)
  			goto done;

@@ -1762,9 +1763,9 @@ static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
  	return 0;
  }

-static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
-			       struct tcmsg *tcm, struct netlink_callback *cb,
-			       int *t_p, int s_t)
+static int tc_dump_tclass_root(struct net_device *dev, struct Qdisc *root,
+			       struct sk_buff *skb, struct tcmsg *tcm,
+			       struct netlink_callback *cb, int *t_p, int s_t)
  {
  	struct Qdisc *q;
  	int b;
@@ -1775,7 +1776,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
  	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
  		return -1;

-	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
+	hash_for_each(dev->qdisc_hash, b, q, hash) {
  		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
  			return -1;
  	}
@@ -1800,12 +1801,12 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
  	s_t = cb->args[0];
  	t = 0;

-	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
+	if (tc_dump_tclass_root(dev, dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
  		goto done;

  	dev_queue = dev_ingress_queue(dev);
  	if (dev_queue &&
-	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+	    tc_dump_tclass_root(dev, dev_queue->qdisc_sleeping, skb, tcm, cb,
  				&t, s_t) < 0)
  		goto done;

Panic output:

[ 1243.459280] BUG: unable to handle kernel NULL pointer dereference at 0000000000000410
[ 1243.459430] IP: [<ffffffff8167efac>] qdisc_match_from_root+0x2c/0x70
[ 1243.459528] PGD 1aceba067 PUD 1aceb7067 PMD 0
[ 1243.459604] Oops: 0000 [#1] PREEMPT SMP
[ 1243.459659] Modules linked in: ccm br_netfilter bridge stp llc dm_thin_pool dm_persistent_data dm_bio_prison libcrc32c loop xt_tcpudp ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 arc4 brcmsmac nf_nat nf_conntrack cordic brcmutil iptable_mangle b43 iptable_security iptable_raw iptable_filter ip_tables x_tables mac80211 bnep cfg80211 snd_hda_codec_hdmi snd_hda_codec_cirrus snd_hda_codec_generic ssb snd_hda_intel snd_hda_codec x86_pkg_temp_thermal coretemp kvm_intel btusb mmc_core kvm btrtl btbcm snd_hda_core btintel uvcvideo bluetooth nls_utf8 snd_hwdep hfsplus snd_seq videobuf2_vmalloc
[ 1243.476357]  videobuf2_memops iTCO_wdt videobuf2_v4l2 iTCO_vendor_support videobuf2_core videodev snd_seq_device bcma irqbypass snd_pcm crc32_pclmul crc32c_intel applesmc input_polldev ghash_clmulni_intel pcspkr nfsd i2c_i801 lpc_ich rfkill bcm5974 media mfd_core i2c_smbus joydev snd_timer snd mei_me auth_rpcgss sbs mei tpm_tis sbshc nfs_acl tpm_tis_core lockd tpm apple_bl soundcore grace sunrpc i915 i2c_algo_bit drm_kms_helper drm i2c_core video
[ 1243.494439] CPU: 2 PID: 2223 Comm: tc Not tainted 4.7.0+ #1181
[ 1243.499015] Hardware name: Apple Inc. MacBookAir5,1/Mac-66F35F19FE2A0D05, BIOS MBA51.88Z.00EF.B02.1211271028 11/27/2012
[ 1243.503630] task: ffff8801ec996e00 task.stack: ffff8801ec934000
[ 1243.508311] RIP: 0010:[<ffffffff8167efac>]  [<ffffffff8167efac>] qdisc_match_from_root+0x2c/0x70
[ 1243.513207] RSP: 0018:ffff8801ec937ab0  EFLAGS: 00010203
[ 1243.518053] RAX: 0000000000000408 RBX: ffff88025e612000 RCX: ffffffffffffffd8
[ 1243.522893] RDX: 0000000000000000 RSI: 00000000ffff0000 RDI: ffffffff81cf8100
[ 1243.527598] RBP: ffff8801ec937ab0 R08: 000000000001c160 R09: ffff8802668032c0
[ 1243.532378] R10: ffffffff81cf8100 R11: 0000000000000030 R12: 00000000ffff0000
[ 1243.537221] R13: ffff88025e612000 R14: ffffffff81cf3140 R15: 0000000000000000
[ 1243.542051] FS:  00007f24b9af6740(0000) GS:ffff88026f280000(0000) knlGS:0000000000000000
[ 1243.542059] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1243.542062] CR2: 0000000000000410 CR3: 00000001aceec000 CR4: 00000000001406e0
[ 1243.542064] Stack:
[ 1243.542075]  ffff8801ec937ad0 ffffffff81681210 ffff88025dd51a00 00000000fffffff1
[ 1243.542082]  ffff8801ec937b88 ffffffff81681e4e ffffffff81c42bc0 ffff880262431500
[ 1243.542090]  ffffffff81cf3140 ffff88025dd51a10 ffff88025dd51a24 00000000ec937b38
[ 1243.542092] Call Trace:
[ 1243.542104]  [<ffffffff81681210>] qdisc_lookup+0x40/0x50
[ 1243.542113]  [<ffffffff81681e4e>] tc_modify_qdisc+0x21e/0x550
[ 1243.542125]  [<ffffffff8166ae25>] rtnetlink_rcv_msg+0x95/0x220
[ 1243.542136]  [<ffffffff81209602>] ? __kmalloc_track_caller+0x172/0x230
[ 1243.542144]  [<ffffffff8166ad90>] ? rtnl_newlink+0x870/0x870
[ 1243.542151]  [<ffffffff816897b7>] netlink_rcv_skb+0xa7/0xc0
[ 1243.542158]  [<ffffffff816657c8>] rtnetlink_rcv+0x28/0x30
[ 1243.542164]  [<ffffffff8168919b>] netlink_unicast+0x15b/0x210
[ 1243.542170]  [<ffffffff81689569>] netlink_sendmsg+0x319/0x390
[ 1243.542180]  [<ffffffff816379f8>] sock_sendmsg+0x38/0x50
[ 1243.542187]  [<ffffffff81638296>] ___sys_sendmsg+0x256/0x260
[ 1243.542197]  [<ffffffff811b1275>] ? __pagevec_lru_add_fn+0x135/0x280
[ 1243.542206]  [<ffffffff811b1a90>] ? pagevec_lru_move_fn+0xd0/0xf0
[ 1243.542214]  [<ffffffff811b1140>] ? trace_event_raw_event_mm_lru_insertion+0x180/0x180
[ 1243.542222]  [<ffffffff811b1b85>] ? __lru_cache_add+0x75/0xb0
[ 1243.542230]  [<ffffffff817708a6>] ? _raw_spin_unlock+0x16/0x40
[ 1243.542237]  [<ffffffff811d8dff>] ? handle_mm_fault+0x39f/0x1160
[ 1243.542245]  [<ffffffff81638b15>] __sys_sendmsg+0x45/0x80
[ 1243.542254]  [<ffffffff81638b62>] SyS_sendmsg+0x12/0x20
[ 1243.542261]  [<ffffffff810038e7>] do_syscall_64+0x57/0xb0
[ 1243.542268]  [<ffffffff81770fa1>] entry_SYSCALL64_slow_path+0x25/0x25
[ 1243.542357] Code: 1f 44 00 00 f6 47 10 01 55 48 89 e5 75 05 39 77 38 74 48 48 8b 57 48 69 c6 47 86 c8 61 48 8b 12 c1 e8 1c 48 8d 84 c2 d0 03 00 00 <48> 8b 50 08 31 c0 48 8d 4a d8 48 85 d2 48 0f 45 c1 eb 04 48 83
[ 1243.542366] RIP  [<ffffffff8167efac>] qdisc_match_from_root+0x2c/0x70
[ 1243.542368]  RSP <ffff8801ec937ab0>
[ 1243.542370] CR2: 0000000000000410
[ 1243.565166] ---[ end trace aee041a86366c4d4 ]---

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-12 12:52   ` Daniel Borkmann
@ 2016-08-12 13:53     ` Jiri Kosina
  2016-08-12 14:26       ` Daniel Borkmann
  2016-08-14  6:19       ` Cong Wang
  0 siblings, 2 replies; 11+ messages in thread
From: Jiri Kosina @ 2016-08-12 13:53 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang,
	David Miller, linux-kernel, netdev

On Fri, 12 Aug 2016, Daniel Borkmann wrote:

> This results in below panic. Tested reverting this patch and it fixes
> the panic.
> 
> Did you test this also with ingress or clsact qdisc (just try adding
> it to lo dev for example) ?

Hi Daniel,

thanks for the report. Hmm, I am pretty sure clsact worked for me, but 
I'll recheck.

> What happens is the following in qdisc_match_from_root():
> 
> [  995.422187] XXX qdisc:ffff88025e4fc800 queue:ffff880262759000
> dev:ffff880261cc2000 handle:ffff0000
> [  995.422200] XXX qdisc:ffffffff81cf8100 queue:ffffffff81cf8240 dev:
> (null) handle:ffff0000
> 
> I believe this is due to dev_ingress_queue_create() assigning the
> global noop_qdisc instance as qdisc_sleeping, which later qdisc_lookup()
> uses for qdisc_match_from_root().
> 
> But everything that uses things like noop_qdisc cannot work with the
> new qdisc_match_from_root(), because qdisc_dev(root) will always trigger
> NULL pointer dereference there. Reason is because the dev is always
> NULL for noop, it's a singleton, see noop_qdisc and noop_netdev_queue
> in sch_generic.c.
> 
> Now how to fix it? Creating separate noop instances each time it's set
> would be quite a waste of memory. Even fuglier would be to hack a static
> net device struct into sch_generic.c and let noop_netdev_queue point there
> to get to the hash table. Or we just not use qdisc_dev().

How about we actually extend a little bit the TCQ_F_BUILTIN special case 
test in qdisc_match_from_root()?

After the change, the only way how qdisc_dev() could be NULL should be a 
TCQ_F_BUILTIN case, right?

I was thinking about something like the patch below (the reasong being 
that ->dev would be NULL only in cases of singletonish qdiscs) ... 
wouldn't that also fix the issue you're seeing? Have to think it through a 
little bit more ..


diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 25aada7..1c9faed 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -260,6 +260,9 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 {
 	struct Qdisc *q;
 
+	if (!qdisc_dev(root))
+		return (root->handle == handle ? root : NULL);
+
 	if (!(root->flags & TCQ_F_BUILTIN) &&
 	    root->handle == handle)
 		return root;


Thanks!

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-12 13:53     ` Jiri Kosina
@ 2016-08-12 14:26       ` Daniel Borkmann
  2016-08-12 14:36         ` Jiri Kosina
  2016-08-14  6:19       ` Cong Wang
  1 sibling, 1 reply; 11+ messages in thread
From: Daniel Borkmann @ 2016-08-12 14:26 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang,
	David Miller, linux-kernel, netdev

On 08/12/2016 03:53 PM, Jiri Kosina wrote:
> On Fri, 12 Aug 2016, Daniel Borkmann wrote:
>
>> This results in below panic. Tested reverting this patch and it fixes
>> the panic.
>>
>> Did you test this also with ingress or clsact qdisc (just try adding
>> it to lo dev for example) ?
>
> Hi Daniel,
>
> thanks for the report. Hmm, I am pretty sure clsact worked for me, but
> I'll recheck.
>
>> What happens is the following in qdisc_match_from_root():
>>
>> [  995.422187] XXX qdisc:ffff88025e4fc800 queue:ffff880262759000
>> dev:ffff880261cc2000 handle:ffff0000
>> [  995.422200] XXX qdisc:ffffffff81cf8100 queue:ffffffff81cf8240 dev:
>> (null) handle:ffff0000
>>
>> I believe this is due to dev_ingress_queue_create() assigning the
>> global noop_qdisc instance as qdisc_sleeping, which later qdisc_lookup()
>> uses for qdisc_match_from_root().
>>
>> But everything that uses things like noop_qdisc cannot work with the
>> new qdisc_match_from_root(), because qdisc_dev(root) will always trigger
>> NULL pointer dereference there. Reason is because the dev is always
>> NULL for noop, it's a singleton, see noop_qdisc and noop_netdev_queue
>> in sch_generic.c.
>>
>> Now how to fix it? Creating separate noop instances each time it's set
>> would be quite a waste of memory. Even fuglier would be to hack a static
>> net device struct into sch_generic.c and let noop_netdev_queue point there
>> to get to the hash table. Or we just not use qdisc_dev().
>
> How about we actually extend a little bit the TCQ_F_BUILTIN special case
> test in qdisc_match_from_root()?
>
> After the change, the only way how qdisc_dev() could be NULL should be a
> TCQ_F_BUILTIN case, right?
>
> I was thinking about something like the patch below (the reasong being
> that ->dev would be NULL only in cases of singletonish qdiscs) ...
> wouldn't that also fix the issue you're seeing? Have to think it through a
> little bit more ..

Ahh, so this has the same effect as previously observed with the other fix.

Perhaps it's just a dumping issue, but to the below clsact, there shouldn't
be pfifo_fast instances appearing.

# tc qdisc show dev wlp2s0b1
qdisc mq 0: root
qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
# tc qdisc add dev wlp2s0b1 clsact
# tc qdisc show dev wlp2s0b1
qdisc mq 0: root
qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc clsact ffff: parent ffff:fff1
qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1

> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 25aada7..1c9faed 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -260,6 +260,9 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
>   {
>   	struct Qdisc *q;
>
> +	if (!qdisc_dev(root))
> +		return (root->handle == handle ? root : NULL);
> +
>   	if (!(root->flags & TCQ_F_BUILTIN) &&
>   	    root->handle == handle)
>   		return root;
>
>
> Thanks!
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-12 14:26       ` Daniel Borkmann
@ 2016-08-12 14:36         ` Jiri Kosina
  2016-08-12 14:44           ` Daniel Borkmann
  0 siblings, 1 reply; 11+ messages in thread
From: Jiri Kosina @ 2016-08-12 14:36 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang,
	David Miller, linux-kernel, netdev

On Fri, 12 Aug 2016, Daniel Borkmann wrote:

> > I was thinking about something like the patch below (the reasong being 
> > that ->dev would be NULL only in cases of singletonish qdiscs) ... 
> > wouldn't that also fix the issue you're seeing? Have to think it 
> > through a little bit more ..
> 
> Ahh, so this has the same effect as previously observed with the other fix.

Thanks a lot for confirming that this fixes the panic. I still have to 
think a little bit more about this though.

> Perhaps it's just a dumping issue, but to the below clsact, there shouldn't
> be pfifo_fast instances appearing.
> 
> # tc qdisc show dev wlp2s0b1
> qdisc mq 0: root
> qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> # tc qdisc add dev wlp2s0b1 clsact
> # tc qdisc show dev wlp2s0b1
> qdisc mq 0: root
> qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc clsact ffff: parent ffff:fff1
> qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1

Hmm, no immediate idea where those are coming from, we'll have to figure 
it out. The mq device used here has 4 queues, right?

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-12 14:36         ` Jiri Kosina
@ 2016-08-12 14:44           ` Daniel Borkmann
  0 siblings, 0 replies; 11+ messages in thread
From: Daniel Borkmann @ 2016-08-12 14:44 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Eric Dumazet, Jamal Hadi Salim, Phil Sutter, Cong Wang,
	David Miller, linux-kernel, netdev

On 08/12/2016 04:36 PM, Jiri Kosina wrote:
> On Fri, 12 Aug 2016, Daniel Borkmann wrote:
>
>>> I was thinking about something like the patch below (the reasong being
>>> that ->dev would be NULL only in cases of singletonish qdiscs) ...
>>> wouldn't that also fix the issue you're seeing? Have to think it
>>> through a little bit more ..
>>
>> Ahh, so this has the same effect as previously observed with the other fix.
>
> Thanks a lot for confirming that this fixes the panic. I still have to
> think a little bit more about this though.
>
>> Perhaps it's just a dumping issue, but to the below clsact, there shouldn't
>> be pfifo_fast instances appearing.
>>
>> # tc qdisc show dev wlp2s0b1
>> qdisc mq 0: root
>> qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> # tc qdisc add dev wlp2s0b1 clsact
>> # tc qdisc show dev wlp2s0b1
>> qdisc mq 0: root
>> qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc clsact ffff: parent ffff:fff1
>> qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>> qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>
> Hmm, no immediate idea where those are coming from, we'll have to figure
> it out. The mq device used here has 4 queues, right?

Yes, the first tc qdisc show is after boot and how it should normally
look like, so 4 tx queues.

# ls /sys/class/net/wlp2s0b1/queues/
rx-0  tx-0  tx-1  tx-2  tx-3

When adding clsact, only the 'qdisc clsact' line should be extra. Given
the extra pfifo_fast ones look the same as above, I would suspect a htab
dumping issue, perhaps. I can debug a bit later tonight on this.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-12 13:53     ` Jiri Kosina
  2016-08-12 14:26       ` Daniel Borkmann
@ 2016-08-14  6:19       ` Cong Wang
  2016-08-15 23:27         ` Jiri Kosina
  1 sibling, 1 reply; 11+ messages in thread
From: Cong Wang @ 2016-08-14  6:19 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Daniel Borkmann, Eric Dumazet, Jamal Hadi Salim, Phil Sutter,
	David Miller, LKML, Linux Kernel Network Developers

On Fri, Aug 12, 2016 at 6:53 AM, Jiri Kosina <jikos@kernel.org> wrote:
> On Fri, 12 Aug 2016, Daniel Borkmann wrote:
>
>> This results in below panic. Tested reverting this patch and it fixes
>> the panic.
>>
>> Did you test this also with ingress or clsact qdisc (just try adding
>> it to lo dev for example) ?
>
> Hi Daniel,
>
> thanks for the report. Hmm, I am pretty sure clsact worked for me, but
> I'll recheck.
>
>> What happens is the following in qdisc_match_from_root():
>>
>> [  995.422187] XXX qdisc:ffff88025e4fc800 queue:ffff880262759000
>> dev:ffff880261cc2000 handle:ffff0000
>> [  995.422200] XXX qdisc:ffffffff81cf8100 queue:ffffffff81cf8240 dev:
>> (null) handle:ffff0000
>>
>> I believe this is due to dev_ingress_queue_create() assigning the
>> global noop_qdisc instance as qdisc_sleeping, which later qdisc_lookup()
>> uses for qdisc_match_from_root().
>>
>> But everything that uses things like noop_qdisc cannot work with the
>> new qdisc_match_from_root(), because qdisc_dev(root) will always trigger
>> NULL pointer dereference there. Reason is because the dev is always
>> NULL for noop, it's a singleton, see noop_qdisc and noop_netdev_queue
>> in sch_generic.c.
>>
>> Now how to fix it? Creating separate noop instances each time it's set
>> would be quite a waste of memory. Even fuglier would be to hack a static
>> net device struct into sch_generic.c and let noop_netdev_queue point there
>> to get to the hash table. Or we just not use qdisc_dev().
>
> How about we actually extend a little bit the TCQ_F_BUILTIN special case
> test in qdisc_match_from_root()?
>
> After the change, the only way how qdisc_dev() could be NULL should be a
> TCQ_F_BUILTIN case, right?
>
> I was thinking about something like the patch below (the reasong being
> that ->dev would be NULL only in cases of singletonish qdiscs) ...
> wouldn't that also fix the issue you're seeing? Have to think it through a
> little bit more ..

I think this is probably why we never show noop qdisc in dump. So I think
we should relax the singleton rule for noop_qdisc, to save some code
for noop_qdisc case and also for dumping noop_qdisc.

I will try to work on a patch tomorrow.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] net: sched: convert qdisc linked list to hashtable
  2016-08-14  6:19       ` Cong Wang
@ 2016-08-15 23:27         ` Jiri Kosina
  0 siblings, 0 replies; 11+ messages in thread
From: Jiri Kosina @ 2016-08-15 23:27 UTC (permalink / raw)
  To: Cong Wang
  Cc: Daniel Borkmann, Eric Dumazet, Jamal Hadi Salim, Phil Sutter,
	David Miller, LKML, Linux Kernel Network Developers

On Sat, 13 Aug 2016, Cong Wang wrote:

> > How about we actually extend a little bit the TCQ_F_BUILTIN special case
> > test in qdisc_match_from_root()?
> >
> > After the change, the only way how qdisc_dev() could be NULL should be a
> > TCQ_F_BUILTIN case, right?
> >
> > I was thinking about something like the patch below (the reasong being
> > that ->dev would be NULL only in cases of singletonish qdiscs) ...
> > wouldn't that also fix the issue you're seeing? Have to think it through a
> > little bit more ..
> 
> I think this is probably why we never show noop qdisc in dump. 

Well, partially. A lot of 'default' qdiscs are omitted in a not really 
uniform and deterministic way. That's actually the primary point of this 
whole effort -- to get rid of the hidden qdiscs entirely.

> So I think we should relax the singleton rule for noop_qdisc, to save 
> some code for noop_qdisc case and also for dumping noop_qdisc.

Completely moving away from singleton qdiscs is one of the possibilities, 
but OTOH I think that my special-casing of !qdisc_dev(root) in 
qdisc_match_from_root() is correct handling of singletons. I've been 
completely off the grid for the past three days, but I plan to submit this 
as a proper followup fix tomorrow if noone has any objections.

> I will try to work on a patch tomorrow.

What still needs to be looked into are the duplicate clsact entries for 
multiqueue.

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2016-08-15 23:27 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-10  9:00 [PATCH 0/2] Convert qdisc linked list into a hashtable Jiri Kosina
2016-08-10  9:03 ` [PATCH 1/2] net: resolve symbol conflicts with generic hashtable.h Jiri Kosina
2016-08-10  9:05 ` [PATCH 2/2] net: sched: convert qdisc linked list to hashtable Jiri Kosina
2016-08-12 12:52   ` Daniel Borkmann
2016-08-12 13:53     ` Jiri Kosina
2016-08-12 14:26       ` Daniel Borkmann
2016-08-12 14:36         ` Jiri Kosina
2016-08-12 14:44           ` Daniel Borkmann
2016-08-14  6:19       ` Cong Wang
2016-08-15 23:27         ` Jiri Kosina
2016-08-11  0:22 ` [PATCH 0/2] Convert qdisc linked list into a hashtable David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).