All of lore.kernel.org
 help / color / mirror / Atom feed
* [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector
@ 2015-03-02 21:32 Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 1/6] fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush Alexander Duyck
                   ` (6 more replies)
  0 siblings, 7 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

This patch series is meant to mostly just clean up the fib_trie to prepare
it for the introduction of the key_vector.  As such there are a number of
minor clean-ups such as reformatting the tnode to match the format once the
key vector is introduced, some optimizations to drop the need for a leaf
parent pointer, and some changes to remove duplication of effort such as
the 2 look-ups that were essentially being done per node insertion.

---

Alexander Duyck (6):
      fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush
      fib_trie: Fib walk rcu should take a tnode and key instead of a trie and a leaf
      fib_trie: Fib find node should return parent
      fib_trie: Update insert and delete to make use of tp from find_node
      fib_trie: move leaf and tnode to occupy the same spot in the key vector
      fib_trie: Make fib_table rcu safe


 include/net/ip_fib.h     |   70 +++--
 include/net/netns/ipv4.h |    7 -
 net/ipv4/fib_frontend.c  |   52 +++-
 net/ipv4/fib_trie.c      |  628 +++++++++++++++++++++++-----------------------
 4 files changed, 399 insertions(+), 358 deletions(-)

--

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [net-next PATCH 1/6] fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
@ 2015-03-02 21:32 ` Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 2/6] fib_trie: Fib walk rcu should take a tnode and key instead of a trie and a leaf Alexander Duyck
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

This change makes it so that we only call resize on the tnodes, instead of
from each of the leaves.  By doing this we can significantly reduce the
amount of time spent resizing as we can update all of the leaves in the
tnode first before we make any determinations about resizing.  As a result
we can simply free the tnode in the case that all of the leaves from a
given tnode are flushed instead of resizing with each leaf removed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  141 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 63 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f485345..d8b68b4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1400,25 +1400,6 @@ found:
 EXPORT_SYMBOL_GPL(fib_table_lookup);
 
 /*
- * Remove the leaf and return parent.
- */
-static void trie_leaf_remove(struct trie *t, struct tnode *l)
-{
-	struct tnode *tp = node_parent(l);
-
-	pr_debug("entering trie_leaf_remove(%p)\n", l);
-
-	if (tp) {
-		put_child(tp, get_index(l->key, tp), NULL);
-		trie_rebalance(t, tp);
-	} else {
-		RCU_INIT_POINTER(t->trie, NULL);
-	}
-
-	node_free(l);
-}
-
-/*
  * Caller must hold RTNL.
  */
 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
@@ -1483,8 +1464,18 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!plen)
 		tb->tb_num_default--;
 
-	if (hlist_empty(&l->leaf))
-		trie_leaf_remove(t, l);
+	if (hlist_empty(&l->leaf)) {
+		struct tnode *tp = node_parent(l);
+
+		if (tp) {
+			put_child(tp, get_index(l->key, tp), NULL);
+			trie_rebalance(t, tp);
+		} else {
+			RCU_INIT_POINTER(t->trie, NULL);
+		}
+
+		node_free(l);
+	}
 
 	if (fa->fa_state & FA_S_ACCESSED)
 		rt_cache_flush(cfg->fc_nlinfo.nl_net);
@@ -1494,33 +1485,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	return 0;
 }
 
-static int trie_flush_leaf(struct tnode *l)
-{
-	struct hlist_node *tmp;
-	unsigned char slen = 0;
-	struct fib_alias *fa;
-	int found = 0;
-
-	hlist_for_each_entry_safe(fa, tmp, &l->leaf, fa_list) {
-		struct fib_info *fi = fa->fa_info;
-
-		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
-			hlist_del_rcu(&fa->fa_list);
-			fib_release_info(fa->fa_info);
-			alias_free_mem_rcu(fa);
-			found++;
-
-			continue;
-		}
-
-		slen = fa->fa_slen;
-	}
-
-	l->slen = slen;
-
-	return found;
-}
-
 /* Scan for the next right leaf starting at node p->child[idx]
  * Since we have back pointer, no recursion necessary.
  */
@@ -1588,30 +1552,81 @@ static struct tnode *trie_leafindex(struct trie *t, int index)
  */
 int fib_table_flush(struct fib_table *tb)
 {
-	struct trie *t = (struct trie *) tb->tb_data;
-	struct tnode *l, *ll = NULL;
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct hlist_node *tmp;
+	struct fib_alias *fa;
+	struct tnode *n, *pn;
+	unsigned long cindex;
+	unsigned char slen;
 	int found = 0;
 
-	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
-		found += trie_flush_leaf(l);
+	n = rcu_dereference(t->trie);
+	if (!n)
+		goto flush_complete;
+
+	pn = NULL;
+	cindex = 0;
+
+	while (IS_TNODE(n)) {
+		/* record pn and cindex for leaf walking */
+		pn = n;
+		cindex = 1ul << n->bits;
+backtrace:
+		/* walk trie in reverse order */
+		do {
+			while (!(cindex--)) {
+				t_key pkey = pn->key;
+
+				n = pn;
+				pn = node_parent(n);
+
+				/* resize completed node */
+				resize(t, n);
+
+				/* if we got the root we are done */
+				if (!pn)
+					goto flush_complete;
 
-		if (ll) {
-			if (hlist_empty(&ll->leaf))
-				trie_leaf_remove(t, ll);
-			else
-				leaf_pull_suffix(ll);
+				cindex = get_index(pkey, pn);
+			}
+
+			/* grab the next available node */
+			n = tnode_get_child(pn, cindex);
+		} while (!n);
+	}
+
+	/* track slen in case any prefixes survive */
+	slen = 0;
+
+	hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+			hlist_del_rcu(&fa->fa_list);
+			fib_release_info(fa->fa_info);
+			alias_free_mem_rcu(fa);
+			found++;
+
+			continue;
 		}
 
-		ll = l;
+		slen = fa->fa_slen;
 	}
 
-	if (ll) {
-		if (hlist_empty(&ll->leaf))
-			trie_leaf_remove(t, ll);
-		else
-			leaf_pull_suffix(ll);
+	/* update leaf slen */
+	n->slen = slen;
+
+	if (hlist_empty(&n->leaf)) {
+		put_child_root(pn, t, n->key, NULL);
+		node_free(n);
+	} else {
+		leaf_pull_suffix(n);
 	}
 
+	/* if trie is leaf only loop is completed */
+	if (pn)
+		goto backtrace;
+flush_complete:
 	pr_debug("trie_flush found=%d\n", found);
 	return found;
 }

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next PATCH 2/6] fib_trie: Fib walk rcu should take a tnode and key instead of a trie and a leaf
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 1/6] fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush Alexander Duyck
@ 2015-03-02 21:32 ` Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 3/6] fib_trie: Fib find node should return parent Alexander Duyck
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

This change makes it so that leaf_walk_rcu takes a tnode and a key instead
of the trie and a leaf.

The main idea behind this is to avoid using the leaf parent pointer as that
can have additional overhead in the future as I am trying to reduce the
size of a leaf down to 16 bytes on 64b systems and 12b on 32b systems.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  216 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 120 insertions(+), 96 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d8b68b4..5f8081f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1485,71 +1485,71 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	return 0;
 }
 
-/* Scan for the next right leaf starting at node p->child[idx]
- * Since we have back pointer, no recursion necessary.
- */
-static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
+/* Scan for the next leaf starting at the provided key value */
+static struct tnode *leaf_walk_rcu(struct tnode **pn, t_key key)
 {
-	do {
-		unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
-
-		while (idx < tnode_child_length(p)) {
-			c = tnode_get_child_rcu(p, idx++);
-			if (!c)
-				continue;
-
-			if (IS_LEAF(c))
-				return c;
-
-			/* Rescan start scanning in new node */
-			p = c;
-			idx = 0;
-		}
+	struct tnode *tn = NULL, *n = *pn;
+	unsigned long cindex;
 
-		/* Node empty, walk back up to parent */
-		c = p;
-	} while ((p = node_parent_rcu(c)) != NULL);
+	/* record parent node for backtracing */
+	tn = n;
+	cindex = n ? get_index(key, n) : 0;
 
-	return NULL; /* Root of trie */
-}
+	/* this loop is meant to try and find the key in the trie */
+	while (n) {
+		unsigned long idx = get_index(key, n);
 
-static struct tnode *trie_firstleaf(struct trie *t)
-{
-	struct tnode *n = rcu_dereference_rtnl(t->trie);
+		/* guarantee forward progress on the keys */
+		if (IS_LEAF(n) && (n->key >= key))
+			goto found;
+		if (idx >> n->bits)
+			break;
 
-	if (!n)
-		return NULL;
+		/* record parent and next child index */
+		tn = n;
+		cindex = idx;
 
-	if (IS_LEAF(n))          /* trie is just a leaf */
-		return n;
+		/* descend into the next child */
+		n = tnode_get_child_rcu(tn, cindex++);
+	}
 
-	return leaf_walk_rcu(n, NULL);
-}
+	/* this loop will search for the next leaf with a greater key */
+	while (tn) {
+		/* if we exhausted the parent node we will need to climb */
+		if (cindex >> tn->bits) {
+			t_key pkey = tn->key;
 
-static struct tnode *trie_nextleaf(struct tnode *l)
-{
-	struct tnode *p = node_parent_rcu(l);
+			tn = node_parent_rcu(tn);
+			if (!tn)
+				break;
 
-	if (!p)
-		return NULL;	/* trie with just one leaf */
+			cindex = get_index(pkey, tn) + 1;
+			continue;
+		}
 
-	return leaf_walk_rcu(p, l);
-}
+		/* grab the next available node */
+		n = tnode_get_child_rcu(tn, cindex++);
+		if (!n)
+			continue;
 
-static struct tnode *trie_leafindex(struct trie *t, int index)
-{
-	struct tnode *l = trie_firstleaf(t);
+		/* no need to compare keys since we bumped the index */
+		if (IS_LEAF(n))
+			goto found;
 
-	while (l && index-- > 0)
-		l = trie_nextleaf(l);
+		/* Rescan start scanning in new node */
+		tn = n;
+		cindex = 0;
+	}
 
-	return l;
+	*pn = tn;
+	return NULL; /* Root of trie */
+found:
+	/* if we are at the limit for keys just return NULL for the tnode */
+	*pn = (n->key == KEY_MAX) ? NULL : tn;
+	return n;
 }
 
-
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
 int fib_table_flush(struct fib_table *tb)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
@@ -1680,42 +1680,42 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 		   struct netlink_callback *cb)
 {
-	struct tnode *l;
-	struct trie *t = (struct trie *) tb->tb_data;
-	t_key key = cb->args[2];
-	int count = cb->args[3];
-
-	rcu_read_lock();
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct tnode *l, *tp;
 	/* Dump starting at last key.
 	 * Note: 0.0.0.0/0 (ie default) is first key.
 	 */
-	if (count == 0)
-		l = trie_firstleaf(t);
-	else {
-		/* Normally, continue from last key, but if that is missing
-		 * fallback to using slow rescan
-		 */
-		l = fib_find_node(t, key);
-		if (!l)
-			l = trie_leafindex(t, count);
-	}
+	int count = cb->args[2];
+	t_key key = cb->args[3];
+
+	rcu_read_lock();
 
-	while (l) {
-		cb->args[2] = l->key;
+	tp = rcu_dereference_rtnl(t->trie);
+
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
-			cb->args[3] = count;
+			cb->args[3] = key;
+			cb->args[2] = count;
 			rcu_read_unlock();
 			return -1;
 		}
 
 		++count;
-		l = trie_nextleaf(l);
+		key = l->key + 1;
+
 		memset(&cb->args[4], 0,
 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+		/* stop loop if key wrapped back to 0 */
+		if (key < l->key)
+			break;
 	}
-	cb->args[3] = count;
+
 	rcu_read_unlock();
 
+	cb->args[3] = key;
+	cb->args[2] = count;
+
 	return skb->len;
 }
 
@@ -2186,31 +2186,46 @@ static const struct file_operations fib_trie_fops = {
 
 struct fib_route_iter {
 	struct seq_net_private p;
-	struct trie *main_trie;
+	struct fib_table *main_tb;
+	struct tnode *tnode;
 	loff_t	pos;
 	t_key	key;
 };
 
 static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
 {
-	struct tnode *l = NULL;
-	struct trie *t = iter->main_trie;
+	struct fib_table *tb = iter->main_tb;
+	struct tnode *l, **tp = &iter->tnode;
+	struct trie *t;
+	t_key key;
 
-	/* use cache location of last found key */
-	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
+	/* use cache location of next-to-find key */
+	if (iter->pos > 0 && pos >= iter->pos) {
 		pos -= iter->pos;
-	else {
+		key = iter->key;
+	} else {
+		t = (struct trie *)tb->tb_data;
+		iter->tnode = rcu_dereference_rtnl(t->trie);
 		iter->pos = 0;
-		l = trie_firstleaf(t);
+		key = 0;
 	}
 
-	while (l && pos-- > 0) {
+	while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+		key = l->key + 1;
 		iter->pos++;
-		l = trie_nextleaf(l);
+
+		if (pos-- <= 0)
+			break;
+
+		l = NULL;
+
+		/* handle unlikely case of a key wrap */
+		if (!key)
+			break;
 	}
 
 	if (l)
-		iter->key = pos;	/* remember it */
+		iter->key = key;	/* remember it */
 	else
 		iter->pos = 0;		/* forget it */
 
@@ -2222,37 +2237,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct fib_route_iter *iter = seq->private;
 	struct fib_table *tb;
+	struct trie *t;
 
 	rcu_read_lock();
+
 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
 	if (!tb)
 		return NULL;
 
-	iter->main_trie = (struct trie *) tb->tb_data;
-	if (*pos == 0)
-		return SEQ_START_TOKEN;
-	else
-		return fib_route_get_idx(iter, *pos - 1);
+	iter->main_tb = tb;
+
+	if (*pos != 0)
+		return fib_route_get_idx(iter, *pos);
+
+	t = (struct trie *)tb->tb_data;
+	iter->tnode = rcu_dereference_rtnl(t->trie);
+	iter->pos = 0;
+	iter->key = 0;
+
+	return SEQ_START_TOKEN;
 }
 
 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct fib_route_iter *iter = seq->private;
-	struct tnode *l = v;
+	struct tnode *l = NULL;
+	t_key key = iter->key;
 
 	++*pos;
-	if (v == SEQ_START_TOKEN) {
-		iter->pos = 0;
-		l = trie_firstleaf(iter->main_trie);
-	} else {
+
+	/* only allow key of 0 for start of sequence */
+	if ((v == SEQ_START_TOKEN) || key)
+		l = leaf_walk_rcu(&iter->tnode, key);
+
+	if (l) {
+		iter->key = l->key + 1;
 		iter->pos++;
-		l = trie_nextleaf(l);
+	} else {
+		iter->pos = 0;
 	}
 
-	if (l)
-		iter->key = l->key;
-	else
-		iter->pos = 0;
 	return l;
 }
 

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next PATCH 3/6] fib_trie: Fib find node should return parent
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 1/6] fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 2/6] fib_trie: Fib walk rcu should take a tnode and key instead of a trie and a leaf Alexander Duyck
@ 2015-03-02 21:32 ` Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 4/6] fib_trie: Update insert and delete to make use of tp from find_node Alexander Duyck
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

This change makes it so that the parent pointer is returned by reference in
fib_find_node.  By doing this I can use it to find the parent node when I
am performing an insertion and I don't have to look for it again in
fib_insert_node.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |   28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5f8081f..6b9cc70 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -912,10 +912,12 @@ static void fib_insert_alias(struct tnode *l, struct fib_alias *fa,
 }
 
 /* rcu_read_lock needs to be hold by caller from readside */
-static struct tnode *fib_find_node(struct trie *t, u32 key)
+static struct tnode *fib_find_node(struct trie *t, struct tnode **tp, u32 key)
 {
 	struct tnode *n = rcu_dereference_rtnl(t->trie);
 
+	*tp = NULL;
+
 	while (n) {
 		unsigned long index = get_index(key, n);
 
@@ -936,6 +938,7 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 		if (IS_LEAF(n))
 			break;
 
+		*tp = n;
 		n = tnode_get_child_rcu(n, index);
 	}
 
@@ -1071,15 +1074,15 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen)
  */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
-	struct trie *t = (struct trie *) tb->tb_data;
+	struct trie *t = (struct trie *)tb->tb_data;
 	struct fib_alias *fa, *new_fa;
+	struct tnode *l, *tp;
 	struct fib_info *fi;
 	u8 plen = cfg->fc_dst_len;
 	u8 slen = KEYLENGTH - plen;
 	u8 tos = cfg->fc_tos;
-	u32 key, mask;
+	u32 key;
 	int err;
-	struct tnode *l;
 
 	if (plen > KEYLENGTH)
 		return -EINVAL;
@@ -1088,9 +1091,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 
 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
 
-	mask = ntohl(inet_make_mask(plen));
-
-	if (key & ~mask)
+	if ((plen < KEYLENGTH) && (key << plen))
 		return -EINVAL;
 
 	fi = fib_create_info(cfg);
@@ -1099,7 +1100,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		goto err;
 	}
 
-	l = fib_find_node(t, key);
+	l = fib_find_node(t, &tp, key);
 	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority) : NULL;
 
 	/* Now fa, if non-NULL, points to the first fib alias
@@ -1406,22 +1407,21 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct fib_alias *fa, *fa_to_delete;
+	struct tnode *l, *tp;
 	u8 plen = cfg->fc_dst_len;
-	u8 tos = cfg->fc_tos;
 	u8 slen = KEYLENGTH - plen;
-	struct tnode *l;
-	u32 key, mask;
+	u8 tos = cfg->fc_tos;
+	u32 key;
 
 	if (plen > KEYLENGTH)
 		return -EINVAL;
 
 	key = ntohl(cfg->fc_dst);
-	mask = ntohl(inet_make_mask(plen));
 
-	if (key & ~mask)
+	if ((plen < KEYLENGTH) && (key << plen))
 		return -EINVAL;
 
-	l = fib_find_node(t, key);
+	l = fib_find_node(t, &tp, key);
 	if (!l)
 		return -ESRCH;
 

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next PATCH 4/6] fib_trie: Update insert and delete to make use of tp from find_node
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
                   ` (2 preceding siblings ...)
  2015-03-02 21:32 ` [net-next PATCH 3/6] fib_trie: Fib find node should return parent Alexander Duyck
@ 2015-03-02 21:32 ` Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 5/6] fib_trie: move leaf and tnode to occupy the same spot in the key vector Alexander Duyck
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

This change makes it so that the insert and delete functions make use of
the tnode pointer returned in the fib_find_node call.  By doing this we
will not have to rely on the parent pointer in the leaf which will be going
away soon.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  237 ++++++++++++++++++++-------------------------------
 1 file changed, 95 insertions(+), 142 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 6b9cc70..454b8c5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -300,7 +300,7 @@ static inline void empty_child_dec(struct tnode *n)
 	n->empty_children-- ? : n->full_children--;
 }
 
-static struct tnode *leaf_new(t_key key)
+static struct tnode *leaf_new(t_key key, struct fib_alias *fa)
 {
 	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 	if (l) {
@@ -310,12 +310,14 @@ static struct tnode *leaf_new(t_key key)
 		 * as the nodes are searched
 		 */
 		l->key = key;
-		l->slen = 0;
+		l->slen = fa->fa_slen;
 		l->pos = 0;
 		/* set bits to 0 indicating we are not a tnode */
 		l->bits = 0;
 
+		/* link leaf to fib alias */
 		INIT_HLIST_HEAD(&l->leaf);
+		hlist_add_head(&fa->fa_list, &l->leaf);
 	}
 	return l;
 }
@@ -842,10 +844,8 @@ static void resize(struct trie *t, struct tnode *tn)
 	}
 }
 
-static void leaf_pull_suffix(struct tnode *l)
+static void leaf_pull_suffix(struct tnode *tp, struct tnode *l)
 {
-	struct tnode *tp = node_parent(l);
-
 	while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
 		if (update_suffix(tp) > l->slen)
 			break;
@@ -853,10 +853,8 @@ static void leaf_pull_suffix(struct tnode *l)
 	}
 }
 
-static void leaf_push_suffix(struct tnode *l)
+static void leaf_push_suffix(struct tnode *tn, struct tnode *l)
 {
-	struct tnode *tn = node_parent(l);
-
 	/* if this is a new leaf then tn will be NULL and we can sort
 	 * out parent suffix lengths as a part of trie_rebalance
 	 */
@@ -866,51 +864,6 @@ static void leaf_push_suffix(struct tnode *l)
 	}
 }
 
-static void fib_remove_alias(struct tnode *l, struct fib_alias *old)
-{
-	/* record the location of the previous list_info entry */
-	struct hlist_node **pprev = old->fa_list.pprev;
-	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
-
-	/* remove the fib_alias from the list */
-	hlist_del_rcu(&old->fa_list);
-
-	/* only access fa if it is pointing at the last valid hlist_node */
-	if (hlist_empty(&l->leaf) || (*pprev))
-		return;
-
-	/* update the trie with the latest suffix length */
-	l->slen = fa->fa_slen;
-	leaf_pull_suffix(l);
-}
-
-static void fib_insert_alias(struct tnode *l, struct fib_alias *fa,
-			     struct fib_alias *new)
-{
-	if (fa) {
-		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
-	} else {
-		struct fib_alias *last;
-
-		hlist_for_each_entry(last, &l->leaf, fa_list) {
-			if (new->fa_slen < last->fa_slen)
-				break;
-			fa = last;
-		}
-
-		if (fa)
-			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
-		else
-			hlist_add_head_rcu(&new->fa_list, &l->leaf);
-	}
-
-	/* if we added to the tail node then we need to update slen */
-	if (l->slen < new->fa_slen) {
-		l->slen = new->fa_slen;
-		leaf_push_suffix(l);
-	}
-}
-
 /* rcu_read_lock needs to be hold by caller from readside */
 static struct tnode *fib_find_node(struct trie *t, struct tnode **tp, u32 key)
 {
@@ -974,61 +927,28 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 {
 	struct tnode *tp;
 
-	while ((tp = node_parent(tn)) != NULL) {
+	while (tn) {
+		tp = node_parent(tn);
 		resize(t, tn);
 		tn = tp;
 	}
-
-	/* Handle last (top) tnode */
-	if (IS_TNODE(tn))
-		resize(t, tn);
 }
 
 /* only used from updater-side */
-
-static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen)
+static int fib_insert_node(struct trie *t, struct tnode *tp,
+			   struct fib_alias *new, t_key key)
 {
-	struct tnode *l, *n, *tp = NULL;
-
-	n = rtnl_dereference(t->trie);
-
-	/* If we point to NULL, stop. Either the tree is empty and we should
-	 * just put a new leaf in if, or we have reached an empty child slot,
-	 * and we should just put our new leaf in that.
-	 *
-	 * If we hit a node with a key that does't match then we should stop
-	 * and create a new tnode to replace that node and insert ourselves
-	 * and the other node into the new tnode.
-	 */
-	while (n) {
-		unsigned long index = get_index(key, n);
-
-		/* This bit of code is a bit tricky but it combines multiple
-		 * checks into a single check.  The prefix consists of the
-		 * prefix plus zeros for the "bits" in the prefix. The index
-		 * is the difference between the key and this value.  From
-		 * this we can actually derive several pieces of data.
-		 *   if !(index >> bits)
-		 *     we know the value is child index
-		 *   else
-		 *     we have a mismatch in skip bits and failed
-		 */
-		if (index >> n->bits)
-			break;
-
-		/* we have found a leaf. Prefixes have already been compared */
-		if (IS_LEAF(n)) {
-			/* Case 1: n is a leaf, and prefixes match*/
-			return n;
-		}
-
-		tp = n;
-		n = tnode_get_child_rcu(n, index);
-	}
+	struct tnode *n, *l;
 
-	l = leaf_new(key);
+	l = leaf_new(key, new);
 	if (!l)
-		return NULL;
+		return -ENOMEM;
+
+	/* retrieve child from parent node */
+	if (tp)
+		n = tnode_get_child(tp, get_index(key, tp));
+	else
+		n = rcu_dereference_rtnl(t->trie);
 
 	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
 	 *
@@ -1042,7 +962,7 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen)
 		tn = tnode_new(key, __fls(key ^ n->key), 1);
 		if (!tn) {
 			node_free(l);
-			return NULL;
+			return -ENOMEM;
 		}
 
 		/* initialize routes out of node */
@@ -1058,20 +978,47 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen)
 	}
 
 	/* Case 3: n is NULL, and will just insert a new leaf */
-	if (tp) {
-		NODE_INIT_PARENT(l, tp);
-		put_child(tp, get_index(key, tp), l);
-		trie_rebalance(t, tp);
+	NODE_INIT_PARENT(l, tp);
+	put_child_root(tp, t, key, l);
+	trie_rebalance(t, tp);
+
+	return 0;
+}
+
+static int fib_insert_alias(struct trie *t, struct tnode *tp,
+			    struct tnode *l, struct fib_alias *new,
+			    struct fib_alias *fa, t_key key)
+{
+	if (!l)
+		return fib_insert_node(t, tp, new, key);
+
+	if (fa) {
+		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
 	} else {
-		rcu_assign_pointer(t->trie, l);
+		struct fib_alias *last;
+
+		hlist_for_each_entry(last, &l->leaf, fa_list) {
+			if (new->fa_slen < last->fa_slen)
+				break;
+			fa = last;
+		}
+
+		if (fa)
+			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+		else
+			hlist_add_head_rcu(&new->fa_list, &l->leaf);
 	}
 
-	return l;
+	/* if we added to the tail node then we need to update slen */
+	if (l->slen < new->fa_slen) {
+		l->slen = new->fa_slen;
+		leaf_push_suffix(tp, l);
+	}
+
+	return 0;
 }
 
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
@@ -1199,19 +1146,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->fa_slen = slen;
 
 	/* Insert new entry to the list. */
-	if (!l) {
-		l = fib_insert_node(t, key, plen);
-		if (unlikely(!l)) {
-			err = -ENOMEM;
-			goto out_free_new_fa;
-		}
-	}
+	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+	if (err)
+		goto out_free_new_fa;
 
 	if (!plen)
 		tb->tb_num_default++;
 
-	fib_insert_alias(l, fa, new_fa);
-
 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
@@ -1400,9 +1341,36 @@ found:
 }
 EXPORT_SYMBOL_GPL(fib_table_lookup);
 
-/*
- * Caller must hold RTNL.
- */
+static void fib_remove_alias(struct trie *t, struct tnode *tp,
+			     struct tnode *l, struct fib_alias *old)
+{
+	/* record the location of the previous list_info entry */
+	struct hlist_node **pprev = old->fa_list.pprev;
+	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
+
+	/* remove the fib_alias from the list */
+	hlist_del_rcu(&old->fa_list);
+
+	/* if we emptied the list this leaf will be freed and we can sort
+	 * out parent suffix lengths as a part of trie_rebalance
+	 */
+	if (hlist_empty(&l->leaf)) {
+		put_child_root(tp, t, l->key, NULL);
+		node_free(l);
+		trie_rebalance(t, tp);
+		return;
+	}
+
+	/* only access fa if it is pointing at the last valid hlist_node */
+	if (*pprev)
+		return;
+
+	/* update the trie with the latest suffix length */
+	l->slen = fa->fa_slen;
+	leaf_pull_suffix(tp, l);
+}
+
+/* Caller must hold RTNL. */
 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
@@ -1426,7 +1394,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 		return -ESRCH;
 
 	fa = fib_find_alias(&l->leaf, slen, tos, 0);
-
 	if (!fa)
 		return -ESRCH;
 
@@ -1455,33 +1422,19 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!fa_to_delete)
 		return -ESRCH;
 
-	fa = fa_to_delete;
-	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
-	fib_remove_alias(l, fa);
-
 	if (!plen)
 		tb->tb_num_default--;
 
-	if (hlist_empty(&l->leaf)) {
-		struct tnode *tp = node_parent(l);
-
-		if (tp) {
-			put_child(tp, get_index(l->key, tp), NULL);
-			trie_rebalance(t, tp);
-		} else {
-			RCU_INIT_POINTER(t->trie, NULL);
-		}
-
-		node_free(l);
-	}
+	fib_remove_alias(t, tp, l, fa_to_delete);
 
-	if (fa->fa_state & FA_S_ACCESSED)
+	if (fa_to_delete->fa_state & FA_S_ACCESSED)
 		rt_cache_flush(cfg->fc_nlinfo.nl_net);
 
-	fib_release_info(fa->fa_info);
-	alias_free_mem_rcu(fa);
+	fib_release_info(fa_to_delete->fa_info);
+	alias_free_mem_rcu(fa_to_delete);
 	return 0;
 }
 
@@ -1620,7 +1573,7 @@ backtrace:
 		put_child_root(pn, t, n->key, NULL);
 		node_free(n);
 	} else {
-		leaf_pull_suffix(n);
+		leaf_pull_suffix(pn, n);
 	}
 
 	/* if trie is leaf only loop is completed */

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next PATCH 5/6] fib_trie: move leaf and tnode to occupy the same spot in the key vector
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
                   ` (3 preceding siblings ...)
  2015-03-02 21:32 ` [net-next PATCH 4/6] fib_trie: Update insert and delete to make use of tp from find_node Alexander Duyck
@ 2015-03-02 21:32 ` Alexander Duyck
  2015-03-02 21:32 ` [net-next PATCH 6/6] fib_trie: Make fib_table rcu safe Alexander Duyck
  2015-03-04  5:16 ` [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector David Miller
  6 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

If we are going to compact the leaf and tnode we first need to make sure
the fields are all in the same place.  In that regard I am moving the leaf
pointer which represents the fib_alias hash list to occupy what is
currently the first key_vector pointer.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |   51 +++++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 454b8c5..502c89c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -94,24 +94,27 @@ typedef unsigned int t_key;
 #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
 
 struct tnode {
+	struct rcu_head rcu;
+
+	t_key empty_children; /* KEYLENGTH bits needed */
+	t_key full_children;  /* KEYLENGTH bits needed */
+	struct tnode __rcu *parent;
+
 	t_key key;
-	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char slen;
-	struct tnode __rcu *parent;
-	struct rcu_head rcu;
 	union {
-		/* The fields in this struct are valid if bits > 0 (TNODE) */
-		struct {
-			t_key empty_children; /* KEYLENGTH bits needed */
-			t_key full_children;  /* KEYLENGTH bits needed */
-			struct tnode __rcu *child[0];
-		};
-		/* This list pointer if valid if bits == 0 (LEAF) */
+		/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
 		struct hlist_head leaf;
+		/* This array is valid if (pos | bits) > 0 (TNODE) */
+		struct tnode __rcu *tnode[0];
 	};
 };
 
+#define TNODE_SIZE(n)	offsetof(struct tnode, tnode[n])
+#define LEAF_SIZE	TNODE_SIZE(1)
+
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 struct trie_use_stats {
 	unsigned int gets;
@@ -180,14 +183,14 @@ static inline unsigned long tnode_child_length(const struct tnode *tn)
 static inline struct tnode *tnode_get_child(const struct tnode *tn,
 					    unsigned long i)
 {
-	return rtnl_dereference(tn->child[i]);
+	return rtnl_dereference(tn->tnode[i]);
 }
 
 /* caller must hold RCU read lock or RTNL */
 static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
 						unsigned long i)
 {
-	return rcu_dereference_rtnl(tn->child[i]);
+	return rcu_dereference_rtnl(tn->tnode[i]);
 }
 
 /* To understand this stuff, an understanding of keys and all their bits is
@@ -266,7 +269,7 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
 }
 
 #define TNODE_KMALLOC_MAX \
-	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
+	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct tnode *))
 
 static void __node_free_rcu(struct rcu_head *head)
 {
@@ -324,7 +327,7 @@ static struct tnode *leaf_new(t_key key, struct fib_alias *fa)
 
 static struct tnode *tnode_new(t_key key, int pos, int bits)
 {
-	size_t sz = offsetof(struct tnode, child[1ul << bits]);
+	size_t sz = TNODE_SIZE(1ul << bits);
 	struct tnode *tn = tnode_alloc(sz);
 	unsigned int shift = pos + bits;
 
@@ -343,7 +346,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 			tn->empty_children = 1ul << bits;
 	}
 
-	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+	pr_debug("AT %p s=%zu %zu\n", tn, TNODE_SIZE(0),
 		 sizeof(struct tnode *) << bits);
 	return tn;
 }
@@ -384,7 +387,7 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	if (n && (tn->slen < n->slen))
 		tn->slen = n->slen;
 
-	rcu_assign_pointer(tn->child[i], n);
+	rcu_assign_pointer(tn->tnode[i], n);
 }
 
 static void update_children(struct tnode *tn)
@@ -435,7 +438,7 @@ static void tnode_free(struct tnode *tn)
 
 	while (head) {
 		head = head->next;
-		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
 		node_free(tn);
 
 		tn = container_of(head, struct tnode, rcu);
@@ -788,7 +791,7 @@ static void resize(struct trie *t, struct tnode *tn)
 	 * doing it ourselves.  This way we can let RCU fully do its
 	 * thing without us interfering
 	 */
-	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
+	cptr = tp ? &tp->tnode[get_index(tn->key, tp)] : &t->trie;
 	BUG_ON(tn != rtnl_dereference(*cptr));
 
 	/* Double as long as the resulting node has a number of
@@ -1235,7 +1238,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
 	for (;;) {
 		/* record the pointer where our next node pointer is stored */
-		struct tnode __rcu **cptr = n->child;
+		struct tnode __rcu **cptr = n->tnode;
 
 		/* This test verifies that none of the bits that differ
 		 * between the key and the prefix exist in the region of
@@ -1281,7 +1284,7 @@ backtrace:
 			cindex &= cindex - 1;
 
 			/* grab pointer for next child node */
-			cptr = &pn->child[cindex];
+			cptr = &pn->tnode[cindex];
 		}
 	}
 
@@ -1679,7 +1682,7 @@ void __init fib_trie_init(void)
 					  0, SLAB_PANIC, NULL);
 
 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
-					   sizeof(struct tnode),
+					   LEAF_SIZE,
 					   0, SLAB_PANIC, NULL);
 }
 
@@ -1837,13 +1840,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
 
 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
-	bytes = sizeof(struct tnode) * stat->leaves;
+	bytes = LEAF_SIZE * stat->leaves;
 
 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
 	bytes += sizeof(struct fib_alias) * stat->prefixes;
 
 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
-	bytes += sizeof(struct tnode) * stat->tnodes;
+	bytes += TNODE_SIZE(0) * stat->tnodes;
 
 	max = MAX_STAT_DEPTH;
 	while (max > 0 && stat->nodesizes[max-1] == 0)
@@ -1912,7 +1915,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq,
 		   "Basic info: size of leaf:"
 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
-		   sizeof(struct tnode), sizeof(struct tnode));
+		   LEAF_SIZE, TNODE_SIZE(0));
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next PATCH 6/6] fib_trie: Make fib_table rcu safe
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
                   ` (4 preceding siblings ...)
  2015-03-02 21:32 ` [net-next PATCH 5/6] fib_trie: move leaf and tnode to occupy the same spot in the key vector Alexander Duyck
@ 2015-03-02 21:32 ` Alexander Duyck
  2015-03-04  5:16 ` [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector David Miller
  6 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-02 21:32 UTC (permalink / raw)
  To: netdev; +Cc: davem

The fib_table was wrapped in several places with an
rcu_read_lock/rcu_read_unlock however after looking over the code I found
several spots where the tables were being accessed as just standard
pointers without any protections.  This change fixes that so that all of
the proper protections are in place when accessing the table to take RCU
replacement or removal of the table into account.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 include/net/ip_fib.h     |   70 ++++++++++++++++++++++++++++------------------
 include/net/netns/ipv4.h |    7 +++--
 net/ipv4/fib_frontend.c  |   52 ++++++++++++++++++++++++----------
 net/ipv4/fib_trie.c      |   23 +++++++++++----
 4 files changed, 99 insertions(+), 53 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index cba4b7c..825cb28 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -185,6 +185,7 @@ struct fib_table {
 	u32			tb_id;
 	int			tb_default;
 	int			tb_num_default;
+	struct rcu_head		rcu;
 	unsigned long		tb_data[0];
 };
 
@@ -206,12 +207,16 @@ void fib_free_table(struct fib_table *tb);
 
 static inline struct fib_table *fib_get_table(struct net *net, u32 id)
 {
+	struct hlist_node *tb_hlist;
 	struct hlist_head *ptr;
 
 	ptr = id == RT_TABLE_LOCAL ?
 		&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
 		&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
-	return hlist_entry(ptr->first, struct fib_table, tb_hlist);
+
+	tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr));
+
+	return hlist_entry(tb_hlist, struct fib_table, tb_hlist);
 }
 
 static inline struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -222,15 +227,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id)
 static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 			     struct fib_result *res)
 {
-	int err = -ENETUNREACH;
+	struct fib_table *tb;
+	int err;
 
 	rcu_read_lock();
 
-	if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res,
-			      FIB_LOOKUP_NOREF) ||
-	    !fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res,
-			      FIB_LOOKUP_NOREF))
-		err = 0;
+	for (err = 0; !err; err = -ENETUNREACH) {
+		tb = fib_get_table(net, RT_TABLE_LOCAL);
+		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+			break;
+		tb = fib_get_table(net, RT_TABLE_MAIN);
+		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+			break;
+	}
 
 	rcu_read_unlock();
 
@@ -249,28 +258,33 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res);
 static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 			     struct fib_result *res)
 {
-	if (!net->ipv4.fib_has_custom_rules) {
-		int err = -ENETUNREACH;
-
-		rcu_read_lock();
-
-		res->tclassid = 0;
-		if ((net->ipv4.fib_local &&
-		     !fib_table_lookup(net->ipv4.fib_local, flp, res,
-				       FIB_LOOKUP_NOREF)) ||
-		    (net->ipv4.fib_main &&
-		     !fib_table_lookup(net->ipv4.fib_main, flp, res,
-				       FIB_LOOKUP_NOREF)) ||
-		    (net->ipv4.fib_default &&
-		     !fib_table_lookup(net->ipv4.fib_default, flp, res,
-				       FIB_LOOKUP_NOREF)))
-			err = 0;
-
-		rcu_read_unlock();
-
-		return err;
+	struct fib_table *tb;
+	int err;
+
+	if (net->ipv4.fib_has_custom_rules)
+		return __fib_lookup(net, flp, res);
+
+	rcu_read_lock();
+
+	res->tclassid = 0;
+
+	for (err = 0; !err; err = -ENETUNREACH) {
+		tb = rcu_dereference_rtnl(net->ipv4.fib_local);
+		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+			break;
+
+		tb = rcu_dereference_rtnl(net->ipv4.fib_main);
+		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+			break;
+
+		tb = rcu_dereference_rtnl(net->ipv4.fib_default);
+		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+			break;
 	}
-	return __fib_lookup(net, flp, res);
+
+	rcu_read_unlock();
+
+	return err;
 }
 
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 1b26c6c..db1db15 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
 
 #include <linux/uidgid.h>
 #include <net/inet_frag.h>
+#include <linux/rcupdate.h>
 
 struct tcpm_hash_bucket;
 struct ctl_table_header;
@@ -38,9 +39,9 @@ struct netns_ipv4 {
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rules_ops	*rules_ops;
 	bool			fib_has_custom_rules;
-	struct fib_table	*fib_local;
-	struct fib_table	*fib_main;
-	struct fib_table	*fib_default;
+	struct fib_table __rcu	*fib_local;
+	struct fib_table __rcu	*fib_main;
+	struct fib_table __rcu	*fib_default;
 #endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	int			fib_num_tclassid_users;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 57be71d..220c4b4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -89,17 +89,14 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 
 	switch (id) {
 	case RT_TABLE_LOCAL:
-		net->ipv4.fib_local = tb;
+		rcu_assign_pointer(net->ipv4.fib_local, tb);
 		break;
-
 	case RT_TABLE_MAIN:
-		net->ipv4.fib_main = tb;
+		rcu_assign_pointer(net->ipv4.fib_main, tb);
 		break;
-
 	case RT_TABLE_DEFAULT:
-		net->ipv4.fib_default = tb;
+		rcu_assign_pointer(net->ipv4.fib_default, tb);
 		break;
-
 	default:
 		break;
 	}
@@ -132,13 +129,14 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
-	struct fib_table *tb;
-	struct hlist_head *head;
 	unsigned int h;
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
-		head = &net->ipv4.fib_table_hash[h];
-		hlist_for_each_entry(tb, head, tb_hlist)
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct hlist_node *tmp;
+		struct fib_table *tb;
+
+		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 			flushed += fib_table_flush(tb);
 	}
 
@@ -665,10 +663,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
+	rcu_read_lock();
+
 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
 		e = 0;
 		head = &net->ipv4.fib_table_hash[h];
-		hlist_for_each_entry(tb, head, tb_hlist) {
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 			if (e < s_e)
 				goto next;
 			if (dumped)
@@ -682,6 +682,8 @@ next:
 		}
 	}
 out:
+	rcu_read_unlock();
+
 	cb->args[1] = e;
 	cb->args[0] = h;
 
@@ -1117,14 +1119,34 @@ static void ip_fib_net_exit(struct net *net)
 
 	rtnl_lock();
 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
-		struct fib_table *tb;
-		struct hlist_head *head;
+		struct hlist_head *head = &net->ipv4.fib_table_hash[i];
 		struct hlist_node *tmp;
+		struct fib_table *tb;
+
+		/* this is done in two passes as flushing the table could
+		 * cause it to be reallocated in order to accommodate new
+		 * tnodes at the root as the table shrinks.
+		 */
+		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
+			fib_table_flush(tb);
 
-		head = &net->ipv4.fib_table_hash[i];
 		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+			switch (tb->tb_id) {
+			case RT_TABLE_LOCAL:
+				RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
+				break;
+			case RT_TABLE_MAIN:
+				RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+				break;
+			case RT_TABLE_DEFAULT:
+				RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+				break;
+			default:
+				break;
+			}
+#endif
 			hlist_del(&tb->tb_hlist);
-			fib_table_flush(tb);
 			fib_free_table(tb);
 		}
 	}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 502c89c..209a4f2 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -193,6 +193,13 @@ static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
 	return rcu_dereference_rtnl(tn->tnode[i]);
 }
 
+static inline struct fib_table *trie_get_table(struct trie *t)
+{
+	unsigned long *tb_data = (unsigned long *)t;
+
+	return container_of(tb_data, struct fib_table, tb_data[0]);
+}
+
 /* To understand this stuff, an understanding of keys and all their bits is
  * necessary. Every node in the trie has a key associated with it, but not
  * all of the bits in that key are significant.
@@ -1587,16 +1594,22 @@ flush_complete:
 	return found;
 }
 
-void fib_free_table(struct fib_table *tb)
+static void __trie_free_rcu(struct rcu_head *head)
 {
+	struct fib_table *tb = container_of(head, struct fib_table, rcu);
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-	struct trie *t = (struct trie *)tb->tb_data;
+	struct trie *t = (struct trie *)tb->data;
 
 	free_percpu(t->stats);
 #endif /* CONFIG_IP_FIB_TRIE_STATS */
 	kfree(tb);
 }
 
+void fib_free_table(struct fib_table *tb)
+{
+	call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
 static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
 			     struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -1633,6 +1646,7 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
 	return skb->len;
 }
 
+/* rcu_read_lock needs to be hold by caller from readside */
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 		   struct netlink_callback *cb)
 {
@@ -1644,15 +1658,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	int count = cb->args[2];
 	t_key key = cb->args[3];
 
-	rcu_read_lock();
-
 	tp = rcu_dereference_rtnl(t->trie);
 
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
 			cb->args[3] = key;
 			cb->args[2] = count;
-			rcu_read_unlock();
 			return -1;
 		}
 
@@ -1667,8 +1678,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 			break;
 	}
 
-	rcu_read_unlock();
-
 	cb->args[3] = key;
 	cb->args[2] = count;
 

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector
  2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
                   ` (5 preceding siblings ...)
  2015-03-02 21:32 ` [net-next PATCH 6/6] fib_trie: Make fib_table rcu safe Alexander Duyck
@ 2015-03-04  5:16 ` David Miller
  2015-03-04 14:51   ` Alexander Duyck
  6 siblings, 1 reply; 11+ messages in thread
From: David Miller @ 2015-03-04  5:16 UTC (permalink / raw)
  To: alexander.h.duyck; +Cc: netdev

From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Mon, 02 Mar 2015 13:32:16 -0800

> This patch series is meant to mostly just clean up the fib_trie to prepare
> it for the introduction of the key_vector.  As such there are a number of
> minor clean-ups such as reformatting the tnode to match the format once the
> key vector is introduced, some optimizations to drop the need for a leaf
> parent pointer, and some changes to remove duplication of effort such as
> the 2 look-ups that were essentially being done per node insertion.

This doesn't compile with trie stats enabled, I see something sneaking
in from the main/local table collapsing patch :-)

net/ipv4/fib_trie.c: In function ‘__trie_free_rcu’:
net/ipv4/fib_trie.c:1601:36: error: ‘struct fib_table’ has no member named ‘data’
  struct trie *t = (struct trie *)tb->data;

Also, two comments:

1) When you go "if (idx >> n->bits)", can n->bits be == 32?  If so, this
   expression is undefined.

2) In your simplification of fib_find_node(), don't keep storing over
   and over again into the on-stack variable '*tp', instead just maintain
   a one behind pointer in a local variable for the parent, and store it
   one time as your exit the function.

Thanks.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector
  2015-03-04  5:16 ` [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector David Miller
@ 2015-03-04 14:51   ` Alexander Duyck
  2015-03-04 17:53     ` David Miller
  0 siblings, 1 reply; 11+ messages in thread
From: Alexander Duyck @ 2015-03-04 14:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


On 03/03/2015 09:16 PM, David Miller wrote:
> From: Alexander Duyck <alexander.h.duyck@redhat.com>
> Date: Mon, 02 Mar 2015 13:32:16 -0800
>
>> This patch series is meant to mostly just clean up the fib_trie to prepare
>> it for the introduction of the key_vector.  As such there are a number of
>> minor clean-ups such as reformatting the tnode to match the format once the
>> key vector is introduced, some optimizations to drop the need for a leaf
>> parent pointer, and some changes to remove duplication of effort such as
>> the 2 look-ups that were essentially being done per node insertion.
> This doesn't compile with trie stats enabled, I see something sneaking
> in from the main/local table collapsing patch :-)
>
> net/ipv4/fib_trie.c: In function ‘__trie_free_rcu’:
> net/ipv4/fib_trie.c:1601:36: error: ‘struct fib_table’ has no member named ‘data’
>    struct trie *t = (struct trie *)tb->data;

Ugh, sorry about that I will fix it this morning.

> Also, two comments:
>
> 1) When you go "if (idx >> n->bits)", can n->bits be == 32?  If so, this
>     expression is undefined.

If bits can be 32 then idx should be an unsigned long which is 64 bits.  
I will double check to make sure that is still the case.

The general idea is if a long is 32 bits then we likely cannot allocate 
a 32 bit tnode since that would more than consume all of the available 
memory in the system.  I can look into adding a comment to that effect 
somewhere.

> 2) In your simplification of fib_find_node(), don't keep storing over
>     and over again into the on-stack variable '*tp', instead just maintain
>     a one behind pointer in a local variable for the parent, and store it
>     one time as your exit the function.
>
> Thanks.

Okay I will update it.  As I recall that is what the compiler ends up 
doing anyway since fib_find_node ends up being inlined leaving the 
parent as just a local variable.

- Alex

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector
  2015-03-04 14:51   ` Alexander Duyck
@ 2015-03-04 17:53     ` David Miller
  2015-03-04 18:22       ` Alexander Duyck
  0 siblings, 1 reply; 11+ messages in thread
From: David Miller @ 2015-03-04 17:53 UTC (permalink / raw)
  To: alexander.h.duyck; +Cc: netdev

From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 04 Mar 2015 06:51:27 -0800

> If bits can be 32 then idx should be an unsigned long which is 64
> bits.

unsigned long is 32-bit on 32-bit platforms

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector
  2015-03-04 17:53     ` David Miller
@ 2015-03-04 18:22       ` Alexander Duyck
  0 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2015-03-04 18:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


On 03/04/2015 09:53 AM, David Miller wrote:
> From: Alexander Duyck <alexander.h.duyck@redhat.com>
> Date: Wed, 04 Mar 2015 06:51:27 -0800
>
>> If bits can be 32 then idx should be an unsigned long which is 64
>> bits.
> unsigned long is 32-bit on 32-bit platforms

Right, but if unsigned long is 32 bits then usually a pointer is as 
well.  As a result we wouldn't be able to access the upper bits of a 
node with bits == 32 since the size of the tnode would be over 4 * 2^32.

The general idea is the vmalloc should fail when we attempt to allocate 
a bits == 32 tnode on a system w/ only 32b longs.  Then again I think we 
will probably cause a memory corruption on a 32b system since we are 
probably overflowing size_t on the allocation if bits ==32.  I'll have 
to take a look as I believe we are using the offsetof macro and I am not 
sure how that handles a 64b address on a 32b system.

- Alex

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2015-03-04 18:22 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-02 21:32 [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector Alexander Duyck
2015-03-02 21:32 ` [net-next PATCH 1/6] fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush Alexander Duyck
2015-03-02 21:32 ` [net-next PATCH 2/6] fib_trie: Fib walk rcu should take a tnode and key instead of a trie and a leaf Alexander Duyck
2015-03-02 21:32 ` [net-next PATCH 3/6] fib_trie: Fib find node should return parent Alexander Duyck
2015-03-02 21:32 ` [net-next PATCH 4/6] fib_trie: Update insert and delete to make use of tp from find_node Alexander Duyck
2015-03-02 21:32 ` [net-next PATCH 5/6] fib_trie: move leaf and tnode to occupy the same spot in the key vector Alexander Duyck
2015-03-02 21:32 ` [net-next PATCH 6/6] fib_trie: Make fib_table rcu safe Alexander Duyck
2015-03-04  5:16 ` [net-next PATCH 0/6] ipv4/fib_trie: Cleanups to prepare for introduction of key vector David Miller
2015-03-04 14:51   ` Alexander Duyck
2015-03-04 17:53     ` David Miller
2015-03-04 18:22       ` Alexander Duyck

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.