From mboxrd@z Thu Jan 1 00:00:00 1970 From: David Ahern Subject: [PATCH RFC net-next 2/7] net/ipv6: Simplify route replace and appending into multipath route Date: Mon, 14 May 2018 19:51:07 -0700 Message-ID: <20180515025112.16647-3-dsahern@gmail.com> References: <20180515025112.16647-1-dsahern@gmail.com> Cc: Thomas.Winter@alliedtelesis.co.nz, idosch@mellanox.com, sharpd@cumulusnetworks.com, roopa@cumulusnetworks.com, David Ahern To: netdev@vger.kernel.org Return-path: Received: from mail-pg0-f68.google.com ([74.125.83.68]:38907 "EHLO mail-pg0-f68.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752238AbeEOCv1 (ORCPT ); Mon, 14 May 2018 22:51:27 -0400 Received: by mail-pg0-f68.google.com with SMTP id n9-v6so6317802pgq.5 for ; Mon, 14 May 2018 19:51:26 -0700 (PDT) In-Reply-To: <20180515025112.16647-1-dsahern@gmail.com> Sender: netdev-owner@vger.kernel.org List-ID: Remove rt6_qualify_for_ecmp which is just guess work. It fails in 2 cases: 1. can not replace a route with a reject route. Existing code appends a new route instead of replacing the existing one. 2. can not have a multipath route where a leg uses a dev only nexthop Existing use cases affected by this change: 1. adding a route with existing prefix and metric using NLM_F_CREATE without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls 'prepend'). Existing code auto-determines that the new nexthop can be appended to an existing route to create a multipath route. This change breaks that by requiring the APPEND flag for the new route to be added to an existing one. Instead the prepend just adds another route entry. 2. route replace. Existing code replaces first matching multipath route if new route is multipath capable and fallback to first matching non-ECMP route (reject or dev only route) in case one isn't available. New behavior replaces first matching route. (Thanks to Ido for spotting this one) Note: Newer iproute2 is needed to display multipath routes with a dev-only nexthop. This is due to a bug in iproute2 and parsing nexthops. Signed-off-by: David Ahern --- include/net/ip6_route.h | 6 -- net/ipv6/ip6_fib.c | 157 ++++++++++++++++++++++-------------------------- net/ipv6/route.c | 3 +- 3 files changed, 73 insertions(+), 93 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4cf1ef935ed9..9e4d0f0aeb6d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) -{ - return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f0a4262a4789..f57ec3bc12d1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -927,19 +927,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - struct fib6_info *iter = NULL; + struct fib6_info *iter = NULL, *match = NULL; struct fib6_info __rcu **ins; - struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int append = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; - bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) + if (append) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -961,13 +961,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { - found++; - break; - } - if (rt_can_ecmp) - fallback_ins = fallback_ins ?: ins; - goto next_iter; + found++; + break; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -982,86 +977,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - /* If we have the same destination and the same metric, - * but not the same gateway, then the route we try to - * add is sibling to this route, increment our counter - * of siblings, and later we will add our route to the - * list. - * Only static routes (which don't have flag - * RTF_EXPIRES) are used for ECMPv6. - * - * To avoid long list, we only had siblings if the - * route have a gateway. - */ - if (rt_can_ecmp && - rt6_qualify_for_ecmp(iter)) - rt->fib6_nsiblings++; + + /* first route that matches */ + if (!match) + match = iter; } if (iter->fib6_metric > rt->fib6_metric) break; -next_iter: ins = &iter->fib6_next; } - if (fallback_ins && !found) { - /* No ECMP-able route found, replace first non-ECMP one */ - ins = fallback_ins; - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - found++; - } - /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (rt->fib6_nsiblings) { - unsigned int fib6_nsiblings; + if (append && match) { struct fib6_info *sibling, *temp_sibling; - /* Find the first route that have the same metric */ - sibling = leaf; - while (sibling) { - if (sibling->fib6_metric == rt->fib6_metric && - rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); - break; - } - sibling = rcu_dereference_protected(sibling->fib6_next, - lockdep_is_held(&rt->fib6_table->tb6_lock)); + if (rt->fib6_flags & RTF_REJECT) { + NL_SET_ERR_MSG(extack, + "Can not append a REJECT route"); + return -EINVAL; + } else if (match->fib6_flags & RTF_REJECT) { + NL_SET_ERR_MSG(extack, + "Can not append to a REJECT route"); + return -EINVAL; } + rt->fib6_nsiblings = match->fib6_nsiblings; + list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); + match->fib6_nsiblings++; + /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ - fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->fib6_siblings, fib6_siblings) { + &match->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); - fib6_nsiblings++; + BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); } - BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); - rt6_multipath_rebalance(temp_sibling); + + rt6_multipath_rebalance(match); } /* * insert node */ if (!replace) { + enum fib_event_type event; + if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); + event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD; + err = call_fib6_entry_notifiers(info->nl_net, event, rt, + extack); if (err) return err; @@ -1079,7 +1055,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, } } else { - int nsiblings; + struct fib6_info *tmp; if (!found) { if (add) @@ -1094,48 +1070,57 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, if (err) return err; + /* if route being replaced has siblings, set tmp to + * last one, otherwise tmp is current route. this is + * used to set fib6_next for new route + */ + if (iter->fib6_nsiblings) + tmp = list_last_entry(&iter->fib6_siblings, + struct fib6_info, + fib6_siblings); + else + tmp = iter; + + /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = iter->fib6_next; + rt->fib6_next = tmp->fib6_next; rcu_assign_pointer(*ins, rt); + if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } - nsiblings = iter->fib6_nsiblings; - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - if (nsiblings) { + /* delete old route */ + rt = iter; + + if (rt->fib6_nsiblings) { + struct fib6_info *tmp; + /* Replacing an ECMP route, remove all siblings */ - ins = &rt->fib6_next; - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - while (iter) { - if (iter->fib6_metric > rt->fib6_metric) - break; - if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->fib6_next; - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; - } else { - ins = &iter->fib6_next; - } - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); + list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, + fib6_siblings) { + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + + rt->fib6_nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } - WARN_ON(nsiblings != 0); } + + WARN_ON(rt->fib6_nsiblings != 0); + + rt->fib6_node = NULL; + fib6_purge_rt(rt, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == rt) + fn->rr_ptr = NULL; + fib6_info_release(rt); } return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index af0416701fb2..f267e320ee2e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3781,7 +3781,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - rt6_qualify_for_ecmp(iter)) + iter->fib6_nsiblings) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4371,6 +4371,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } -- 2.11.0