All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath
@ 2018-01-09 14:40 Ido Schimmel
  2018-01-09 14:40 ` [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops Ido Schimmel
                   ` (5 more replies)
  0 siblings, 6 replies; 24+ messages in thread
From: Ido Schimmel @ 2018-01-09 14:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, dsahern, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw, Ido Schimmel

This set aims to add support for IPv6 non-equal-cost multipath routes.
The first three patches convert multipath selection to use the
hash-threshold method (RFC 2992) instead of modulo-N. The same method is
employed by the IPv4 routing code since commit 0e884c78ee19 ("ipv4: L3
hash-based multipath").

Unlike modulo-N, with hash-threshold only the flows near the region
boundaries are affected when a nexthop is added or removed. In addition,
it allows us to easily add support for non-equal-cost multipath in the
last patch by sizing the different regions according to the provided
weights.

Ido Schimmel (4):
  ipv6: Calculate hash thresholds for IPv6 nexthops
  ipv6: Use a 31-bit multipath hash
  ipv6: Use hash-threshold instead of modulo-N
  ipv6: Add support for non-equal-cost multipath

 include/net/ip6_fib.h   |   2 +
 include/net/ip6_route.h |   7 +++
 net/ipv6/ip6_fib.c      |   8 +--
 net/ipv6/route.c        | 141 +++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 126 insertions(+), 32 deletions(-)

-- 
2.14.3

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
@ 2018-01-09 14:40 ` Ido Schimmel
  2018-01-10  3:43   ` David Ahern
  2018-01-09 14:40 ` [PATCH net-next 2/4] ipv6: Use a 31-bit multipath hash Ido Schimmel
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-01-09 14:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, dsahern, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw, Ido Schimmel

Before we convert IPv6 to use hash-threshold instead of modulo-N, we
first need each nexthop to store its region boundary in the hash
function's output space.

The boundary is calculated by dividing the output space equally between
the different active nexthops. That is, nexthops that are not dead or
linkdown.

The boundaries are rebalanced whenever a nexthop is added or removed to
a multipath route and whenever a nexthop becomes active or inactive.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 include/net/ip6_fib.h   |  1 +
 include/net/ip6_route.h |  7 ++++
 net/ipv6/ip6_fib.c      |  8 ++---
 net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ddf53dd1e948..97cd05d87780 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -149,6 +149,7 @@ struct rt6_info {
 	 */
 	struct list_head		rt6i_siblings;
 	unsigned int			rt6i_nsiblings;
+	atomic_t			rt6i_nh_upper_bound;
 
 	atomic_t			rt6i_ref;
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 34cd3b0c6ded..27d23a65f3cd 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
+{
+	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+	       RTF_GATEWAY;
+}
+
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct net_device *dev,
@@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
+void rt6_multipath_rebalance(struct rt6_info *rt);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index edda5ad3b405..f5b052d8c481 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -796,12 +796,6 @@ static struct fib6_node *fib6_add_1(struct net *net,
 	return ln;
 }
 
-static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
-{
-	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
-}
-
 static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
 {
 	int i;
@@ -991,6 +985,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			rt6i_nsiblings++;
 		}
 		BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+		rt6_multipath_rebalance(temp_sibling);
 	}
 
 	/*
@@ -1667,6 +1662,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 			sibling->rt6i_nsiblings--;
 		rt->rt6i_nsiblings = 0;
 		list_del_init(&rt->rt6i_siblings);
+		rt6_multipath_rebalance(next_sibling);
 	}
 
 	/* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1054b059747f..ced2c9bed10b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3481,6 +3481,99 @@ struct arg_netdev_event {
 	};
 };
 
+static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
+{
+	struct rt6_info *iter;
+	struct fib6_node *fn;
+
+	fn = rcu_dereference_protected(rt->rt6i_node,
+			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	iter = rcu_dereference_protected(fn->leaf,
+			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	while (iter) {
+		if (iter->rt6i_metric == rt->rt6i_metric &&
+		    rt6_qualify_for_ecmp(iter))
+			return iter;
+		iter = rcu_dereference_protected(iter->rt6_next,
+				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	}
+
+	return NULL;
+}
+
+static bool rt6_is_dead(const struct rt6_info *rt)
+{
+	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
+	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
+	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+		return true;
+
+	return false;
+}
+
+static int rt6_multipath_total_weight(const struct rt6_info *rt)
+{
+	struct rt6_info *iter;
+	int total = 0;
+
+	if (!rt6_is_dead(rt))
+		total++;
+
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+		if (!rt6_is_dead(iter))
+			total++;
+	}
+
+	return total;
+}
+
+static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
+{
+	int upper_bound = -1;
+
+	if (!rt6_is_dead(rt)) {
+		(*weight)++;
+		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
+						    total) - 1;
+	}
+	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
+}
+
+static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
+{
+	struct rt6_info *iter;
+	int weight = 0;
+
+	rt6_upper_bound_set(rt, &weight, total);
+
+	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+		rt6_upper_bound_set(iter, &weight, total);
+}
+
+void rt6_multipath_rebalance(struct rt6_info *rt)
+{
+	struct rt6_info *first;
+	int total;
+
+	/* In case the entire multipath route was marked for flushing,
+	 * then there is no need to rebalance upon the removal of every
+	 * sibling route.
+	 */
+	if (!rt->rt6i_nsiblings || rt->should_flush)
+		return;
+
+	/* During lookup routes are evaluated in order, so we need to
+	 * make sure upper bounds are assigned from the first sibling
+	 * onwards.
+	 */
+	first = rt6_multipath_first_sibling(rt);
+	if (WARN_ON_ONCE(!first))
+		return;
+
+	total = rt6_multipath_total_weight(first);
+	rt6_multipath_upper_bound_set(first, total);
+}
+
 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
@@ -3489,6 +3582,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
 		rt->rt6i_nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+		rt6_multipath_rebalance(rt);
 	}
 
 	return 0;
@@ -3588,6 +3682,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
 						   RTNH_F_LINKDOWN);
 			fib6_update_sernum(rt);
+			rt6_multipath_rebalance(rt);
 		}
 		return -2;
 	case NETDEV_CHANGE:
@@ -3595,6 +3690,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+		rt6_multipath_rebalance(rt);
 		break;
 	}
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH net-next 2/4] ipv6: Use a 31-bit multipath hash
  2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
  2018-01-09 14:40 ` [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops Ido Schimmel
@ 2018-01-09 14:40 ` Ido Schimmel
  2018-01-10  3:43   ` David Ahern
  2018-01-09 14:40 ` [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N Ido Schimmel
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-01-09 14:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, dsahern, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw, Ido Schimmel

The hash thresholds assigned to IPv6 nexthops are in the range of
[-1, 2^31 - 1], where a negative value is assigned to nexthops that
should not be considered during multipath selection.

Therefore, in a similar fashion to IPv4, we need to use the upper
31-bits of the multipath hash for multipath selection.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 net/ipv6/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ced2c9bed10b..09e8e10b101d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1833,10 +1833,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
 
 	if (skb) {
 		ip6_multipath_l3_keys(skb, &hash_keys);
-		return flow_hash_from_keys(&hash_keys);
+		return flow_hash_from_keys(&hash_keys) >> 1;
 	}
 
-	return get_hash_from_flowi6(fl6);
+	return get_hash_from_flowi6(fl6) >> 1;
 }
 
 void ip6_route_input(struct sk_buff *skb)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N
  2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
  2018-01-09 14:40 ` [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops Ido Schimmel
  2018-01-09 14:40 ` [PATCH net-next 2/4] ipv6: Use a 31-bit multipath hash Ido Schimmel
@ 2018-01-09 14:40 ` Ido Schimmel
  2018-01-10  3:54   ` David Ahern
  2018-01-09 14:40 ` [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-01-09 14:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, dsahern, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw, Ido Schimmel

Now that each nexthop stores its region boundary in the multipath hash
function's output space, we can use hash-threshold instead of modulo-N
in multipath selection.

This reduces the number of checks we need to perform during lookup, as
dead and linkdown nexthops are assigned a negative region boundary. In
addition, in contrast to modulo-N, only flows near region boundaries are
affected when a nexthop is added or removed.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 net/ipv6/route.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 09e8e10b101d..7837b8c754a3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 					     int strict)
 {
 	struct rt6_info *sibling, *next_sibling;
-	int route_choosen;
 
 	/* We might have already computed the hash for ICMPv6 errors. In such
 	 * case it will always be non-zero. Otherwise now is the time to do it.
@@ -463,28 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	if (!fl6->mp_hash)
 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
 
-	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
-	/* Don't change the route, if route_choosen == 0
-	 * (siblings does not include ourself)
-	 */
-	if (route_choosen)
-		list_for_each_entry_safe(sibling, next_sibling,
-				&match->rt6i_siblings, rt6i_siblings) {
-			route_choosen--;
-			if (route_choosen == 0) {
-				struct inet6_dev *idev = sibling->rt6i_idev;
-
-				if (sibling->rt6i_nh_flags & RTNH_F_DEAD)
-					break;
-				if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
-				    idev->cnf.ignore_routes_with_linkdown)
-					break;
-				if (rt6_score_route(sibling, oif, strict) < 0)
-					break;
-				match = sibling;
-				break;
-			}
-		}
+	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
+		return match;
+
+	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
+				 rt6i_siblings) {
+		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
+			continue;
+		if (rt6_score_route(sibling, oif, strict) < 0)
+			break;
+		match = sibling;
+		break;
+	}
+
 	return match;
 }
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath
  2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
                   ` (2 preceding siblings ...)
  2018-01-09 14:40 ` [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N Ido Schimmel
@ 2018-01-09 14:40 ` Ido Schimmel
  2018-01-10  3:48   ` David Ahern
  2018-01-10  4:38 ` [PATCH net-next 0/4] " David Ahern
  2018-01-10 20:15 ` David Miller
  5 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-01-09 14:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, dsahern, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw, Ido Schimmel

The use of hash-threshold instead of modulo-N makes it trivial to add
support for non-equal-cost multipath.

Instead of dividing the multipath hash function's output space equally
between the nexthops, each nexthop is assigned a region size which is
proportional to its weight.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/route.c      | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 97cd05d87780..34ec321d6a03 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -171,6 +171,7 @@ struct rt6_info {
 	u32				rt6i_metric;
 	u32				rt6i_pmtu;
 	/* more non-fragment space at head required */
+	int				rt6i_nh_weight;
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
 	u8				exception_bucket_flushed:1,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7837b8c754a3..1076ae0ea9d5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2594,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 #endif
 
 	rt->rt6i_metric = cfg->fc_metric;
+	rt->rt6i_nh_weight = 1;
 
 	/* We cannot add true routes via loopback here,
 	   they would result in kernel looping; promote them to reject routes
@@ -3507,11 +3508,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt)
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
-		total++;
+		total += rt->rt6i_nh_weight;
 
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
 		if (!rt6_is_dead(iter))
-			total++;
+			total += iter->rt6i_nh_weight;
 	}
 
 	return total;
@@ -3522,7 +3523,7 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
 	int upper_bound = -1;
 
 	if (!rt6_is_dead(rt)) {
-		(*weight)++;
+		*weight += rt->rt6i_nh_weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
 						    total) - 1;
 	}
@@ -4024,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			goto cleanup;
 		}
 
+		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
+
 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
 		if (err) {
 			dst_release_immediate(&rt->dst);
@@ -4246,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
 	if (!rtnh)
 		goto nla_put_failure;
 
-	rtnh->rtnh_hops = 0;
+	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
 
 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-01-09 14:40 ` [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops Ido Schimmel
@ 2018-01-10  3:43   ` David Ahern
  2018-05-02 16:43     ` Eric Dumazet
  0 siblings, 1 reply; 24+ messages in thread
From: David Ahern @ 2018-01-10  3:43 UTC (permalink / raw)
  To: Ido Schimmel, netdev; +Cc: davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 1/9/18 7:40 AM, Ido Schimmel wrote:
> Before we convert IPv6 to use hash-threshold instead of modulo-N, we
> first need each nexthop to store its region boundary in the hash
> function's output space.
> 
> The boundary is calculated by dividing the output space equally between
> the different active nexthops. That is, nexthops that are not dead or
> linkdown.
> 
> The boundaries are rebalanced whenever a nexthop is added or removed to
> a multipath route and whenever a nexthop becomes active or inactive.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---
>  include/net/ip6_fib.h   |  1 +
>  include/net/ip6_route.h |  7 ++++
>  net/ipv6/ip6_fib.c      |  8 ++---
>  net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 106 insertions(+), 6 deletions(-)
> 

LGTM.
Acked-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 2/4] ipv6: Use a 31-bit multipath hash
  2018-01-09 14:40 ` [PATCH net-next 2/4] ipv6: Use a 31-bit multipath hash Ido Schimmel
@ 2018-01-10  3:43   ` David Ahern
  0 siblings, 0 replies; 24+ messages in thread
From: David Ahern @ 2018-01-10  3:43 UTC (permalink / raw)
  To: Ido Schimmel, netdev; +Cc: davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 1/9/18 7:40 AM, Ido Schimmel wrote:
> The hash thresholds assigned to IPv6 nexthops are in the range of
> [-1, 2^31 - 1], where a negative value is assigned to nexthops that
> should not be considered during multipath selection.
> 
> Therefore, in a similar fashion to IPv4, we need to use the upper
> 31-bits of the multipath hash for multipath selection.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---
>  net/ipv6/route.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 

Acked-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath
  2018-01-09 14:40 ` [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
@ 2018-01-10  3:48   ` David Ahern
  2018-01-10 11:47     ` Ido Schimmel
  0 siblings, 1 reply; 24+ messages in thread
From: David Ahern @ 2018-01-10  3:48 UTC (permalink / raw)
  To: Ido Schimmel, netdev; +Cc: davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 1/9/18 7:40 AM, Ido Schimmel wrote:
> The use of hash-threshold instead of modulo-N makes it trivial to add
> support for non-equal-cost multipath.
> 
> Instead of dividing the multipath hash function's output space equally
> between the nexthops, each nexthop is assigned a region size which is
> proportional to its weight.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---
>  include/net/ip6_fib.h |  1 +
>  net/ipv6/route.c      | 11 +++++++----
>  2 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index 97cd05d87780..34ec321d6a03 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -171,6 +171,7 @@ struct rt6_info {
>  	u32				rt6i_metric;
>  	u32				rt6i_pmtu;
>  	/* more non-fragment space at head required */
> +	int				rt6i_nh_weight;
>  	unsigned short			rt6i_nfheader_len;
>  	u8				rt6i_protocol;
>  	u8				exception_bucket_flushed:1,

Since dst is cacheline aligned there is a hole after rt6i_nh_flags. In
patch 1 you put rt6i_nh_upper_bound in that hole. Putting the weight
there too keeps those variables together as well as using the open space.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N
  2018-01-09 14:40 ` [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N Ido Schimmel
@ 2018-01-10  3:54   ` David Ahern
  2018-01-10 12:02     ` Ido Schimmel
  0 siblings, 1 reply; 24+ messages in thread
From: David Ahern @ 2018-01-10  3:54 UTC (permalink / raw)
  To: Ido Schimmel, netdev; +Cc: davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 1/9/18 7:40 AM, Ido Schimmel wrote:
> Now that each nexthop stores its region boundary in the multipath hash
> function's output space, we can use hash-threshold instead of modulo-N
> in multipath selection.
> 
> This reduces the number of checks we need to perform during lookup, as
> dead and linkdown nexthops are assigned a negative region boundary. In
> addition, in contrast to modulo-N, only flows near region boundaries are
> affected when a nexthop is added or removed.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---
>  net/ipv6/route.c | 36 +++++++++++++-----------------------
>  1 file changed, 13 insertions(+), 23 deletions(-)
> 

Did you consider adding the net->ipv4.sysctl_fib_multipath_use_neigh
check as well? ie., if set, verify neighbor is alive before picking that hop

Acked-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath
  2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
                   ` (3 preceding siblings ...)
  2018-01-09 14:40 ` [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
@ 2018-01-10  4:38 ` David Ahern
  2018-01-10 12:31   ` Ido Schimmel
  2018-01-10 20:15 ` David Miller
  5 siblings, 1 reply; 24+ messages in thread
From: David Ahern @ 2018-01-10  4:38 UTC (permalink / raw)
  To: Ido Schimmel, netdev; +Cc: davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 1/9/18 7:40 AM, Ido Schimmel wrote:
> This set aims to add support for IPv6 non-equal-cost multipath routes.
> The first three patches convert multipath selection to use the
> hash-threshold method (RFC 2992) instead of modulo-N. The same method is
> employed by the IPv4 routing code since commit 0e884c78ee19 ("ipv4: L3
> hash-based multipath").
> 
> Unlike modulo-N, with hash-threshold only the flows near the region
> boundaries are affected when a nexthop is added or removed. In addition,
> it allows us to easily add support for non-equal-cost multipath in the
> last patch by sizing the different regions according to the provided
> weights.
> 
> Ido Schimmel (4):
>   ipv6: Calculate hash thresholds for IPv6 nexthops
>   ipv6: Use a 31-bit multipath hash
>   ipv6: Use hash-threshold instead of modulo-N
>   ipv6: Add support for non-equal-cost multipath
> 
>  include/net/ip6_fib.h   |   2 +
>  include/net/ip6_route.h |   7 +++
>  net/ipv6/ip6_fib.c      |   8 +--
>  net/ipv6/route.c        | 141 +++++++++++++++++++++++++++++++++++++++---------
>  4 files changed, 126 insertions(+), 32 deletions(-)
> 

Also, have you considered support for sysctl_fib_multipath_hash_policy?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath
  2018-01-10  3:48   ` David Ahern
@ 2018-01-10 11:47     ` Ido Schimmel
  2018-01-10 15:53       ` David Ahern
  0 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-01-10 11:47 UTC (permalink / raw)
  To: David Ahern
  Cc: Ido Schimmel, netdev, davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

Hi David,

On Tue, Jan 09, 2018 at 08:48:37PM -0700, David Ahern wrote:
> On 1/9/18 7:40 AM, Ido Schimmel wrote:
> > The use of hash-threshold instead of modulo-N makes it trivial to add
> > support for non-equal-cost multipath.
> > 
> > Instead of dividing the multipath hash function's output space equally
> > between the nexthops, each nexthop is assigned a region size which is
> > proportional to its weight.
> > 
> > Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> > ---
> >  include/net/ip6_fib.h |  1 +
> >  net/ipv6/route.c      | 11 +++++++----
> >  2 files changed, 8 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> > index 97cd05d87780..34ec321d6a03 100644
> > --- a/include/net/ip6_fib.h
> > +++ b/include/net/ip6_fib.h
> > @@ -171,6 +171,7 @@ struct rt6_info {
> >  	u32				rt6i_metric;
> >  	u32				rt6i_pmtu;
> >  	/* more non-fragment space at head required */
> > +	int				rt6i_nh_weight;
> >  	unsigned short			rt6i_nfheader_len;
> >  	u8				rt6i_protocol;
> >  	u8				exception_bucket_flushed:1,
> 
> Since dst is cacheline aligned there is a hole after rt6i_nh_flags. In
> patch 1 you put rt6i_nh_upper_bound in that hole. Putting the weight
> there too keeps those variables together as well as using the open space.

Before patch 1 there's a hole of 4 bytes after rt6i_nh_flags which I use
for rt6i_nh_upper_bound. If I put rt6i_nh_weight there as well, then I
create a 60 bytes hole because the dst needs to be cached aligned.

Since rt6i_nh_weight isn't used in fast-path, I just put it at the end.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N
  2018-01-10  3:54   ` David Ahern
@ 2018-01-10 12:02     ` Ido Schimmel
  0 siblings, 0 replies; 24+ messages in thread
From: Ido Schimmel @ 2018-01-10 12:02 UTC (permalink / raw)
  To: David Ahern
  Cc: Ido Schimmel, netdev, davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On Tue, Jan 09, 2018 at 08:54:00PM -0700, David Ahern wrote:
> Did you consider adding the net->ipv4.sysctl_fib_multipath_use_neigh
> check as well? ie., if set, verify neighbor is alive before picking that hop

No, but I'll add it to my TODO list.

It's what effectively happens in mlxsw. If the neighbour isn't
NUD_VALID, then we don't have a MAC we can use in the adjacency table
and the nexthop isn't considered during multipath selection.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath
  2018-01-10  4:38 ` [PATCH net-next 0/4] " David Ahern
@ 2018-01-10 12:31   ` Ido Schimmel
  0 siblings, 0 replies; 24+ messages in thread
From: Ido Schimmel @ 2018-01-10 12:31 UTC (permalink / raw)
  To: David Ahern
  Cc: Ido Schimmel, netdev, davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On Tue, Jan 09, 2018 at 09:38:14PM -0700, David Ahern wrote:
> Also, have you considered support for sysctl_fib_multipath_hash_policy?

No, but I'm not sure what you expect it control? Note that unlike IPv4,
we have the flow label in the IP header, which we use for the multipath
hash.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath
  2018-01-10 11:47     ` Ido Schimmel
@ 2018-01-10 15:53       ` David Ahern
  0 siblings, 0 replies; 24+ messages in thread
From: David Ahern @ 2018-01-10 15:53 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Ido Schimmel, netdev, davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 1/10/18 4:47 AM, Ido Schimmel wrote:
> Hi David,
> 
> On Tue, Jan 09, 2018 at 08:48:37PM -0700, David Ahern wrote:
>> On 1/9/18 7:40 AM, Ido Schimmel wrote:
>>> The use of hash-threshold instead of modulo-N makes it trivial to add
>>> support for non-equal-cost multipath.
>>>
>>> Instead of dividing the multipath hash function's output space equally
>>> between the nexthops, each nexthop is assigned a region size which is
>>> proportional to its weight.
>>>
>>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>>> ---
>>>  include/net/ip6_fib.h |  1 +
>>>  net/ipv6/route.c      | 11 +++++++----
>>>  2 files changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
>>> index 97cd05d87780..34ec321d6a03 100644
>>> --- a/include/net/ip6_fib.h
>>> +++ b/include/net/ip6_fib.h
>>> @@ -171,6 +171,7 @@ struct rt6_info {
>>>  	u32				rt6i_metric;
>>>  	u32				rt6i_pmtu;
>>>  	/* more non-fragment space at head required */
>>> +	int				rt6i_nh_weight;
>>>  	unsigned short			rt6i_nfheader_len;
>>>  	u8				rt6i_protocol;
>>>  	u8				exception_bucket_flushed:1,
>>
>> Since dst is cacheline aligned there is a hole after rt6i_nh_flags. In
>> patch 1 you put rt6i_nh_upper_bound in that hole. Putting the weight
>> there too keeps those variables together as well as using the open space.
> 
> Before patch 1 there's a hole of 4 bytes after rt6i_nh_flags which I use
> for rt6i_nh_upper_bound. If I put rt6i_nh_weight there as well, then I
> create a 60 bytes hole because the dst needs to be cached aligned.
> 
> Since rt6i_nh_weight isn't used in fast-path, I just put it at the end.
> 

Apparently, I was on a 4.14 branch when I ran pahole to dump the layout
of rt6_info. The patch looks good to me.

Acked-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath
  2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
                   ` (4 preceding siblings ...)
  2018-01-10  4:38 ` [PATCH net-next 0/4] " David Ahern
@ 2018-01-10 20:15 ` David Miller
  5 siblings, 0 replies; 24+ messages in thread
From: David Miller @ 2018-01-10 20:15 UTC (permalink / raw)
  To: idosch; +Cc: netdev, dsahern, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

From: Ido Schimmel <idosch@mellanox.com>
Date: Tue,  9 Jan 2018 16:40:24 +0200

> This set aims to add support for IPv6 non-equal-cost multipath routes.
> The first three patches convert multipath selection to use the
> hash-threshold method (RFC 2992) instead of modulo-N. The same method is
> employed by the IPv4 routing code since commit 0e884c78ee19 ("ipv4: L3
> hash-based multipath").
> 
> Unlike modulo-N, with hash-threshold only the flows near the region
> boundaries are affected when a nexthop is added or removed. In addition,
> it allows us to easily add support for non-equal-cost multipath in the
> last patch by sizing the different regions according to the provided
> weights.

This looks great, series applied, thanks!

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-01-10  3:43   ` David Ahern
@ 2018-05-02 16:43     ` Eric Dumazet
  2018-05-02 17:21       ` Ido Schimmel
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2018-05-02 16:43 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, netdev
  Cc: davem, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw



On 01/09/2018 07:43 PM, David Ahern wrote:
> On 1/9/18 7:40 AM, Ido Schimmel wrote:
>> Before we convert IPv6 to use hash-threshold instead of modulo-N, we
>> first need each nexthop to store its region boundary in the hash
>> function's output space.
>>
>> The boundary is calculated by dividing the output space equally between
>> the different active nexthops. That is, nexthops that are not dead or
>> linkdown.
>>
>> The boundaries are rebalanced whenever a nexthop is added or removed to
>> a multipath route and whenever a nexthop becomes active or inactive.
>>
>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>> ---
>>  include/net/ip6_fib.h   |  1 +
>>  include/net/ip6_route.h |  7 ++++
>>  net/ipv6/ip6_fib.c      |  8 ++---
>>  net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 106 insertions(+), 6 deletions(-)
>>
> 
> LGTM.
> Acked-by: David Ahern <dsahern@gmail.com>
> 

For some reason I have a divide by zero error booting my hosts with latest net tree.

What guarantee do we have that total is not zero when rt6_upper_bound_set() is called ?



[    8.498639] divide error: 0000 [#1] SMP PTI
[    8.503178] gsmi: Log Shutdown Reason 0x03
[    8.507270] Modules linked in: bnx2x mdio
[    8.511276] CPU: 17 PID: 116 Comm: kworker/17:0 Not tainted 4.17.0-smp-DEV #110
[    8.518571] Hardware name: Intel RML,PCH/Ibis_QC_18, BIOS 2.40.0 06/22/2016
[    8.525526] Workqueue: ipv6_addrconf addrconf_dad_work
[    8.530662] RIP: 0010:rt6_multipath_rebalance.part.82+0x1cb/0x1f0
[    8.536752] RSP: 0018:ffffba72867cbbf8 EFLAGS: 00010246
[    8.541966] RAX: 0000000000000000 RBX: 0000000000000025 RCX: ffff9d555ab73180
[    8.549090] RDX: 0000000000000000 RSI: ffff9d4d5a34b1c0 RDI: 0000000000000000
[    8.556212] RBP: ffffba72867cbc00 R08: 0000000000000000 R09: 0000000000000000
[    8.563336] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9d5559f95680
[    8.570457] R13: ffff9d4d5a34b1c0 R14: ffff9d555ab73180 R15: 0000000000000000
[    8.577579] FS:  0000000000000000(0000) GS:ffff9d4d5fc40000(0000) knlGS:0000000000000000
[    8.585654] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    8.591391] CR2: 00007fffe47ff000 CR3: 0000000c39c0a001 CR4: 00000000000606e0
[    8.598515] Call Trace:
[    8.600961]  ? rt6_multipath_rebalance+0x21/0x30
[    8.605579]  fib6_add+0x75f/0xf70
[    8.608899]  ? __wake_up+0x13/0x20
[    8.612303]  ? netlink_broadcast_filtered+0x14c/0x3c0
[    8.617355]  __ip6_ins_rt+0x4c/0x70
[    8.620847]  ip6_ins_rt+0x6e/0xa0
[    8.624157]  __ipv6_ifa_notify+0x226/0x2e0
[    8.628249]  ipv6_ifa_notify+0x2a/0x40
[    8.631999]  addrconf_dad_completed+0x59/0x360
[    8.636438]  addrconf_dad_work+0x11c/0x400
[    8.640536]  ? addrconf_dad_work+0x11c/0x400
[    8.644810]  process_one_work+0x184/0x370
[    8.648820]  ? process_one_work+0x184/0x370
[    8.652996]  worker_thread+0x35/0x3a0
[    8.656654]  kthread+0x121/0x140
[    8.659887]  ? process_one_work+0x370/0x370
[    8.664073]  ? kthread_create_worker_on_cpu+0x70/0x70
[    8.669118]  ret_from_fork+0x35/0x40
[    8.672693] Code: c3 8b b9 38 01 00 00 eb aa 48 63 81 38 01 00 00 89 fa 41 89 f8 c1 ea 1f 01 fa d1 fa 48 63 d2 49 89 c2 48 c1 e0 1f 48 01 d0 31 d2 <49> f7 f0 83 e8 01 48 39 ce 89 81 b4 00 00 00 0f 85 e2 fe ff ff 
[    8.691533] RIP: rt6_multipath_rebalance.part.82+0x1cb/0x1f0 RSP: ffffba72867cbbf8
[    8.699135] ---[ end trace 9ae26819121cdc3a ]---
[    8.703760] Kernel panic - not syncing: Fatal exception in interrupt
[    8.710169] Kernel Offset: 0x3d200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[    8.721256] gsmi: Log Shutdown Reason 0x02
[    8.725357] Rebooting in 10 seconds..

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 16:43     ` Eric Dumazet
@ 2018-05-02 17:21       ` Ido Schimmel
  2018-05-02 17:52         ` Ido Schimmel
  0 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-05-02 17:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Ahern, Ido Schimmel, netdev, davem, roopa, nikolay, pch,
	jkbs, yoshfuji, mlxsw

On Wed, May 02, 2018 at 09:43:50AM -0700, Eric Dumazet wrote:
> 
> 
> On 01/09/2018 07:43 PM, David Ahern wrote:
> > On 1/9/18 7:40 AM, Ido Schimmel wrote:
> >> Before we convert IPv6 to use hash-threshold instead of modulo-N, we
> >> first need each nexthop to store its region boundary in the hash
> >> function's output space.
> >>
> >> The boundary is calculated by dividing the output space equally between
> >> the different active nexthops. That is, nexthops that are not dead or
> >> linkdown.
> >>
> >> The boundaries are rebalanced whenever a nexthop is added or removed to
> >> a multipath route and whenever a nexthop becomes active or inactive.
> >>
> >> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> >> ---
> >>  include/net/ip6_fib.h   |  1 +
> >>  include/net/ip6_route.h |  7 ++++
> >>  net/ipv6/ip6_fib.c      |  8 ++---
> >>  net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  4 files changed, 106 insertions(+), 6 deletions(-)
> >>
> > 
> > LGTM.
> > Acked-by: David Ahern <dsahern@gmail.com>
> > 
> 
> For some reason I have a divide by zero error booting my hosts with latest net tree.
> 
> What guarantee do we have that total is not zero when rt6_upper_bound_set() is called ?

Thanks for the report, Eric. I believe I didn't cover all the cases and
'rt6i_nh_weight' might be 0 is some cases. I'll try to reproduce and
work on a fix.

> 
> 
> 
> [    8.498639] divide error: 0000 [#1] SMP PTI
> [    8.503178] gsmi: Log Shutdown Reason 0x03
> [    8.507270] Modules linked in: bnx2x mdio
> [    8.511276] CPU: 17 PID: 116 Comm: kworker/17:0 Not tainted 4.17.0-smp-DEV #110
> [    8.518571] Hardware name: Intel RML,PCH/Ibis_QC_18, BIOS 2.40.0 06/22/2016
> [    8.525526] Workqueue: ipv6_addrconf addrconf_dad_work
> [    8.530662] RIP: 0010:rt6_multipath_rebalance.part.82+0x1cb/0x1f0
> [    8.536752] RSP: 0018:ffffba72867cbbf8 EFLAGS: 00010246
> [    8.541966] RAX: 0000000000000000 RBX: 0000000000000025 RCX: ffff9d555ab73180
> [    8.549090] RDX: 0000000000000000 RSI: ffff9d4d5a34b1c0 RDI: 0000000000000000
> [    8.556212] RBP: ffffba72867cbc00 R08: 0000000000000000 R09: 0000000000000000
> [    8.563336] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9d5559f95680
> [    8.570457] R13: ffff9d4d5a34b1c0 R14: ffff9d555ab73180 R15: 0000000000000000
> [    8.577579] FS:  0000000000000000(0000) GS:ffff9d4d5fc40000(0000) knlGS:0000000000000000
> [    8.585654] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    8.591391] CR2: 00007fffe47ff000 CR3: 0000000c39c0a001 CR4: 00000000000606e0
> [    8.598515] Call Trace:
> [    8.600961]  ? rt6_multipath_rebalance+0x21/0x30
> [    8.605579]  fib6_add+0x75f/0xf70
> [    8.608899]  ? __wake_up+0x13/0x20
> [    8.612303]  ? netlink_broadcast_filtered+0x14c/0x3c0
> [    8.617355]  __ip6_ins_rt+0x4c/0x70
> [    8.620847]  ip6_ins_rt+0x6e/0xa0
> [    8.624157]  __ipv6_ifa_notify+0x226/0x2e0
> [    8.628249]  ipv6_ifa_notify+0x2a/0x40
> [    8.631999]  addrconf_dad_completed+0x59/0x360
> [    8.636438]  addrconf_dad_work+0x11c/0x400
> [    8.640536]  ? addrconf_dad_work+0x11c/0x400
> [    8.644810]  process_one_work+0x184/0x370
> [    8.648820]  ? process_one_work+0x184/0x370
> [    8.652996]  worker_thread+0x35/0x3a0
> [    8.656654]  kthread+0x121/0x140
> [    8.659887]  ? process_one_work+0x370/0x370
> [    8.664073]  ? kthread_create_worker_on_cpu+0x70/0x70
> [    8.669118]  ret_from_fork+0x35/0x40
> [    8.672693] Code: c3 8b b9 38 01 00 00 eb aa 48 63 81 38 01 00 00 89 fa 41 89 f8 c1 ea 1f 01 fa d1 fa 48 63 d2 49 89 c2 48 c1 e0 1f 48 01 d0 31 d2 <49> f7 f0 83 e8 01 48 39 ce 89 81 b4 00 00 00 0f 85 e2 fe ff ff 
> [    8.691533] RIP: rt6_multipath_rebalance.part.82+0x1cb/0x1f0 RSP: ffffba72867cbbf8
> [    8.699135] ---[ end trace 9ae26819121cdc3a ]---
> [    8.703760] Kernel panic - not syncing: Fatal exception in interrupt
> [    8.710169] Kernel Offset: 0x3d200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> [    8.721256] gsmi: Log Shutdown Reason 0x02
> [    8.725357] Rebooting in 10 seconds..
> 
> 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 17:21       ` Ido Schimmel
@ 2018-05-02 17:52         ` Ido Schimmel
  2018-05-02 18:53           ` Ido Schimmel
  0 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-05-02 17:52 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Ahern, Ido Schimmel, netdev, davem, roopa, nikolay, pch,
	jkbs, yoshfuji, mlxsw

On Wed, May 02, 2018 at 08:21:06PM +0300, Ido Schimmel wrote:
> On Wed, May 02, 2018 at 09:43:50AM -0700, Eric Dumazet wrote:
> > 
> > 
> > On 01/09/2018 07:43 PM, David Ahern wrote:
> > > On 1/9/18 7:40 AM, Ido Schimmel wrote:
> > >> Before we convert IPv6 to use hash-threshold instead of modulo-N, we
> > >> first need each nexthop to store its region boundary in the hash
> > >> function's output space.
> > >>
> > >> The boundary is calculated by dividing the output space equally between
> > >> the different active nexthops. That is, nexthops that are not dead or
> > >> linkdown.
> > >>
> > >> The boundaries are rebalanced whenever a nexthop is added or removed to
> > >> a multipath route and whenever a nexthop becomes active or inactive.
> > >>
> > >> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> > >> ---
> > >>  include/net/ip6_fib.h   |  1 +
> > >>  include/net/ip6_route.h |  7 ++++
> > >>  net/ipv6/ip6_fib.c      |  8 ++---
> > >>  net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >>  4 files changed, 106 insertions(+), 6 deletions(-)
> > >>
> > > 
> > > LGTM.
> > > Acked-by: David Ahern <dsahern@gmail.com>
> > > 
> > 
> > For some reason I have a divide by zero error booting my hosts with latest net tree.
> > 
> > What guarantee do we have that total is not zero when rt6_upper_bound_set() is called ?
> 
> Thanks for the report, Eric. I believe I didn't cover all the cases and
> 'rt6i_nh_weight' might be 0 is some cases. I'll try to reproduce and
> work on a fix.

Hmmm, I think it's due to commit edd7ceb78296 ("ipv6: Allow non-gateway
ECMP for IPv6") which allows routes without a gateway (such as those
configured using slaac) to have siblings.

Can you please check if reverting the patch / applying the below fixes
the issue?

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4d61736c41a..129dd4f4b264 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3606,6 +3606,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	rt->dst.input = ip6_input;
 	rt->dst.output = ip6_output;
 	rt->rt6i_idev = idev;
+	rt->rt6i_nh_weight = 1;
 
 	rt->rt6i_protocol = RTPROT_KERNEL;
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 17:52         ` Ido Schimmel
@ 2018-05-02 18:53           ` Ido Schimmel
  2018-05-02 18:58             ` David Ahern
  0 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-05-02 18:53 UTC (permalink / raw)
  To: Eric Dumazet, Thomas.Winter
  Cc: David Ahern, Ido Schimmel, netdev, davem, roopa, nikolay, pch,
	jkbs, yoshfuji, mlxsw

On Wed, May 02, 2018 at 08:52:44PM +0300, Ido Schimmel wrote:
> On Wed, May 02, 2018 at 08:21:06PM +0300, Ido Schimmel wrote:
> > On Wed, May 02, 2018 at 09:43:50AM -0700, Eric Dumazet wrote:
> > > 
> > > 
> > > On 01/09/2018 07:43 PM, David Ahern wrote:
> > > > On 1/9/18 7:40 AM, Ido Schimmel wrote:
> > > >> Before we convert IPv6 to use hash-threshold instead of modulo-N, we
> > > >> first need each nexthop to store its region boundary in the hash
> > > >> function's output space.
> > > >>
> > > >> The boundary is calculated by dividing the output space equally between
> > > >> the different active nexthops. That is, nexthops that are not dead or
> > > >> linkdown.
> > > >>
> > > >> The boundaries are rebalanced whenever a nexthop is added or removed to
> > > >> a multipath route and whenever a nexthop becomes active or inactive.
> > > >>
> > > >> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> > > >> ---
> > > >>  include/net/ip6_fib.h   |  1 +
> > > >>  include/net/ip6_route.h |  7 ++++
> > > >>  net/ipv6/ip6_fib.c      |  8 ++---
> > > >>  net/ipv6/route.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > >>  4 files changed, 106 insertions(+), 6 deletions(-)
> > > >>
> > > > 
> > > > LGTM.
> > > > Acked-by: David Ahern <dsahern@gmail.com>
> > > > 
> > > 
> > > For some reason I have a divide by zero error booting my hosts with latest net tree.
> > > 
> > > What guarantee do we have that total is not zero when rt6_upper_bound_set() is called ?
> > 
> > Thanks for the report, Eric. I believe I didn't cover all the cases and
> > 'rt6i_nh_weight' might be 0 is some cases. I'll try to reproduce and
> > work on a fix.
> 
> Hmmm, I think it's due to commit edd7ceb78296 ("ipv6: Allow non-gateway
> ECMP for IPv6") which allows routes without a gateway (such as those
> configured using slaac) to have siblings.
> 
> Can you please check if reverting the patch / applying the below fixes
> the issue?

So this fixes the issue for me. To reproduce:

# ip -6 address add 2001:db8::1/64 dev dummy0
# ip -6 address add 2001:db8::1/64 dev dummy1

This reproduces the issue because due to above commit both local routes
are considered siblings... :/

local 2001:db8::1 proto kernel metric 0 
        nexthop dev dummy0 weight 1 
        nexthop dev dummy1 weight 1 pref medium

I think it's best to revert the patch and have Thomas submit a fixed
version to net-next. I was actually surprised to see it applied to net.

> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index f4d61736c41a..129dd4f4b264 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -3606,6 +3606,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
>  	rt->dst.input = ip6_input;
>  	rt->dst.output = ip6_output;
>  	rt->rt6i_idev = idev;
> +	rt->rt6i_nh_weight = 1;
>  
>  	rt->rt6i_protocol = RTPROT_KERNEL;
>  	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 18:53           ` Ido Schimmel
@ 2018-05-02 18:58             ` David Ahern
  2018-05-02 19:04               ` Ido Schimmel
  0 siblings, 1 reply; 24+ messages in thread
From: David Ahern @ 2018-05-02 18:58 UTC (permalink / raw)
  To: Ido Schimmel, Eric Dumazet, Thomas.Winter, davem
  Cc: Ido Schimmel, netdev, roopa, nikolay, pch, jkbs, yoshfuji, mlxsw

On 5/2/18 12:53 PM, Ido Schimmel wrote:
> 
> So this fixes the issue for me. To reproduce:
> 
> # ip -6 address add 2001:db8::1/64 dev dummy0
> # ip -6 address add 2001:db8::1/64 dev dummy1
> 
> This reproduces the issue because due to above commit both local routes
> are considered siblings... :/
> 
> local 2001:db8::1 proto kernel metric 0 
>         nexthop dev dummy0 weight 1 
>         nexthop dev dummy1 weight 1 pref medium
> 
> I think it's best to revert the patch and have Thomas submit a fixed
> version to net-next. I was actually surprised to see it applied to net.

ugly side effect of the way ecmp routes are managed in IPv6. I think
revert is the best option for now.

I need to look into a bug report related to v6 and route replace with
ecmp. I'll take a look at why the above is consolidated as well. Those
should not become an ecmp route.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 18:58             ` David Ahern
@ 2018-05-02 19:04               ` Ido Schimmel
  2018-05-02 20:48                 ` Thomas Winter
  0 siblings, 1 reply; 24+ messages in thread
From: Ido Schimmel @ 2018-05-02 19:04 UTC (permalink / raw)
  To: David Ahern
  Cc: Eric Dumazet, Thomas.Winter, davem, Ido Schimmel, netdev, roopa,
	nikolay, pch, jkbs, yoshfuji, mlxsw

On Wed, May 02, 2018 at 12:58:56PM -0600, David Ahern wrote:
> On 5/2/18 12:53 PM, Ido Schimmel wrote:
> > 
> > So this fixes the issue for me. To reproduce:
> > 
> > # ip -6 address add 2001:db8::1/64 dev dummy0
> > # ip -6 address add 2001:db8::1/64 dev dummy1
> > 
> > This reproduces the issue because due to above commit both local routes
> > are considered siblings... :/
> > 
> > local 2001:db8::1 proto kernel metric 0 
> >         nexthop dev dummy0 weight 1 
> >         nexthop dev dummy1 weight 1 pref medium
> > 
> > I think it's best to revert the patch and have Thomas submit a fixed
> > version to net-next. I was actually surprised to see it applied to net.
> 
> ugly side effect of the way ecmp routes are managed in IPv6. I think
> revert is the best option for now.

OK. I'll send a patch.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 19:04               ` Ido Schimmel
@ 2018-05-02 20:48                 ` Thomas Winter
  2018-05-02 20:56                   ` David Ahern
  0 siblings, 1 reply; 24+ messages in thread
From: Thomas Winter @ 2018-05-02 20:48 UTC (permalink / raw)
  To: Ido Schimmel, David Ahern
  Cc: Eric Dumazet, davem, Ido Schimmel, netdev, roopa, nikolay, pch,
	jkbs, yoshfuji, mlxsw

> On Wed, May 02, 2018 at 12:58:56PM -0600, David Ahern wrote:
> > On 5/2/18 12:53 PM, Ido Schimmel wrote:
> > > 
> > > So this fixes the issue for me. To reproduce:
> > > 
> > > # ip -6 address add 2001:db8::1/64 dev dummy0
> > > # ip -6 address add 2001:db8::1/64 dev dummy1
> > > 
> > > This reproduces the issue because due to above commit both local routes
> > > are considered siblings... :/
> > > 
> > > local 2001:db8::1 proto kernel metric 0 
> > >         nexthop dev dummy0 weight 1 
> > >         nexthop dev dummy1 weight 1 pref medium
> > > 
> > > I think it's best to revert the patch and have Thomas submit a fixed
> > > version to net-next. I was actually surprised to see it applied to net.
> > 
> > ugly side effect of the way ecmp routes are managed in IPv6. I think
> > revert is the best option for now.
> 
> OK. I'll send a patch.

fe80::/64  proto kernel  metric 256 
        nexthop dev vlan1 weight 1
        nexthop dev vlan10 weight 1
        nexthop dev vlan30 weight 1
        nexthop dev tunnel11 weight 1
        nexthop dev tunnel12 weight 1

Sorry I completely missed that, I was always looking at other route tables. 
Should I look at reworking this? It would be great to have these ECMP routes for other purposes.

ip -6 ro show table 601
default  metric 1024 
        nexthop dev tunnel11 weight 1
        nexthop dev tunnel12 weight 1

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 20:48                 ` Thomas Winter
@ 2018-05-02 20:56                   ` David Ahern
  2018-05-04  1:13                     ` David Ahern
  0 siblings, 1 reply; 24+ messages in thread
From: David Ahern @ 2018-05-02 20:56 UTC (permalink / raw)
  To: Thomas Winter, Ido Schimmel
  Cc: Eric Dumazet, davem, Ido Schimmel, netdev, roopa, nikolay, pch,
	jkbs, yoshfuji, mlxsw

On 5/2/18 2:48 PM, Thomas Winter wrote:
> Should I look at reworking this? It would be great to have these ECMP routes for other purposes.

Looking at my IPv6 bug list this change is on it -- allowing ECMP routes
to have a device only hop.

Let me take a look at it at the same time as a few other bugs.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
  2018-05-02 20:56                   ` David Ahern
@ 2018-05-04  1:13                     ` David Ahern
  0 siblings, 0 replies; 24+ messages in thread
From: David Ahern @ 2018-05-04  1:13 UTC (permalink / raw)
  To: Thomas Winter, Ido Schimmel
  Cc: Eric Dumazet, davem, Ido Schimmel, netdev, roopa, nikolay, pch,
	jkbs, yoshfuji, mlxsw

On 5/2/18 2:56 PM, David Ahern wrote:
> On 5/2/18 2:48 PM, Thomas Winter wrote:
>> Should I look at reworking this? It would be great to have these ECMP routes for other purposes.
> 
> Looking at my IPv6 bug list this change is on it -- allowing ECMP routes
> to have a device only hop.
> 
> Let me take a look at it at the same time as a few other bugs.
> 

I see the problem: the multipath code for IPv6 tries to helpful and
auto-determine that a new route can be appended to an existing one --
basically adding another nexthop if it already exists. What it should be
doing is requiring the NLM_F_APPEND to modify an existing route. If the
same prefix and metric comes down and APPEND or REPLACE is not set it
should fail EEXISTS rather than consolidating into an ECMP.

Fixing it to do the right thing will break existing userspace, but as it
stands it prevents dev only nexthops (no gateway) and replace with a
REJECT route ends up adding another route

e.g., ip -6 ro replace unreachable 2001:db8:104::/64

leaves the existing route and adds a new entry which can never be hit:

$ ip -6 ro ls
...
2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 pref medium
unreachable 2001:db8:104::/64 dev lo metric 1024 pref medium
...

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2018-05-04  1:13 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-09 14:40 [PATCH net-next 0/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
2018-01-09 14:40 ` [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops Ido Schimmel
2018-01-10  3:43   ` David Ahern
2018-05-02 16:43     ` Eric Dumazet
2018-05-02 17:21       ` Ido Schimmel
2018-05-02 17:52         ` Ido Schimmel
2018-05-02 18:53           ` Ido Schimmel
2018-05-02 18:58             ` David Ahern
2018-05-02 19:04               ` Ido Schimmel
2018-05-02 20:48                 ` Thomas Winter
2018-05-02 20:56                   ` David Ahern
2018-05-04  1:13                     ` David Ahern
2018-01-09 14:40 ` [PATCH net-next 2/4] ipv6: Use a 31-bit multipath hash Ido Schimmel
2018-01-10  3:43   ` David Ahern
2018-01-09 14:40 ` [PATCH net-next 3/4] ipv6: Use hash-threshold instead of modulo-N Ido Schimmel
2018-01-10  3:54   ` David Ahern
2018-01-10 12:02     ` Ido Schimmel
2018-01-09 14:40 ` [PATCH net-next 4/4] ipv6: Add support for non-equal-cost multipath Ido Schimmel
2018-01-10  3:48   ` David Ahern
2018-01-10 11:47     ` Ido Schimmel
2018-01-10 15:53       ` David Ahern
2018-01-10  4:38 ` [PATCH net-next 0/4] " David Ahern
2018-01-10 12:31   ` Ido Schimmel
2018-01-10 20:15 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.