netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next RFC] mpls: support for dead routes
@ 2015-10-29 15:49 Roopa Prabhu
  2015-10-29 16:53 ` Robert Shearman
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Roopa Prabhu @ 2015-10-29 15:49 UTC (permalink / raw)
  To: ebiederm, rshearma; +Cc: davem, netdev

From: Roopa Prabhu <roopa@cumulusnetworks.com>

Adds support for both RTNH_F_DEAD and RTNH_F_LINKDOWN flags.
This resembles ipv4 fib code. I also picked fib_rebalance from
ipv4. Enabled weights support for nexthop, just because the
infrastructure is already there.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
I want to get this in before net-next closes as promised.
I have tested it for the dead/linkdown flags. The multipath selection
and hash calculation in the face of dead routes needs some more
work. I am short on cycles this week and thought of getting some 
early feedback. Hence sending this out as RFC. I will continue with some
more testing.  Robert, I am using your hash algo but it needs some more
work with dead routes. If you already have any thoughts on this, i will
take them. thanks!.


 net/mpls/af_mpls.c  | 228 +++++++++++++++++++++++++++++++++++++++++++++-------
 net/mpls/internal.h |   4 +
 2 files changed, 202 insertions(+), 30 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750..7db9678 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -27,6 +27,8 @@
  */
 #define MAX_MP_SELECT_LABELS 4
 
+u32 mpls_multipath_secret __read_mostly;
+
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
 
@@ -96,22 +98,52 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
-static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
-					     struct sk_buff *skb, bool bos)
+static void mpls_multipath_rebalance(struct mpls_route *rt)
+{
+	int total;
+	int w;
+
+	if (rt->rt_nhn < 2)
+		return;
+
+	total = 0;
+	for_nexthops(rt) {
+		if ((nh->nh_flags & RTNH_F_DEAD) ||
+		    (nh->nh_flags & RTNH_F_LINKDOWN))
+			continue;
+
+		total += nh->nh_weight;
+	} endfor_nexthops(rt);
+
+	w = 0;
+	change_nexthops(rt) {
+		int upper_bound;
+
+		if ((nh->nh_flags & RTNH_F_DEAD) ||
+		    (nh->nh_flags & RTNH_F_LINKDOWN)) {
+			upper_bound = -1;
+		} else {
+			w += nh->nh_weight;
+			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+							    total) - 1;
+		}
+
+		atomic_set(&nh->nh_upper_bound, upper_bound);
+	} endfor_nexthops(rt);
+
+	net_get_random_once(&mpls_multipath_secret,
+			    sizeof(mpls_multipath_secret));
+}
+
+static u32 mpls_multipath_hash(struct mpls_route *rt,
+			       struct sk_buff *skb, bool bos)
 {
 	struct mpls_entry_decoded dec;
 	struct mpls_shim_hdr *hdr;
 	bool eli_seen = false;
 	int label_index;
-	int nh_index = 0;
 	u32 hash = 0;
 
-	/* No need to look further into packet if there's only
-	 * one path
-	 */
-	if (rt->rt_nhn == 1)
-		goto out;
-
 	for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
 	     label_index++) {
 		if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
@@ -165,9 +197,29 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 		}
 	}
 
-	nh_index = hash % rt->rt_nhn;
+	return hash;
+}
+
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+					     struct sk_buff *skb, bool bos)
+{
+	u32 hash = 0;
+
+	/* No need to look further into packet if there's only
+	 * one path
+	 */
+	if (rt->rt_nhn == 1)
+		goto out;
+
+	hash = mpls_multipath_hash(rt, skb, bos);
+	for_nexthops(rt) {
+		if (hash > atomic_read(&nh->nh_upper_bound))
+			continue;
+		return nh;
+	} endfor_nexthops(rt);
+
 out:
-	return &rt->rt_nh[nh_index];
+	return &rt->rt_nh[0];
 }
 
 static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
@@ -577,7 +629,7 @@ errout:
 }
 
 static int mpls_nh_build(struct net *net, struct mpls_route *rt,
-			 struct mpls_nh *nh, int oif,
+			 struct mpls_nh *nh, int oif, int hops,
 			 struct nlattr *via, struct nlattr *newdst)
 {
 	int err = -ENOMEM;
@@ -597,6 +649,7 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
 	if (err)
 		goto errout;
 
+	nh->nh_weight = hops + 1;
 	err = mpls_nh_assign_dev(net, rt, nh, oif);
 	if (err)
 		goto errout;
@@ -663,10 +716,9 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
 		if (!rtnh_ok(rtnh, remaining))
 			goto errout;
 
-		/* neither weighted multipath nor any flags
-		 * are supported
+		/* flags are not supported
 		 */
-		if (rtnh->rtnh_hops || rtnh->rtnh_flags)
+		if (rtnh->rtnh_flags)
 			goto errout;
 
 		attrlen = rtnh_attrlen(rtnh);
@@ -681,8 +733,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
 			goto errout;
 
 		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
-				    rtnh->rtnh_ifindex, nla_via,
-				    nla_newdst);
+				    rtnh->rtnh_ifindex, rtnh->rtnh_hops,
+				    nla_via, nla_newdst);
 		if (err)
 			goto errout;
 
@@ -875,34 +927,111 @@ free:
 	return ERR_PTR(err);
 }
 
-static void mpls_ifdown(struct net_device *dev)
+static void mpls_ifdown(struct net_device *dev, int event)
 {
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
-	struct mpls_dev *mdev;
 	unsigned index;
+	int dead;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	for (index = 0; index < net->mpls.platform_labels; index++) {
 		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+		int changed = 0;
+
 		if (!rt)
 			continue;
+		dead = 0;
 		for_nexthops(rt) {
+			if ((event == NETDEV_DOWN &&
+			     (nh->nh_flags & RTNH_F_DEAD)) ||
+			     (event == NETDEV_CHANGE &&
+			     (nh->nh_flags & RTNH_F_LINKDOWN))) {
+				dead++;
+				continue;
+			}
+
 			if (rtnl_dereference(nh->nh_dev) != dev)
 				continue;
-			nh->nh_dev = NULL;
+			switch (event) {
+			case NETDEV_DOWN:
+			case NETDEV_UNREGISTER:
+				nh->nh_flags |= RTNH_F_DEAD;
+				/* fall through */
+			case NETDEV_CHANGE:
+				nh->nh_flags |= RTNH_F_LINKDOWN;
+				changed = 1;
+				break;
+			}
+			if (event == NETDEV_UNREGISTER) {
+				nh->nh_dev = NULL;
+				dead = rt->rt_nhn;
+				changed = 1;
+				break;
+			}
+			dead++;
 		} endfor_nexthops(rt);
+
+		if (dead == rt->rt_nhn) {
+			switch (event) {
+			case NETDEV_DOWN:
+			case NETDEV_UNREGISTER:
+				rt->rt_flags |= RTNH_F_DEAD;
+				/* fall through */
+			case NETDEV_CHANGE:
+				rt->rt_flags |= RTNH_F_LINKDOWN;
+				changed = 1;
+				break;
+			}
+		}
+
+		if (changed)
+			mpls_multipath_rebalance(rt);
 	}
 
-	mdev = mpls_dev_get(dev);
-	if (!mdev)
-		return;
+	return;
+}
+
+static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
+{
+	struct mpls_route __rcu **platform_label;
+	struct net *net = dev_net(dev);
+	unsigned index;
+	int alive;
+
+	platform_label = rtnl_dereference(net->mpls.platform_label);
+	for (index = 0; index < net->mpls.platform_labels; index++) {
+		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+		int changed = 0;
+
+		if (!rt)
+			continue;
+		alive = 0;
+		for_nexthops(rt) {
+			struct net_device *nh_dev =
+				rtnl_dereference(nh->nh_dev);
+
+			if (!(nh->nh_flags & nh_flags)) {
+				alive++;
+				continue;
+			}
+			if (nh_dev != dev)
+				continue;
+			alive++;
+			nh->nh_flags &= ~nh_flags;
+			changed = 1;
+		} endfor_nexthops(rt);
 
-	mpls_dev_sysctl_unregister(mdev);
+		if (alive > 0) {
+			rt->rt_flags &= ~nh_flags;
+			changed = 1;
+		}
 
-	RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+		if (changed)
+			mpls_multipath_rebalance(rt);
+	}
 
-	kfree_rcu(mdev, rcu);
+	return;
 }
 
 static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
@@ -910,9 +1039,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct mpls_dev *mdev;
+	unsigned int flags;
 
-	switch(event) {
-	case NETDEV_REGISTER:
+	if (event == NETDEV_REGISTER) {
 		/* For now just support ethernet devices */
 		if ((dev->type == ARPHRD_ETHER) ||
 		    (dev->type == ARPHRD_LOOPBACK)) {
@@ -920,10 +1049,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 			if (IS_ERR(mdev))
 				return notifier_from_errno(PTR_ERR(mdev));
 		}
-		break;
+		return NOTIFY_OK;
+	}
 
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		return NOTIFY_OK;
+
+	switch (event) {
+	case NETDEV_DOWN:
+		mpls_ifdown(dev, event);
+		break;
+	case NETDEV_UP:
+		flags = dev_get_flags(dev);
+		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
+		else
+			mpls_ifup(dev, RTNH_F_DEAD);
+		break;
+	case NETDEV_CHANGE:
+		flags = dev_get_flags(dev);
+		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
+		else
+			mpls_ifdown(dev, event);
+		break;
 	case NETDEV_UNREGISTER:
-		mpls_ifdown(dev);
+		mpls_ifdown(dev, event);
+		mdev = mpls_dev_get(dev);
+		if (mdev) {
+			mpls_dev_sysctl_unregister(mdev);
+			RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+			kfree_rcu(mdev, rcu);
+		}
 		break;
 	case NETDEV_CHANGENAME:
 		mdev = mpls_dev_get(dev);
@@ -1237,6 +1395,10 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		dev = rtnl_dereference(nh->nh_dev);
 		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
 			goto nla_put_failure;
+		if (nh->nh_flags & RTNH_F_LINKDOWN)
+			rtm->rtm_flags |= RTNH_F_LINKDOWN;
+		if (nh->nh_flags & RTNH_F_DEAD)
+			rtm->rtm_flags |= RTNH_F_DEAD;
 	} else {
 		struct rtnexthop *rtnh;
 		struct nlattr *mp;
@@ -1253,6 +1415,12 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			dev = rtnl_dereference(nh->nh_dev);
 			if (dev)
 				rtnh->rtnh_ifindex = dev->ifindex;
+			if (nh->nh_flags & RTNH_F_LINKDOWN)
+				rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
+			if (nh->nh_flags & RTNH_F_DEAD)
+				rtnh->rtnh_flags |= RTNH_F_DEAD;
+
+			rtnh->rtnh_hops = nh->nh_weight - 1;
 			if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
 							    nh->nh_labels,
 							    nh->nh_label))
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bde52ce..7014032 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -41,6 +41,9 @@ enum mpls_payload_type {
 
 struct mpls_nh { /* next hop label forwarding entry */
 	struct net_device __rcu *nh_dev;
+	unsigned int		nh_flags;
+	int			nh_weight;
+	atomic_t                nh_upper_bound;
 	u32			nh_label[MAX_NEW_LABELS];
 	u8			nh_labels;
 	u8			nh_via_alen;
@@ -70,6 +73,7 @@ struct mpls_nh { /* next hop label forwarding entry */
  */
 struct mpls_route { /* next hop label forwarding entry */
 	struct rcu_head		rt_rcu;
+	unsigned int		rt_flags;
 	u8			rt_protocol;
 	u8			rt_payload_type;
 	u8			rt_max_alen;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-10-29 15:49 [PATCH net-next RFC] mpls: support for dead routes Roopa Prabhu
@ 2015-10-29 16:53 ` Robert Shearman
  2015-10-29 18:46   ` roopa
  2015-11-01 21:27 ` Eric W. Biederman
  2015-11-02 19:29 ` [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete Eric W. Biederman
  2 siblings, 1 reply; 14+ messages in thread
From: Robert Shearman @ 2015-10-29 16:53 UTC (permalink / raw)
  To: Roopa Prabhu, ebiederm; +Cc: davem, netdev

On 29/10/15 15:49, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> Adds support for both RTNH_F_DEAD and RTNH_F_LINKDOWN flags.
> This resembles ipv4 fib code. I also picked fib_rebalance from
> ipv4. Enabled weights support for nexthop, just because the
> infrastructure is already there.
>
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
> I want to get this in before net-next closes as promised.
> I have tested it for the dead/linkdown flags. The multipath selection
> and hash calculation in the face of dead routes needs some more
> work. I am short on cycles this week and thought of getting some
> early feedback. Hence sending this out as RFC. I will continue with some
> more testing.  Robert, I am using your hash algo but it needs some more
> work with dead routes. If you already have any thoughts on this, i will
> take them. thanks!.

If you were to sort the array of nexthops (and by implication via 
addresses) by their non-deadness keeping a count of the alive nexthops, 
then there's no need to resort to an O(n) algorithm for selecting the 
nexthop, and no need to store per-nh flags.

E.g. before eth0 link down:

+----------------------+
| rt_nhn = 3           |
| rt_nhn_alive = 3     |
+----------------------+
| nh 0:                |
| dev = eth0, ...      |
+----------------------+
| nh 1:                |
| dev = eth1, ...      |
+----------------------+
| nh 2:                |
| dev = eth0, ...      |
+----------------------+
| vias ...             |
+----------------------+

after eth0 link down:

+----------------------+
| rt_nhn = 3           |
| rt_nhn_alive = 1     |
+----------------------+
| nh 0:                |
| dev = eth1, ...      |
+----------------------+
| nh 1:                |
| dev = eth0, ...      |
+----------------------+
| nh 2:                |
| dev = eth0, ...      |
+----------------------+
| vias ...             |
+----------------------+

The mpls_select_multipath algorithm just then needs to be changed to use 
rt_nhn_alive instead of rt_nhn and will work otherwise as-is.

On link down you'll need to alloc a new route for RCU-safety, but you 
can presumably just do a kmemdup to reduce the amount of code you have 
to write and sort the nexthops in the copy. Link up will be similar.

Then on the mpls_dump_route, if the index of the nexthop is >= 
rt_nhn_alive then the path is link-down. If the nh_dev is NULL then 
generate RTNH_F_DEAD|RTNH_F_LINKDOWN for the flags, otherwise just 
RTNH_F_LINKDOWN.

This would use less memory and be faster for forwarding.

Thanks,
Rob

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-10-29 16:53 ` Robert Shearman
@ 2015-10-29 18:46   ` roopa
  2015-10-30 15:06     ` Robert Shearman
  0 siblings, 1 reply; 14+ messages in thread
From: roopa @ 2015-10-29 18:46 UTC (permalink / raw)
  To: Robert Shearman; +Cc: ebiederm, davem, netdev

On 10/29/15, 9:53 AM, Robert Shearman wrote:
> On 29/10/15 15:49, Roopa Prabhu wrote:
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> Adds support for both RTNH_F_DEAD and RTNH_F_LINKDOWN flags.
>> This resembles ipv4 fib code. I also picked fib_rebalance from
>> ipv4. Enabled weights support for nexthop, just because the
>> infrastructure is already there.
>>
>> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
>> ---
>> I want to get this in before net-next closes as promised.
>> I have tested it for the dead/linkdown flags. The multipath selection
>> and hash calculation in the face of dead routes needs some more
>> work. I am short on cycles this week and thought of getting some
>> early feedback. Hence sending this out as RFC. I will continue with some
>> more testing.  Robert, I am using your hash algo but it needs some more
>> work with dead routes. If you already have any thoughts on this, i will
>> take them. thanks!.
>
> If you were to sort the array of nexthops (and by implication via addresses) by their non-deadness keeping a count of the alive nexthops, then there's no need to resort to an O(n) algorithm for selecting the nexthop, and no need to store per-nh flags.
>
> E.g. before eth0 link down:
>
> +----------------------+
> | rt_nhn = 3           |
> | rt_nhn_alive = 3     |
> +----------------------+
> | nh 0:                |
> | dev = eth0, ...      |
> +----------------------+
> | nh 1:                |
> | dev = eth1, ...      |
> +----------------------+
> | nh 2:                |
> | dev = eth0, ...      |
> +----------------------+
> | vias ...             |
> +----------------------+
>
> after eth0 link down:
>
> +----------------------+
> | rt_nhn = 3           |
> | rt_nhn_alive = 1     |
> +----------------------+
> | nh 0:                |
> | dev = eth1, ...      |
> +----------------------+
> | nh 1:                |
> | dev = eth0, ...      |
> +----------------------+
> | nh 2:                |
> | dev = eth0, ...      |
> +----------------------+
> | vias ...             |
> +----------------------+
>
> The mpls_select_multipath algorithm just then needs to be changed to use rt_nhn_alive instead of rt_nhn and will work otherwise as-is.
>
> On link down you'll need to alloc a new route for RCU-safety, but you can presumably just do a kmemdup to reduce the amount of code you have to write and sort the nexthops in the copy. Link up will be similar.
You mean sort the nexthops on every link and carrier event ?. I don't see a need for it.
>
> Then on the mpls_dump_route, if the index of the nexthop is >= rt_nhn_alive then the path is link-down. If the nh_dev is NULL then generate RTNH_F_DEAD|RTNH_F_LINKDOWN for the flags, otherwise just RTNH_F_LINKDOWN.
I was not thinking of making nh_dev NULL on RTNH_F_DEAD. And i would prefer to store the RTNH flags instead of deriving them on every dump.
>
> This would use less memory and be faster for forwarding.
Thanks for your inputs Robert. I am not see a huge advantage in sorting the nexthops on link events.
And i will be only saving an 'int' in a nexthop.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-10-29 18:46   ` roopa
@ 2015-10-30 15:06     ` Robert Shearman
  2015-11-01 21:24       ` Eric W. Biederman
  0 siblings, 1 reply; 14+ messages in thread
From: Robert Shearman @ 2015-10-30 15:06 UTC (permalink / raw)
  To: roopa; +Cc: ebiederm, davem, netdev

On 29/10/15 18:46, roopa wrote:
> On 10/29/15, 9:53 AM, Robert Shearman wrote:
>> On 29/10/15 15:49, Roopa Prabhu wrote:
>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>
>>> Adds support for both RTNH_F_DEAD and RTNH_F_LINKDOWN flags.
>>> This resembles ipv4 fib code. I also picked fib_rebalance from
>>> ipv4. Enabled weights support for nexthop, just because the
>>> infrastructure is already there.
>>>
>>> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
>>> ---
>>> I want to get this in before net-next closes as promised.
>>> I have tested it for the dead/linkdown flags. The multipath selection
>>> and hash calculation in the face of dead routes needs some more
>>> work. I am short on cycles this week and thought of getting some
>>> early feedback. Hence sending this out as RFC. I will continue with some
>>> more testing.  Robert, I am using your hash algo but it needs some more
>>> work with dead routes. If you already have any thoughts on this, i will
>>> take them. thanks!.
>>
>> If you were to sort the array of nexthops (and by implication via addresses) by their non-deadness keeping a count of the alive nexthops, then there's no need to resort to an O(n) algorithm for selecting the nexthop, and no need to store per-nh flags.
>>
>> E.g. before eth0 link down:
>>
>> +----------------------+
>> | rt_nhn = 3           |
>> | rt_nhn_alive = 3     |
>> +----------------------+
>> | nh 0:                |
>> | dev = eth0, ...      |
>> +----------------------+
>> | nh 1:                |
>> | dev = eth1, ...      |
>> +----------------------+
>> | nh 2:                |
>> | dev = eth0, ...      |
>> +----------------------+
>> | vias ...             |
>> +----------------------+
>>
>> after eth0 link down:
>>
>> +----------------------+
>> | rt_nhn = 3           |
>> | rt_nhn_alive = 1     |
>> +----------------------+
>> | nh 0:                |
>> | dev = eth1, ...      |
>> +----------------------+
>> | nh 1:                |
>> | dev = eth0, ...      |
>> +----------------------+
>> | nh 2:                |
>> | dev = eth0, ...      |
>> +----------------------+
>> | vias ...             |
>> +----------------------+
>>
>> The mpls_select_multipath algorithm just then needs to be changed to use rt_nhn_alive instead of rt_nhn and will work otherwise as-is.
>>
>> On link down you'll need to alloc a new route for RCU-safety, but you can presumably just do a kmemdup to reduce the amount of code you have to write and sort the nexthops in the copy. Link up will be similar.
> You mean sort the nexthops on every link and carrier event ?. I don't see a need for it.
>>
>> Then on the mpls_dump_route, if the index of the nexthop is >= rt_nhn_alive then the path is link-down. If the nh_dev is NULL then generate RTNH_F_DEAD|RTNH_F_LINKDOWN for the flags, otherwise just RTNH_F_LINKDOWN.
> I was not thinking of making nh_dev NULL on RTNH_F_DEAD. And i would prefer to store the RTNH flags instead of deriving them on every dump.
>>
>> This would use less memory and be faster for forwarding.
> Thanks for your inputs Robert. I am not see a huge advantage in sorting the nexthops on link events.
> And i will be only saving an 'int' in a nexthop.

It avoids the extra 12 bytes per nexthop and it means that you don't 
need to walk through every nexthop in the worst case to select a path 
during forwarding.

Thanks,
Rob

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-10-30 15:06     ` Robert Shearman
@ 2015-11-01 21:24       ` Eric W. Biederman
  0 siblings, 0 replies; 14+ messages in thread
From: Eric W. Biederman @ 2015-11-01 21:24 UTC (permalink / raw)
  To: Robert Shearman; +Cc: roopa, davem, netdev

Robert Shearman <rshearma@brocade.com> writes:

> On 29/10/15 18:46, roopa wrote:
>> On 10/29/15, 9:53 AM, Robert Shearman wrote:
>>> On 29/10/15 15:49, Roopa Prabhu wrote:
>>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>>
>>>> Adds support for both RTNH_F_DEAD and RTNH_F_LINKDOWN flags.
>>>> This resembles ipv4 fib code. I also picked fib_rebalance from
>>>> ipv4. Enabled weights support for nexthop, just because the
>>>> infrastructure is already there.
>>>>
>>>> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>> ---
>>>> I want to get this in before net-next closes as promised.
>>>> I have tested it for the dead/linkdown flags. The multipath selection
>>>> and hash calculation in the face of dead routes needs some more
>>>> work. I am short on cycles this week and thought of getting some
>>>> early feedback. Hence sending this out as RFC. I will continue with some
>>>> more testing.  Robert, I am using your hash algo but it needs some more
>>>> work with dead routes. If you already have any thoughts on this, i will
>>>> take them. thanks!.
>>>
>>> If you were to sort the array of nexthops (and by implication via addresses) by their non-deadness keeping a count of the alive nexthops, then there's no need to resort to an O(n) algorithm for selecting the nexthop, and no need to store per-nh flags.
>>>
>>> E.g. before eth0 link down:
>>>
>>> +----------------------+
>>> | rt_nhn = 3           |
>>> | rt_nhn_alive = 3     |
>>> +----------------------+
>>> | nh 0:                |
>>> | dev = eth0, ...      |
>>> +----------------------+
>>> | nh 1:                |
>>> | dev = eth1, ...      |
>>> +----------------------+
>>> | nh 2:                |
>>> | dev = eth0, ...      |
>>> +----------------------+
>>> | vias ...             |
>>> +----------------------+
>>>
>>> after eth0 link down:
>>>
>>> +----------------------+
>>> | rt_nhn = 3           |
>>> | rt_nhn_alive = 1     |
>>> +----------------------+
>>> | nh 0:                |
>>> | dev = eth1, ...      |
>>> +----------------------+
>>> | nh 1:                |
>>> | dev = eth0, ...      |
>>> +----------------------+
>>> | nh 2:                |
>>> | dev = eth0, ...      |
>>> +----------------------+
>>> | vias ...             |
>>> +----------------------+
>>>
>>> The mpls_select_multipath algorithm just then needs to be changed to use rt_nhn_alive instead of rt_nhn and will work otherwise as-is.
>>>
>>> On link down you'll need to alloc a new route for RCU-safety, but you can presumably just do a kmemdup to reduce the amount of code you have to write and sort the nexthops in the copy. Link up will be similar.
>> You mean sort the nexthops on every link and carrier event ?. I don't see a need for it.
>>>
>>> Then on the mpls_dump_route, if the index of the nexthop is >= rt_nhn_alive then the path is link-down. If the nh_dev is NULL then generate RTNH_F_DEAD|RTNH_F_LINKDOWN for the flags, otherwise just RTNH_F_LINKDOWN.
>> I was not thinking of making nh_dev NULL on RTNH_F_DEAD. And i would prefer to store the RTNH flags instead of deriving them on every dump.
>>>
>>> This would use less memory and be faster for forwarding.
>> Thanks for your inputs Robert. I am not see a huge advantage in sorting the nexthops on link events.
>> And i will be only saving an 'int' in a nexthop.
>
> It avoids the extra 12 bytes per nexthop and it means that you don't
> need to walk through every nexthop in the worst case to select a path
> during forwarding.

And the walk appears both inherent inherent in the notion of weighted
multipath forward.  Always forcing the code to use a O(N) algorithm when
forwarding packets seems unfortunate.

So please for this first round let's get equal cost multipath forwarding
working and then we can consider weighted multipath routing on it's own
merits.

Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-10-29 15:49 [PATCH net-next RFC] mpls: support for dead routes Roopa Prabhu
  2015-10-29 16:53 ` Robert Shearman
@ 2015-11-01 21:27 ` Eric W. Biederman
  2015-11-02  1:01   ` roopa
  2015-11-02 19:29 ` [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete Eric W. Biederman
  2 siblings, 1 reply; 14+ messages in thread
From: Eric W. Biederman @ 2015-11-01 21:27 UTC (permalink / raw)
  To: Roopa Prabhu; +Cc: rshearma, davem, netdev

Roopa Prabhu <roopa@cumulusnetworks.com> writes:

> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index c70d750..7db9678 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -27,6 +27,8 @@
>   */
>  #define MAX_MP_SELECT_LABELS 4
>  
> +u32 mpls_multipath_secret __read_mostly;

This mpls_multipath_secret is never used.

Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-11-01 21:27 ` Eric W. Biederman
@ 2015-11-02  1:01   ` roopa
  2015-11-02  5:08     ` Eric W. Biederman
  0 siblings, 1 reply; 14+ messages in thread
From: roopa @ 2015-11-02  1:01 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: rshearma, davem, netdev

On 11/1/15, 1:27 PM, Eric W. Biederman wrote:
> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>
>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>> index c70d750..7db9678 100644
>> --- a/net/mpls/af_mpls.c
>> +++ b/net/mpls/af_mpls.c
>> @@ -27,6 +27,8 @@
>>   */
>>  #define MAX_MP_SELECT_LABELS 4
>>  
>> +u32 mpls_multipath_secret __read_mostly;
> This mpls_multipath_secret is never used.
>
It is used in mpls_multipath_rebalance, same as fib_rebalance.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-11-02  1:01   ` roopa
@ 2015-11-02  5:08     ` Eric W. Biederman
  2015-11-02 21:18       ` roopa
  0 siblings, 1 reply; 14+ messages in thread
From: Eric W. Biederman @ 2015-11-02  5:08 UTC (permalink / raw)
  To: roopa; +Cc: rshearma, davem, netdev

roopa <roopa@cumulusnetworks.com> writes:

> On 11/1/15, 1:27 PM, Eric W. Biederman wrote:
>> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>>
>>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>>> index c70d750..7db9678 100644
>>> --- a/net/mpls/af_mpls.c
>>> +++ b/net/mpls/af_mpls.c
>>> @@ -27,6 +27,8 @@
>>>   */
>>>  #define MAX_MP_SELECT_LABELS 4
>>>  
>>> +u32 mpls_multipath_secret __read_mostly;
>> This mpls_multipath_secret is never used.
>>
> It is used in mpls_multipath_rebalance, same as fib_rebalance.

fib_multipath_secret is only initialized in fib_rebalalance.
fib_multipath_secret is used used in fib_multipath_hash,
which is called in fib_select_path.

The equivalent code does not exist in your mpls patch.

Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete
  2015-10-29 15:49 [PATCH net-next RFC] mpls: support for dead routes Roopa Prabhu
  2015-10-29 16:53 ` Robert Shearman
  2015-11-01 21:27 ` Eric W. Biederman
@ 2015-11-02 19:29 ` Eric W. Biederman
  2015-11-02 19:49   ` Sergei Shtylyov
  2 siblings, 1 reply; 14+ messages in thread
From: Eric W. Biederman @ 2015-11-02 19:29 UTC (permalink / raw)
  To: David Miller; +Cc: rshearma, netdev, Roopa Prabhu


Currently the multipath code has a nasty failure mode in that it will
fail to notice link down or administrative device down and will
instead black hole packets instead of sending them to their nexthop
destination.

Half the point of multipath is to gracefully handle forwarding path
failures and as the current code does not handle forwarding failures the
current code is dangerous to use.

As mpls multipath has never been exported to userspace and as the
implementation was not complete before the merge window disable the mpls
multipath code by rejecting all multipath configuration requests.  This
will give us another kernel development cycle to cleanly sort out the
issues, without any bad precedents to worry about.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 net/mpls/af_mpls.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750148b6..893cd2dc3979 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 		{
 			cfg->rc_mp = nla_data(nla);
 			cfg->rc_mp_len = nla_len(nla);
+			/* Fail until multipath support is complete */
+			goto errout;
 			break;
 		}
 		default:
-- 
2.2.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete
  2015-11-02 19:29 ` [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete Eric W. Biederman
@ 2015-11-02 19:49   ` Sergei Shtylyov
  2015-11-03  6:09     ` Eric W. Biederman
  0 siblings, 1 reply; 14+ messages in thread
From: Sergei Shtylyov @ 2015-11-02 19:49 UTC (permalink / raw)
  To: Eric W. Biederman, David Miller; +Cc: rshearma, netdev, Roopa Prabhu

Hello.

On 11/02/2015 10:29 PM, Eric W. Biederman wrote:

> Currently the multipath code has a nasty failure mode in that it will
> fail to notice link down or administrative device down and will
> instead black hole packets instead of sending them to their nexthop
> destination.
>
> Half the point of multipath is to gracefully handle forwarding path
> failures and as the current code does not handle forwarding failures the
> current code is dangerous to use.
>
> As mpls multipath has never been exported to userspace and as the
> implementation was not complete before the merge window disable the mpls
> multipath code by rejecting all multipath configuration requests.  This
> will give us another kernel development cycle to cleanly sort out the
> issues, without any bad precedents to worry about.
>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>   net/mpls/af_mpls.c | 2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index c70d750148b6..893cd2dc3979 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>   		{
>   			cfg->rc_mp = nla_data(nla);
>   			cfg->rc_mp_len = nla_len(nla);
> +			/* Fail until multipath support is complete */
> +			goto errout;
>   			break;

    Forgot to delete *break*?

>   		}
>   		default:
>

MBR, Sergei

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next RFC] mpls: support for dead routes
  2015-11-02  5:08     ` Eric W. Biederman
@ 2015-11-02 21:18       ` roopa
  0 siblings, 0 replies; 14+ messages in thread
From: roopa @ 2015-11-02 21:18 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: rshearma, davem, netdev

On 11/1/15, 9:08 PM, Eric W. Biederman wrote:
> roopa <roopa@cumulusnetworks.com> writes:
>
>> On 11/1/15, 1:27 PM, Eric W. Biederman wrote:
>>> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>>>
>>>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>>>> index c70d750..7db9678 100644
>>>> --- a/net/mpls/af_mpls.c
>>>> +++ b/net/mpls/af_mpls.c
>>>> @@ -27,6 +27,8 @@
>>>>   */
>>>>  #define MAX_MP_SELECT_LABELS 4
>>>>  
>>>> +u32 mpls_multipath_secret __read_mostly;
>>> This mpls_multipath_secret is never used.
>>>
>> It is used in mpls_multipath_rebalance, same as fib_rebalance.
> fib_multipath_secret is only initialized in fib_rebalalance.
> fib_multipath_secret is used used in fib_multipath_hash,
> which is called in fib_select_path.
>
> The equivalent code does not exist in your mpls patch.
sorry, missed that. I have posted a new version.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete
  2015-11-02 19:49   ` Sergei Shtylyov
@ 2015-11-03  6:09     ` Eric W. Biederman
  2015-11-03 14:03       ` roopa
  0 siblings, 1 reply; 14+ messages in thread
From: Eric W. Biederman @ 2015-11-03  6:09 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: David Miller, rshearma, netdev, Roopa Prabhu

Sergei Shtylyov <sergei.shtylyov@cogentembedded.com> writes:

> Hello.
>>
>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>> index c70d750148b6..893cd2dc3979 100644
>> --- a/net/mpls/af_mpls.c
>> +++ b/net/mpls/af_mpls.c
>> @@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>>   		{
>>   			cfg->rc_mp = nla_data(nla);
>>   			cfg->rc_mp_len = nla_len(nla);
>> +			/* Fail until multipath support is complete */
>> +			goto errout;
>>   			break;
>
>    Forgot to delete *break*?

Nope.  I did that deliberately, because this code is not supposed to
stay this way for long.

Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete
  2015-11-03  6:09     ` Eric W. Biederman
@ 2015-11-03 14:03       ` roopa
  2015-11-03 15:14         ` Robert Shearman
  0 siblings, 1 reply; 14+ messages in thread
From: roopa @ 2015-11-03 14:03 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Sergei Shtylyov, David Miller, rshearma, netdev

On 11/2/15, 10:09 PM, Eric W. Biederman wrote:
> Sergei Shtylyov <sergei.shtylyov@cogentembedded.com> writes:
>
>> Hello.
>>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>>> index c70d750148b6..893cd2dc3979 100644
>>> --- a/net/mpls/af_mpls.c
>>> +++ b/net/mpls/af_mpls.c
>>> @@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>>>   		{
>>>   			cfg->rc_mp = nla_data(nla);
>>>   			cfg->rc_mp_len = nla_len(nla);
>>> +			/* Fail until multipath support is complete */
>>> +			goto errout;
>>>   			break;
>>    Forgot to delete *break*?
> Nope.  I did that deliberately, because this code is not supposed to
> stay this way for long.
Eric, this is only until dead route support is added correct ?.
If yes, then my v2 on dead route/nh support is out. I know david is only taking bug fixes right now...
and I thought the dead route/nh is a bug fix. If you are ok with v2, we could skip this patch ?.

If david decides to not pick it up in net-next, i was hoping to submit it as a bug-fix to net.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete
  2015-11-03 14:03       ` roopa
@ 2015-11-03 15:14         ` Robert Shearman
  0 siblings, 0 replies; 14+ messages in thread
From: Robert Shearman @ 2015-11-03 15:14 UTC (permalink / raw)
  To: roopa, Eric W. Biederman; +Cc: Sergei Shtylyov, David Miller, netdev

On 03/11/15 14:03, roopa wrote:
> On 11/2/15, 10:09 PM, Eric W. Biederman wrote:
>> Sergei Shtylyov <sergei.shtylyov@cogentembedded.com> writes:
>>
>>> Hello.
>>>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>>>> index c70d750148b6..893cd2dc3979 100644
>>>> --- a/net/mpls/af_mpls.c
>>>> +++ b/net/mpls/af_mpls.c
>>>> @@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>>>>    		{
>>>>    			cfg->rc_mp = nla_data(nla);
>>>>    			cfg->rc_mp_len = nla_len(nla);
>>>> +			/* Fail until multipath support is complete */
>>>> +			goto errout;
>>>>    			break;
>>>     Forgot to delete *break*?
>> Nope.  I did that deliberately, because this code is not supposed to
>> stay this way for long.
> Eric, this is only until dead route support is added correct ?.
> If yes, then my v2 on dead route/nh support is out. I know david is only taking bug fixes right now...
> and I thought the dead route/nh is a bug fix. If you are ok with v2, we could skip this patch ?.

Roopa,

I think we should take our time with the dead nexthop changes, and 
Eric's patch gives us that time.

Thanks,
Rob

>
> If david decides to not pick it up in net-next, i was hoping to submit it as a bug-fix to net.
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-11-03 15:14 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-29 15:49 [PATCH net-next RFC] mpls: support for dead routes Roopa Prabhu
2015-10-29 16:53 ` Robert Shearman
2015-10-29 18:46   ` roopa
2015-10-30 15:06     ` Robert Shearman
2015-11-01 21:24       ` Eric W. Biederman
2015-11-01 21:27 ` Eric W. Biederman
2015-11-02  1:01   ` roopa
2015-11-02  5:08     ` Eric W. Biederman
2015-11-02 21:18       ` roopa
2015-11-02 19:29 ` [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete Eric W. Biederman
2015-11-02 19:49   ` Sergei Shtylyov
2015-11-03  6:09     ` Eric W. Biederman
2015-11-03 14:03       ` roopa
2015-11-03 15:14         ` Robert Shearman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).