All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
@ 2017-03-25 17:03 David Ahern
  2017-03-25 17:03 ` [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8 David Ahern
                   ` (6 more replies)
  0 siblings, 7 replies; 20+ messages in thread
From: David Ahern @ 2017-03-25 17:03 UTC (permalink / raw)
  To: netdev; +Cc: roopa, rshearma, ebiederm, David Ahern

Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
memory consumption in check the labels array is moved to the end of mpls_nh
and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
maximum number of labels across all nexthops in a route for LSR and the
number of labels configured for LWT.

The mpls_route layout is changed to:

   +----------------------+
   | mpls_route           |
   +----------------------+
   | mpls_nh 0            |
   +----------------------+
   | alignment padding    |   4 bytes for odd number of labels; 0 for even
   +----------------------+
   | via[rt_max_alen] 0   |
   +----------------------+
   | alignment padding    |   via's aligned on sizeof(unsigned long)
   +----------------------+
   | ...                  |

Meaning the via follows its mpls_nh providing better locality as the
number of labels increases. UDP_RR tests with namespaces shows no impact
to a modest performance increase with this layout for 1 or 2 labels and
1 or 2 nexthops.

The new limit is set to 12 to cover all currently known segment
routing use cases.

David Ahern (4):
  mpls: Convert number of nexthops to u8
  net: mpls: change mpls_route layout
  net: mpls: bump maximum number of labels
  net: mpls: Increase max number of labels for lwt encap

 include/net/mpls_iptunnel.h |   4 +-
 net/mpls/af_mpls.c          | 108 ++++++++++++++++++++++++++------------------
 net/mpls/internal.h         |  52 ++++++++++++++-------
 net/mpls/mpls_iptunnel.c    |  13 ++++--
 4 files changed, 112 insertions(+), 65 deletions(-)

-- 
2.1.4

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
@ 2017-03-25 17:03 ` David Ahern
  2017-03-27  3:11   ` Eric W. Biederman
  2017-03-25 17:03 ` [PATCH net-next 2/4] net: mpls: change mpls_route layout David Ahern
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2017-03-25 17:03 UTC (permalink / raw)
  To: netdev; +Cc: roopa, rshearma, ebiederm, David Ahern

Number of nexthops and number of alive nexthops are tracked using an
unsigned int. A route should never have more than 255 nexthops so
convert both to u8. Update all references and intermediate variables
to consistently use u8 as well.

Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte
hole before the nexthops.

Also, the ACCESS_ONCE is changed to READ_ONCE per checkpatch message.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 net/mpls/af_mpls.c  | 29 ++++++++++++++++++-----------
 net/mpls/internal.h |  4 ++--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index cd8be8d5e4ad..d3dc6c43a1d1 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -192,7 +192,7 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
 static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 					     struct sk_buff *skb)
 {
-	int alive = ACCESS_ONCE(rt->rt_nhn_alive);
+	u8 alive = READ_ONCE(rt->rt_nhn_alive);
 	u32 hash = 0;
 	int nh_index = 0;
 	int n = 0;
@@ -458,7 +458,7 @@ struct mpls_route_config {
 	int			rc_mp_len;
 };
 
-static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
+static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen)
 {
 	u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
 	struct mpls_route *rt;
@@ -736,11 +736,11 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
 	return err;
 }
 
-static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
-			       u8 cfg_via_alen, u8 *max_via_alen)
+static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
+			      u8 cfg_via_alen, u8 *max_via_alen)
 {
-	int nhs = 0;
 	int remaining = len;
+	u8 nhs = 0;
 
 	if (!rtnh) {
 		*max_via_alen = cfg_via_alen;
@@ -765,7 +765,14 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
 						      via_alen);
 		}
 
+		/* number of nexthops is tracked by a u8.
+		 * Check for overflow.
+		 */
+		if (nhs == 255)
+			return 0;
+
 		nhs++;
+
 		rtnh = rtnh_next(rtnh, &remaining);
 	}
 
@@ -779,8 +786,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
 	struct rtnexthop *rtnh = cfg->rc_mp;
 	struct nlattr *nla_via, *nla_newdst;
 	int remaining = cfg->rc_mp_len;
-	int nhs = 0;
 	int err = 0;
+	u8 nhs = 0;
 
 	change_nexthops(rt) {
 		int attrlen;
@@ -834,7 +841,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 	int err = -EINVAL;
 	u8 max_via_alen;
 	unsigned index;
-	int nhs;
+	u8 nhs;
 
 	index = cfg->rc_label;
 
@@ -1299,8 +1306,8 @@ static void mpls_ifdown(struct net_device *dev, int event)
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
 	unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
-	unsigned int alive;
 	unsigned index;
+	u8 alive;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	for (index = 0; index < net->mpls.platform_labels; index++) {
@@ -1339,7 +1346,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
 	unsigned index;
-	int alive;
+	u8 alive;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	for (index = 0; index < net->mpls.platform_labels; index++) {
@@ -1761,8 +1768,8 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	} else {
 		struct rtnexthop *rtnh;
 		struct nlattr *mp;
-		int dead = 0;
-		int linkdown = 0;
+		u8 linkdown = 0;
+		u8 dead = 0;
 
 		mp = nla_nest_start(skb, RTA_MULTIPATH);
 		if (!mp)
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 62928d8fabd1..66f388ba2d49 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -123,8 +123,8 @@ struct mpls_route { /* next hop label forwarding entry */
 	u8			rt_payload_type;
 	u8			rt_max_alen;
 	u8			rt_ttl_propagate;
-	unsigned int		rt_nhn;
-	unsigned int		rt_nhn_alive;
+	u8			rt_nhn;
+	u8			rt_nhn_alive;
 	struct mpls_nh		rt_nh[0];
 };
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH net-next 2/4] net: mpls: change mpls_route layout
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
  2017-03-25 17:03 ` [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8 David Ahern
@ 2017-03-25 17:03 ` David Ahern
  2017-03-28  0:04   ` Eric W. Biederman
  2017-03-25 17:03 ` [PATCH net-next 3/4] net: mpls: bump maximum number of labels David Ahern
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2017-03-25 17:03 UTC (permalink / raw)
  To: netdev; +Cc: roopa, rshearma, ebiederm, David Ahern

Move labels to the end of mpls_nh as a 0-sized array and within mpls_route
move the via for a nexthop after the mpls_nh. The new layout becomes:

   +----------------------+
   | mpls_route           |
   +----------------------+
   | mpls_nh 0            |
   +----------------------+
   | alignment padding    |   4 bytes for odd number of labels; 0 for even
   +----------------------+
   | via[rt_max_alen] 0   |
   +----------------------+
   | alignment padding    |   via's aligned on sizeof(unsigned long)
   +----------------------+
   | ...                  |
   +----------------------+
   | mpls_nh n-1          |
   +----------------------+
   | via[rt_max_alen] n-1 |
   +----------------------+

Memory allocated for nexthop + via is constant across all nexthops and
their via is based on the maximum number of labels across all nexthops and
the maximum via length. The size is saved in the mpls_route as rt_nh_size.
Accessing a nexthop becomes rt->rt_nh + index * rt->rt_nh_size.

The offset of the via address from a nexthop is saved as rt_via_offset
so that given an mpls_nh pointer the via for that hop is simply
nh + rt->rt_via_offset.

With prior code, memory allocated per mpls_route with 1 nexthop:
     via is a h/w address   - 64 bytes
     via is an ipv4 address - 64
     via is an ipv6 address - 72

With this patch set, memory allocated per mpls_route with 1 nexthop and
1 or 2 labels:
     via is a h/w address   - 56 bytes
     via is an ipv4 address - 56
     via is an ipv6 address - 64

The 8-byte reduction is due to the previous patch; the change introduced
by this patch has no impact on the size of allocations for 1 or 2 labels.

Performance impact of this change was examined using network namespaces
with veth pairs connecting namespaces. ns0 inserts the packet to the
label-switched path using an lwt route with encap mpls. ns1 adds 1 or 2
labels depending on test, ns2 (and ns3 for 2-label test) pops the label
and forwards. ns3 (or ns4) for a 2-label is the destination. Similar
series of namespaces used for 2-nexthop test.

Intent is to measure changes to latency (overhead in manipulating the
packet) in the forwarding path. Tests used netperf with UDP_RR.

IPv4:                     current   patches
   1 label, 1 nexthop      29908     30115
   2 label, 1 nexthop      29071     29612
   1 label, 2 nexthop      29582     29776
   2 label, 2 nexthop      29086     29149

IPv6:                     current   patches
   1 label, 1 nexthop      24502     24960
   2 label, 1 nexthop      24041     24407
   1 label, 2 nexthop      23795     23899
   2 label, 2 nexthop      23074     22959

In short, the change has no effect to a modest increase in performance.
This is expected since this patch does not really have an impact on routes
with 1 or 2 labels (the current limit) and 1 or 2 nexthops.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 net/mpls/af_mpls.c  | 37 +++++++++++++++++++++----------------
 net/mpls/internal.h | 43 +++++++++++++++++++++++++++++--------------
 2 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index d3dc6c43a1d1..e402e5148765 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -24,6 +24,8 @@
 #include <net/nexthop.h>
 #include "internal.h"
 
+#define MAX_NEW_LABELS 2
+
 /* Maximum number of labels to look ahead at when selecting a path of
  * a multipath route
  */
@@ -60,10 +62,7 @@ EXPORT_SYMBOL_GPL(mpls_output_possible);
 
 static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh)
 {
-	u8 *nh0_via = PTR_ALIGN((u8 *)&rt->rt_nh[rt->rt_nhn], VIA_ALEN_ALIGN);
-	int nh_index = nh - rt->rt_nh;
-
-	return nh0_via + rt->rt_max_alen * nh_index;
+	return (u8 *)nh + rt->rt_via_offset;
 }
 
 static const u8 *mpls_nh_via(const struct mpls_route *rt,
@@ -189,6 +188,11 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
 	return hash;
 }
 
+static struct mpls_nh *mpls_get_nexthop(struct mpls_route *rt, u8 index)
+{
+	return (struct mpls_nh *)((u8 *)rt->rt_nh + index * rt->rt_nh_size);
+}
+
 static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 					     struct sk_buff *skb)
 {
@@ -201,7 +205,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 	 * one path
 	 */
 	if (rt->rt_nhn == 1)
-		goto out;
+		return rt->rt_nh;
 
 	if (alive <= 0)
 		return NULL;
@@ -219,7 +223,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 	} endfor_nexthops(rt);
 
 out:
-	return &rt->rt_nh[nh_index];
+	return mpls_get_nexthop(rt, nh_index);
 }
 
 static bool mpls_egress(struct net *net, struct mpls_route *rt,
@@ -458,19 +462,20 @@ struct mpls_route_config {
 	int			rc_mp_len;
 };
 
-static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen)
+/* all nexthops within a route have the same size based on max
+ * number of labels and max via length for a hop
+ */
+static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels)
 {
-	u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
+	u8 nh_size = MPLS_NH_SIZE(max_labels, max_alen);
 	struct mpls_route *rt;
 
-	rt = kzalloc(ALIGN(sizeof(*rt) + num_nh * sizeof(*rt->rt_nh),
-			   VIA_ALEN_ALIGN) +
-		     num_nh * max_alen_aligned,
-		     GFP_KERNEL);
+	rt = kzalloc(sizeof(*rt) + num_nh * nh_size, GFP_KERNEL);
 	if (rt) {
 		rt->rt_nhn = num_nh;
 		rt->rt_nhn_alive = num_nh;
-		rt->rt_max_alen = max_alen_aligned;
+		rt->rt_nh_size = nh_size;
+		rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels);
 	}
 
 	return rt;
@@ -885,7 +890,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 		goto errout;
 
 	err = -ENOMEM;
-	rt = mpls_rt_alloc(nhs, max_via_alen);
+	rt = mpls_rt_alloc(nhs, max_via_alen, MAX_NEW_LABELS);
 	if (!rt)
 		goto errout;
 
@@ -1936,7 +1941,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 	/* In case the predefined labels need to be populated */
 	if (limit > MPLS_LABEL_IPV4NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt0 = mpls_rt_alloc(1, lo->addr_len);
+		rt0 = mpls_rt_alloc(1, lo->addr_len, MAX_NEW_LABELS);
 		if (!rt0)
 			goto nort0;
 		RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo);
@@ -1950,7 +1955,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 	}
 	if (limit > MPLS_LABEL_IPV6NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt2 = mpls_rt_alloc(1, lo->addr_len);
+		rt2 = mpls_rt_alloc(1, lo->addr_len, MAX_NEW_LABELS);
 		if (!rt2)
 			goto nort2;
 		RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 66f388ba2d49..302d48f54b57 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -64,7 +64,6 @@ struct mpls_dev {
 struct sk_buff;
 
 #define LABEL_NOT_SPECIFIED (1 << 20)
-#define MAX_NEW_LABELS 2
 
 /* This maximum ha length copied from the definition of struct neighbour */
 #define VIA_ALEN_ALIGN sizeof(unsigned long)
@@ -84,12 +83,25 @@ enum mpls_payload_type {
 struct mpls_nh { /* next hop label forwarding entry */
 	struct net_device __rcu *nh_dev;
 	unsigned int		nh_flags;
-	u32			nh_label[MAX_NEW_LABELS];
 	u8			nh_labels;
 	u8			nh_via_alen;
 	u8			nh_via_table;
+	/* u8 hole */
+	u32			nh_label[0];
 };
 
+/* offset of via from beginning of mpls_nh */
+#define MPLS_NH_VIA_OFF(num_labels) \
+		ALIGN(sizeof(struct mpls_nh) + (num_labels) * sizeof(u32), \
+		      VIA_ALEN_ALIGN)
+
+/* all nexthops within a route have the same size based on the
+ * max number of labels and max via length across all nexthops
+ */
+#define MPLS_NH_SIZE(num_labels, max_via_alen)		\
+		(MPLS_NH_VIA_OFF((num_labels)) +	\
+		ALIGN((max_via_alen), VIA_ALEN_ALIGN))
+
 enum mpls_ttl_propagation {
 	MPLS_TTL_PROP_DEFAULT,
 	MPLS_TTL_PROP_ENABLED,
@@ -104,16 +116,16 @@ enum mpls_ttl_propagation {
  * +----------------------+
  * | mpls_nh 0            |
  * +----------------------+
- * | ...                  |
- * +----------------------+
- * | mpls_nh n-1          |
- * +----------------------+
- * | alignment padding    |
+ * | alignment padding    |   4 bytes for odd number of labels
  * +----------------------+
  * | via[rt_max_alen] 0   |
  * +----------------------+
+ * | alignment padding    |   via's aligned on sizeof(unsigned long)
+ * +----------------------+
  * | ...                  |
  * +----------------------+
+ * | mpls_nh n-1          |
+ * +----------------------+
  * | via[rt_max_alen] n-1 |
  * +----------------------+
  */
@@ -121,24 +133,27 @@ struct mpls_route { /* next hop label forwarding entry */
 	struct rcu_head		rt_rcu;
 	u8			rt_protocol;
 	u8			rt_payload_type;
-	u8			rt_max_alen;
 	u8			rt_ttl_propagate;
 	u8			rt_nhn;
 	u8			rt_nhn_alive;
+	u8			rt_nh_size;
+	u8			rt_via_offset;
+	/* u8 hole */
 	struct mpls_nh		rt_nh[0];
 };
 
 #define for_nexthops(rt) {						\
-	int nhsel; struct mpls_nh *nh;			\
-	for (nhsel = 0, nh = (rt)->rt_nh;				\
+	int nhsel; struct mpls_nh *nh;	u8 *__nh;			\
+	for (nhsel = 0, nh = (rt)->rt_nh, __nh = (u8 *)((rt)->rt_nh);	\
 	     nhsel < (rt)->rt_nhn;					\
-	     nh++, nhsel++)
+	     __nh += rt->rt_nh_size, nh = (struct mpls_nh *)__nh, nhsel++)
 
 #define change_nexthops(rt) {						\
-	int nhsel; struct mpls_nh *nh;				\
-	for (nhsel = 0,	nh = (struct mpls_nh *)((rt)->rt_nh);	\
+	int nhsel; struct mpls_nh *nh; u8 *__nh;			\
+	for (nhsel = 0,	nh = (struct mpls_nh *)((rt)->rt_nh),		\
+			__nh = (u8 *)((rt)->rt_nh);			\
 	     nhsel < (rt)->rt_nhn;					\
-	     nh++, nhsel++)
+	     __nh += rt->rt_nh_size, nh = (struct mpls_nh *)__nh, nhsel++)
 
 #define endfor_nexthops(rt) }
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH net-next 3/4] net: mpls: bump maximum number of labels
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
  2017-03-25 17:03 ` [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8 David Ahern
  2017-03-25 17:03 ` [PATCH net-next 2/4] net: mpls: change mpls_route layout David Ahern
@ 2017-03-25 17:03 ` David Ahern
  2017-03-25 17:03 ` [PATCH net-next 4/4] net: mpls: Increase max number of labels for lwt encap David Ahern
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: David Ahern @ 2017-03-25 17:03 UTC (permalink / raw)
  To: netdev; +Cc: roopa, rshearma, ebiederm, David Ahern

Allow users to push down more labels per MPLS route. With the previous
patch no memory allocations for routes are based on MAX_NEW_LABELS, so
the limit is only used to keep userspace in check.

Per discussions, 16 labels has been deemed excessive, 8 seems to handle
most use cases, though 1 user has expressed an interest in up to 12
labels. Since the limit is only used to cap what userspace can push
down to the kernel, set the new limit to 12 which should accommodate all
known segment routing use cases.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 net/mpls/af_mpls.c | 59 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index e402e5148765..3023c261a04b 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -24,7 +24,10 @@
 #include <net/nexthop.h>
 #include "internal.h"
 
-#define MAX_NEW_LABELS 2
+/* put a reasonable limit on the number of labels
+ * we will accept from userspace
+ */
+#define MAX_NEW_LABELS 12
 
 /* Maximum number of labels to look ahead at when selecting a path of
  * a multipath route
@@ -681,9 +684,6 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
 		return -ENOMEM;
 
 	err = -EINVAL;
-	/* Ensure only a supported number of labels are present */
-	if (cfg->rc_output_labels > MAX_NEW_LABELS)
-		goto errout;
 
 	nh->nh_labels = cfg->rc_output_labels;
 	for (i = 0; i < nh->nh_labels; i++)
@@ -708,7 +708,7 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
 
 static int mpls_nh_build(struct net *net, struct mpls_route *rt,
 			 struct mpls_nh *nh, int oif, struct nlattr *via,
-			 struct nlattr *newdst)
+			 struct nlattr *newdst, u8 max_labels)
 {
 	int err = -ENOMEM;
 
@@ -716,7 +716,7 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
 		goto errout;
 
 	if (newdst) {
-		err = nla_get_labels(newdst, MAX_NEW_LABELS,
+		err = nla_get_labels(newdst, max_labels,
 				     &nh->nh_labels, nh->nh_label);
 		if (err)
 			goto errout;
@@ -742,21 +742,19 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
 }
 
 static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
-			      u8 cfg_via_alen, u8 *max_via_alen)
+			      u8 cfg_via_alen, u8 *max_via_alen,
+			      u8 *max_labels)
 {
 	int remaining = len;
 	u8 nhs = 0;
 
-	if (!rtnh) {
-		*max_via_alen = cfg_via_alen;
-		return 1;
-	}
-
 	*max_via_alen = 0;
+	*max_labels = 0;
 
 	while (rtnh_ok(rtnh, remaining)) {
 		struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 		int attrlen;
+		u8 n_labels;
 
 		attrlen = rtnh_attrlen(rtnh);
 		nla = nla_find(attrs, attrlen, RTA_VIA);
@@ -770,6 +768,12 @@ static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
 						      via_alen);
 		}
 
+		nla = nla_find(attrs, attrlen, RTA_NEWDST);
+		if (nla &&
+		    nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, NULL) == 0) {
+			*max_labels = max_t(u8, *max_labels, n_labels);
+		}
+
 		/* number of nexthops is tracked by a u8.
 		 * Check for overflow.
 		 */
@@ -786,7 +790,7 @@ static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
 }
 
 static int mpls_nh_build_multi(struct mpls_route_config *cfg,
-			       struct mpls_route *rt)
+			       struct mpls_route *rt, u8 max_labels)
 {
 	struct rtnexthop *rtnh = cfg->rc_mp;
 	struct nlattr *nla_via, *nla_newdst;
@@ -819,7 +823,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
 		}
 
 		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
-				    rtnh->rtnh_ifindex, nla_via, nla_newdst);
+				    rtnh->rtnh_ifindex, nla_via, nla_newdst,
+				    max_labels);
 		if (err)
 			goto errout;
 
@@ -846,6 +851,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 	int err = -EINVAL;
 	u8 max_via_alen;
 	unsigned index;
+	u8 max_labels;
 	u8 nhs;
 
 	index = cfg->rc_label;
@@ -884,13 +890,21 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 		goto errout;
 
 	err = -EINVAL;
-	nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len,
-				  cfg->rc_via_alen, &max_via_alen);
-	if (nhs == 0)
+	if (cfg->rc_mp) {
+		nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len,
+					  cfg->rc_via_alen, &max_via_alen,
+					  &max_labels);
+	} else {
+		max_via_alen = cfg->rc_via_alen;
+		max_labels = cfg->rc_output_labels;
+		nhs = 1;
+	}
+
+	if (nhs == 0 || max_labels > MAX_NEW_LABELS)
 		goto errout;
 
 	err = -ENOMEM;
-	rt = mpls_rt_alloc(nhs, max_via_alen, MAX_NEW_LABELS);
+	rt = mpls_rt_alloc(nhs, max_via_alen, max_labels);
 	if (!rt)
 		goto errout;
 
@@ -899,7 +913,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 	rt->rt_ttl_propagate = cfg->rc_ttl_propagate;
 
 	if (cfg->rc_mp)
-		err = mpls_nh_build_multi(cfg, rt);
+		err = mpls_nh_build_multi(cfg, rt, max_labels);
 	else
 		err = mpls_nh_build_from_cfg(cfg, rt);
 	if (err)
@@ -1534,7 +1548,8 @@ int nla_get_labels(const struct nlattr *nla,
 			return -EINVAL;
 		}
 
-		label[i] = dec.label;
+		if (label)
+			label[i] = dec.label;
 	}
 	*labels = nla_labels;
 	return 0;
@@ -1941,7 +1956,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 	/* In case the predefined labels need to be populated */
 	if (limit > MPLS_LABEL_IPV4NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt0 = mpls_rt_alloc(1, lo->addr_len, MAX_NEW_LABELS);
+		rt0 = mpls_rt_alloc(1, lo->addr_len, 0);
 		if (!rt0)
 			goto nort0;
 		RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo);
@@ -1955,7 +1970,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 	}
 	if (limit > MPLS_LABEL_IPV6NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt2 = mpls_rt_alloc(1, lo->addr_len, MAX_NEW_LABELS);
+		rt2 = mpls_rt_alloc(1, lo->addr_len, 0);
 		if (!rt2)
 			goto nort2;
 		RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo);
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH net-next 4/4] net: mpls: Increase max number of labels for lwt encap
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
                   ` (2 preceding siblings ...)
  2017-03-25 17:03 ` [PATCH net-next 3/4] net: mpls: bump maximum number of labels David Ahern
@ 2017-03-25 17:03 ` David Ahern
  2017-03-25 19:15 ` [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route Eric W. Biederman
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: David Ahern @ 2017-03-25 17:03 UTC (permalink / raw)
  To: netdev; +Cc: roopa, rshearma, ebiederm, David Ahern

Allow users to push down more labels per MPLS encap. Same logic as LSR
use case, so re-use the maximum number of labels.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/net/mpls_iptunnel.h |  4 +---
 net/mpls/af_mpls.c          |  5 -----
 net/mpls/internal.h         |  5 +++++
 net/mpls/mpls_iptunnel.c    | 13 ++++++++++---
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index a18af6a16eb5..038fa9985521 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -14,13 +14,11 @@
 #ifndef _NET_MPLS_IPTUNNEL_H
 #define _NET_MPLS_IPTUNNEL_H 1
 
-#define MAX_NEW_LABELS 2
-
 struct mpls_iptunnel_encap {
-	u32	label[MAX_NEW_LABELS];
 	u8	labels;
 	u8	ttl_propagate;
 	u8	default_ttl;
+	u32	label[0];
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 3023c261a04b..0ced87a51326 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -24,11 +24,6 @@
 #include <net/nexthop.h>
 #include "internal.h"
 
-/* put a reasonable limit on the number of labels
- * we will accept from userspace
- */
-#define MAX_NEW_LABELS 12
-
 /* Maximum number of labels to look ahead at when selecting a path of
  * a multipath route
  */
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 302d48f54b57..d43f8bc52653 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -2,6 +2,11 @@
 #define MPLS_INTERNAL_H
 #include <net/mpls.h>
 
+/* put a reasonable limit on the number of labels
+ * we will accept from userspace
+ */
+#define MAX_NEW_LABELS 12
+
 struct mpls_entry_decoded {
 	u32 label;
 	u8 ttl;
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 22f71fce0bfb..fe00e98667cf 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -164,6 +164,7 @@ static int mpls_build_state(struct nlattr *nla,
 	struct mpls_iptunnel_encap *tun_encap_info;
 	struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
 	struct lwtunnel_state *newts;
+	u8 n_labels;
 	int ret;
 
 	ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
@@ -175,12 +176,18 @@ static int mpls_build_state(struct nlattr *nla,
 		return -EINVAL;
 
 
-	newts = lwtunnel_state_alloc(sizeof(*tun_encap_info));
+	/* determine number of labels */
+	if (nla_get_labels(tb[MPLS_IPTUNNEL_DST],
+			   MAX_NEW_LABELS, &n_labels, NULL))
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*tun_encap_info) +
+				     n_labels * sizeof(u32));
 	if (!newts)
 		return -ENOMEM;
 
 	tun_encap_info = mpls_lwtunnel_encap(newts);
-	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels,
 			     &tun_encap_info->labels, tun_encap_info->label);
 	if (ret)
 		goto errout;
@@ -257,7 +264,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
 	    a_hdr->default_ttl != b_hdr->default_ttl)
 		return 1;
 
-	for (l = 0; l < MAX_NEW_LABELS; l++)
+	for (l = 0; l < a_hdr->labels; l++)
 		if (a_hdr->label[l] != b_hdr->label[l])
 			return 1;
 	return 0;
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
                   ` (3 preceding siblings ...)
  2017-03-25 17:03 ` [PATCH net-next 4/4] net: mpls: Increase max number of labels for lwt encap David Ahern
@ 2017-03-25 19:15 ` Eric W. Biederman
  2017-03-27 10:39   ` Robert Shearman
  2017-03-27 22:52 ` David Miller
  2017-03-28  9:59 ` Robert Shearman
  6 siblings, 1 reply; 20+ messages in thread
From: Eric W. Biederman @ 2017-03-25 19:15 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, roopa, rshearma

David Ahern <dsa@cumulusnetworks.com> writes:

> Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
> memory consumption in check the labels array is moved to the end of mpls_nh
> and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
> maximum number of labels across all nexthops in a route for LSR and the
> number of labels configured for LWT.
>
> The mpls_route layout is changed to:
>
>    +----------------------+
>    | mpls_route           |
>    +----------------------+
>    | mpls_nh 0            |
>    +----------------------+
>    | alignment padding    |   4 bytes for odd number of labels; 0 for even
>    +----------------------+
>    | via[rt_max_alen] 0   |
>    +----------------------+
>    | alignment padding    |   via's aligned on sizeof(unsigned long)
>    +----------------------+
>    | ...                  |
>
> Meaning the via follows its mpls_nh providing better locality as the
> number of labels increases. UDP_RR tests with namespaces shows no impact
> to a modest performance increase with this layout for 1 or 2 labels and
> 1 or 2 nexthops.
>
> The new limit is set to 12 to cover all currently known segment
> routing use cases.

How does this compare with running the packet a couple of times through
the mpls table to get all of the desired labels applied?

I can certainly see the case in an mpls tunnel ingress where this
could be desirable.    Which is something you implement in your last
patch.  However is it at all common to push lots of labels at once
during routing?

I am probably a bit naive but it seems absurd to push more
than a handful of labels onto a packet as you are routing it.

Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8
  2017-03-25 17:03 ` [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8 David Ahern
@ 2017-03-27  3:11   ` Eric W. Biederman
  2017-03-27 14:43     ` David Ahern
  0 siblings, 1 reply; 20+ messages in thread
From: Eric W. Biederman @ 2017-03-27  3:11 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, roopa, rshearma

David Ahern <dsa@cumulusnetworks.com> writes:

> Number of nexthops and number of alive nexthops are tracked using an
> unsigned int. A route should never have more than 255 nexthops so
> convert both to u8. Update all references and intermediate variables
> to consistently use u8 as well.
>
> Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte
> hole before the nexthops.
>
> Also, the ACCESS_ONCE is changed to READ_ONCE per checkpatch message.

I don't like this.  Byte writes don't exist on all architectures.

So while I think always writing to rtn_nhn_alive under the
rtn_lock ensures that we don't have wrong values written
it is quite subtle.  And I don't know how this will interact with other
fields that you are introducing.

AKA this might be ok, but I expect this formulation of the code
will easily bit-rot and break.

Eric

> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
> ---
>  net/mpls/af_mpls.c  | 29 ++++++++++++++++++-----------
>  net/mpls/internal.h |  4 ++--
>  2 files changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index cd8be8d5e4ad..d3dc6c43a1d1 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -192,7 +192,7 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
>  static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
>  					     struct sk_buff *skb)
>  {
> -	int alive = ACCESS_ONCE(rt->rt_nhn_alive);
> +	u8 alive = READ_ONCE(rt->rt_nhn_alive);
>  	u32 hash = 0;
>  	int nh_index = 0;
>  	int n = 0;
> @@ -458,7 +458,7 @@ struct mpls_route_config {
>  	int			rc_mp_len;
>  };
>  
> -static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
> +static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen)
>  {
>  	u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
>  	struct mpls_route *rt;
> @@ -736,11 +736,11 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
>  	return err;
>  }
>  
> -static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
> -			       u8 cfg_via_alen, u8 *max_via_alen)
> +static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
> +			      u8 cfg_via_alen, u8 *max_via_alen)
>  {
> -	int nhs = 0;
>  	int remaining = len;
> +	u8 nhs = 0;
>  
>  	if (!rtnh) {
>  		*max_via_alen = cfg_via_alen;
> @@ -765,7 +765,14 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
>  						      via_alen);
>  		}
>  
> +		/* number of nexthops is tracked by a u8.
> +		 * Check for overflow.
> +		 */
> +		if (nhs == 255)
> +			return 0;
> +
>  		nhs++;
> +
>  		rtnh = rtnh_next(rtnh, &remaining);
>  	}
>  
> @@ -779,8 +786,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>  	struct rtnexthop *rtnh = cfg->rc_mp;
>  	struct nlattr *nla_via, *nla_newdst;
>  	int remaining = cfg->rc_mp_len;
> -	int nhs = 0;
>  	int err = 0;
> +	u8 nhs = 0;
>  
>  	change_nexthops(rt) {
>  		int attrlen;
> @@ -834,7 +841,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>  	int err = -EINVAL;
>  	u8 max_via_alen;
>  	unsigned index;
> -	int nhs;
> +	u8 nhs;
>  
>  	index = cfg->rc_label;
>  
> @@ -1299,8 +1306,8 @@ static void mpls_ifdown(struct net_device *dev, int event)
>  	struct mpls_route __rcu **platform_label;
>  	struct net *net = dev_net(dev);
>  	unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
> -	unsigned int alive;
>  	unsigned index;
> +	u8 alive;
>  
>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>  	for (index = 0; index < net->mpls.platform_labels; index++) {
> @@ -1339,7 +1346,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
>  	struct mpls_route __rcu **platform_label;
>  	struct net *net = dev_net(dev);
>  	unsigned index;
> -	int alive;
> +	u8 alive;
>  
>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>  	for (index = 0; index < net->mpls.platform_labels; index++) {
> @@ -1761,8 +1768,8 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>  	} else {
>  		struct rtnexthop *rtnh;
>  		struct nlattr *mp;
> -		int dead = 0;
> -		int linkdown = 0;
> +		u8 linkdown = 0;
> +		u8 dead = 0;
>  
>  		mp = nla_nest_start(skb, RTA_MULTIPATH);
>  		if (!mp)
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index 62928d8fabd1..66f388ba2d49 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -123,8 +123,8 @@ struct mpls_route { /* next hop label forwarding entry */
>  	u8			rt_payload_type;
>  	u8			rt_max_alen;
>  	u8			rt_ttl_propagate;
> -	unsigned int		rt_nhn;
> -	unsigned int		rt_nhn_alive;
> +	u8			rt_nhn;
> +	u8			rt_nhn_alive;
>  	struct mpls_nh		rt_nh[0];
>  };

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-25 19:15 ` [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route Eric W. Biederman
@ 2017-03-27 10:39   ` Robert Shearman
  2017-03-27 14:21     ` David Ahern
  0 siblings, 1 reply; 20+ messages in thread
From: Robert Shearman @ 2017-03-27 10:39 UTC (permalink / raw)
  To: Eric W. Biederman, David Ahern; +Cc: netdev, roopa

On 25/03/17 19:15, Eric W. Biederman wrote:
> David Ahern <dsa@cumulusnetworks.com> writes:
>
>> Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
>> memory consumption in check the labels array is moved to the end of mpls_nh
>> and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
>> maximum number of labels across all nexthops in a route for LSR and the
>> number of labels configured for LWT.
>>
>> The mpls_route layout is changed to:
>>
>>    +----------------------+
>>    | mpls_route           |
>>    +----------------------+
>>    | mpls_nh 0            |
>>    +----------------------+
>>    | alignment padding    |   4 bytes for odd number of labels; 0 for even
>>    +----------------------+
>>    | via[rt_max_alen] 0   |
>>    +----------------------+
>>    | alignment padding    |   via's aligned on sizeof(unsigned long)
>>    +----------------------+
>>    | ...                  |
>>
>> Meaning the via follows its mpls_nh providing better locality as the
>> number of labels increases. UDP_RR tests with namespaces shows no impact
>> to a modest performance increase with this layout for 1 or 2 labels and
>> 1 or 2 nexthops.
>>
>> The new limit is set to 12 to cover all currently known segment
>> routing use cases.
>
> How does this compare with running the packet a couple of times through
> the mpls table to get all of the desired labels applied?

At the moment (i.e setting output interface for a route to the loopback 
interface) the TTL would currently be calculated incorrectly since it'll 
be decremented each time the packet is run through the input processing. 
If that was avoided, then the only issue would be the lower performance.

>
> I can certainly see the case in an mpls tunnel ingress where this might
> could be desirable.    Which is something you implement in your last
> patch.  However is it at all common to push lots of labels at once
> during routing?
>
> I am probably a bit naive but it seems absurd to push more
> than a handful of labels onto a packet as you are routing it.

 From draft-ietf-spring-segment-routing-mpls-07:

    Note that the kind of deployment of Segment Routing may affect the
    depth of the MPLS label stack.  As every segment in the list is
    represented by an additional MPLS label, the length of the segment
    list directly correlates to the depth of the label stack.
    Implementing a long path with many explicit hops as a segment list
    may thus yield a deep label stack that would need to be pushed at the
    head of the SR tunnel.

    However, many use cases would need very few segments in the list.
    This is especially true when taking good advantage of the ECMP aware
    routing within each segment.  In fact most use cases need just one
    additional segment and thus lead to a similar label stack depth as
    e.g.  RSVP-based routing.

The summary is that when using short-path routing then the number of 
labels needed to be pushed on will be small (2 or 3). However, if using 
SR to implement traffic engineering through a list of explicit hops then 
the number of labels pushed could be much greater and up to the diameter 
of the IGP network. Traffic engineering like this is not unusual.

Thanks,
Rob

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-27 10:39   ` Robert Shearman
@ 2017-03-27 14:21     ` David Ahern
  2017-03-28  3:08       ` Eric W. Biederman
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2017-03-27 14:21 UTC (permalink / raw)
  To: Robert Shearman, Eric W. Biederman; +Cc: netdev, roopa

On 3/27/17 4:39 AM, Robert Shearman wrote:
> On 25/03/17 19:15, Eric W. Biederman wrote:
>> David Ahern <dsa@cumulusnetworks.com> writes:
>>
>>> Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
>>> memory consumption in check the labels array is moved to the end of
>>> mpls_nh
>>> and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
>>> maximum number of labels across all nexthops in a route for LSR and the
>>> number of labels configured for LWT.
>>>
>>> The mpls_route layout is changed to:
>>>
>>>    +----------------------+
>>>    | mpls_route           |
>>>    +----------------------+
>>>    | mpls_nh 0            |
>>>    +----------------------+
>>>    | alignment padding    |   4 bytes for odd number of labels; 0 for
>>> even
>>>    +----------------------+
>>>    | via[rt_max_alen] 0   |
>>>    +----------------------+
>>>    | alignment padding    |   via's aligned on sizeof(unsigned long)
>>>    +----------------------+
>>>    | ...                  |
>>>
>>> Meaning the via follows its mpls_nh providing better locality as the
>>> number of labels increases. UDP_RR tests with namespaces shows no impact
>>> to a modest performance increase with this layout for 1 or 2 labels and
>>> 1 or 2 nexthops.
>>>
>>> The new limit is set to 12 to cover all currently known segment
>>> routing use cases.
>>
>> How does this compare with running the packet a couple of times through
>> the mpls table to get all of the desired labels applied?
> 
> At the moment (i.e setting output interface for a route to the loopback
> interface) the TTL would currently be calculated incorrectly since it'll
> be decremented each time the packet is run through the input processing.
> If that was avoided, then the only issue would be the lower performance.

We have the infrastructure to add all the labels on one pass. It does
not make sense to recirculate the packet to get the same effect.


> 
>>
>> I can certainly see the case in an mpls tunnel ingress where this might
>> could be desirable.    Which is something you implement in your last
>> patch.  However is it at all common to push lots of labels at once
>> during routing?
>>
>> I am probably a bit naive but it seems absurd to push more
>> than a handful of labels onto a packet as you are routing it.
> 
> From draft-ietf-spring-segment-routing-mpls-07:
> 
>    Note that the kind of deployment of Segment Routing may affect the
>    depth of the MPLS label stack.  As every segment in the list is
>    represented by an additional MPLS label, the length of the segment
>    list directly correlates to the depth of the label stack.
>    Implementing a long path with many explicit hops as a segment list
>    may thus yield a deep label stack that would need to be pushed at the
>    head of the SR tunnel.
> 
>    However, many use cases would need very few segments in the list.
>    This is especially true when taking good advantage of the ECMP aware
>    routing within each segment.  In fact most use cases need just one
>    additional segment and thus lead to a similar label stack depth as
>    e.g.  RSVP-based routing.
> 
> The summary is that when using short-path routing then the number of
> labels needed to be pushed on will be small (2 or 3). However, if using
> SR to implement traffic engineering through a list of explicit hops then
> the number of labels pushed could be much greater and up to the diameter
> of the IGP network. Traffic engineering like this is not unusual.

And the thread on the FRR mailing list has other ietf references. The
summary is that there are plenty of use cases for more labels on ingress
(ip->mpls) and route paths (mpls->mpls). I did see one comment that 12
may not be enough for all use cases. Why not 16 or 20?

This patch set bumps the number of labels and the performance impact is
only to users that use a high label count. Other than a temporary stack
variable for installing the routes, no memory is allocated based on the
limit as an array size, so we could just as easily go with 16 - a nice
round number.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8
  2017-03-27  3:11   ` Eric W. Biederman
@ 2017-03-27 14:43     ` David Ahern
  2017-03-27 22:54       ` Eric W. Biederman
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2017-03-27 14:43 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: netdev, roopa, rshearma

On 3/26/17 9:11 PM, Eric W. Biederman wrote:
> I don't like this.  Byte writes don't exist on all architectures.
> 
> So while I think always writing to rtn_nhn_alive under the
> rtn_lock ensures that we don't have wrong values written
> it is quite subtle.  And I don't know how this will interact with other
> fields that you are introducing.
> 
> AKA this might be ok, but I expect this formulation of the code
> will easily bit-rot and break.

net/ has other use cases -- e.g., ipv6 tunneling has proto as a u8.

It is unrealistic for a route to have 255 or more nexthops so the point of
this patch is to not waste 8 bytes tracking it - especially when
removing it gets routes with ipv4 and ipv6 via's into a cache line.

I can make the alive counter a u16 without increasing the size of the
struct. I'd prefer to leave it as an u8 to have a u8 hole for flags
should something be needed later.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
                   ` (4 preceding siblings ...)
  2017-03-25 19:15 ` [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route Eric W. Biederman
@ 2017-03-27 22:52 ` David Miller
  2017-03-28  9:59 ` Robert Shearman
  6 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2017-03-27 22:52 UTC (permalink / raw)
  To: dsa; +Cc: netdev, roopa, rshearma, ebiederm

From: David Ahern <dsa@cumulusnetworks.com>
Date: Sat, 25 Mar 2017 10:03:24 -0700

> Bump the maximum number of labels for MPLS routes from 2 to 12.

This doesn't apply cleanly to net-next, please respin.

Perhaps it conflicts with your recent cleanups.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8
  2017-03-27 14:43     ` David Ahern
@ 2017-03-27 22:54       ` Eric W. Biederman
  2017-03-28 15:25         ` David Ahern
  0 siblings, 1 reply; 20+ messages in thread
From: Eric W. Biederman @ 2017-03-27 22:54 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, roopa, rshearma

David Ahern <dsa@cumulusnetworks.com> writes:

> On 3/26/17 9:11 PM, Eric W. Biederman wrote:
>> I don't like this.  Byte writes don't exist on all architectures.
>> 
>> So while I think always writing to rtn_nhn_alive under the
>> rtn_lock ensures that we don't have wrong values written
>> it is quite subtle.  And I don't know how this will interact with other
>> fields that you are introducing.
>> 
>> AKA this might be ok, but I expect this formulation of the code
>> will easily bit-rot and break.
>
> net/ has other use cases -- e.g., ipv6 tunneling has proto as a u8.
>
> It unrealistic for a route to have 255 or more nexthops so the point of
> this patch is to not waste 8 bytes tracking it - especially when
> removing it gets routes with ipv4 and ipv6 via's into a cache line.

The argument isn't that 255 nexthops is too few but that there is no
instruction to write to a single byte on some architectures.

My concern is that if we are writing a field using a non-byte write
without care we could easily have confusion with adjacent fields.

> I can make the alive counter a u16 without increasing the size of the
> struct. I'd prefer to leave it as an u8 to have a u8 hole for flags
> should something be needed later.

u16 is no better than u8.

The original architecture was that all changes to an mpls route would
be done in read, copy, allocate a new route, and replace the pointer
fashion.  Which allows rcu access.

There was argument made that it is silly to do that when a the network
device for a hop goes up or down.  Something about the memory allocation
not being reliable as I recall. And so we now have rt_nhn_alive and it
stored as an int so that it can be read and written atomically.

It is absolutely a no-brainer to change rt_nhn to a u8.  And I very much
appreciate all work to keep mpls_route into a single cache line.  As in
practices that is one of the most important parts to performance.

Which leads to the functions mpls_ifup, mpls_ifdown, and
mpls_select_multipath.

To make this all work correctly we need a couple of things.
- A big fat comment on struct mpls_route and mpls_nh about how
  and why these structures are modified and not replaced during
  nexthop processing.  Including the fact that all modifications
  may only happen with rtnl_lock held.

- The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses,
  that happen after the route is installed (and is thus rcu reachable).

- The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses,
  that happen after the route is installed (and is thus rcu reachable).

Someone needs to fix mpls_ifup AKA something like:

 			struct net_device *nh_dev =
 				rtnl_dereference(nh->nh_dev);

+			unsigned int flags = READ_ONCE(nh->nh_flags);
+			if (nh_dev == dev) {
+				flags &= ~nh_flags;
+				WRITE_ONCE(nh->nh_flags, flags);
+			}
+			if (!(flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)))
+				alive++;
-			if (!(nh->nh_flags & nh_flags)) {
-				alive++;
-				continue;
-			}
-			if (nh_dev != dev)
-				continue;
-			alive++;
-			nh->nh_flags &= ~nh_flags;
 		} endfor_nexthops(rt);
 
-		ACCESS_ONCE(rt->rt_nhn_alive) = alive;
+		WRITE_ONCE(rt->rt_nhn_alive, alive);
 	}
 }

If we comment it all clearly and make very certain that the magic with
nh->nh_flags and rt->rt_nhn_alive works I don't object.  But we need to
let future people who touch the code know: here be dragons.

Especially as anything else in the same 32 bits as rt->nhn_alive can be
rewritten by our update of that field too.  So we need to be very
careful to serialize any update like that.

Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/4] net: mpls: change mpls_route layout
  2017-03-25 17:03 ` [PATCH net-next 2/4] net: mpls: change mpls_route layout David Ahern
@ 2017-03-28  0:04   ` Eric W. Biederman
  0 siblings, 0 replies; 20+ messages in thread
From: Eric W. Biederman @ 2017-03-28  0:04 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, roopa, rshearma

David Ahern <dsa@cumulusnetworks.com> writes:

> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index 66f388ba2d49..302d48f54b57 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -64,7 +64,6 @@ struct mpls_dev {
>  struct sk_buff;
>  
>  #define LABEL_NOT_SPECIFIED (1 << 20)
> -#define MAX_NEW_LABELS 2
>  
>  /* This maximum ha length copied from the definition of struct neighbour */
>  #define VIA_ALEN_ALIGN sizeof(unsigned long)
> @@ -84,12 +83,25 @@ enum mpls_payload_type {
>  struct mpls_nh { /* next hop label forwarding entry */
>  	struct net_device __rcu *nh_dev;
>  	unsigned int		nh_flags;
> -	u32			nh_label[MAX_NEW_LABELS];
>  	u8			nh_labels;
>  	u8			nh_via_alen;
>  	u8			nh_via_table;
> +	/* u8 hole */

This hole probably be better documented with:
	u8			nh_reserved1;
> +	u32			nh_label[0];
>  };

Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-27 14:21     ` David Ahern
@ 2017-03-28  3:08       ` Eric W. Biederman
  2017-03-28  9:52         ` Robert Shearman
                           ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Eric W. Biederman @ 2017-03-28  3:08 UTC (permalink / raw)
  To: David Ahern; +Cc: Robert Shearman, netdev, roopa

David Ahern <dsa@cumulusnetworks.com> writes:

> On 3/27/17 4:39 AM, Robert Shearman wrote:
>> On 25/03/17 19:15, Eric W. Biederman wrote:
>>> David Ahern <dsa@cumulusnetworks.com> writes:
>>>
>>>> Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
>>>> memory consumption in check the labels array is moved to the end of
>>>> mpls_nh
>>>> and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
>>>> maximum number of labels across all nexthops in a route for LSR and the
>>>> number of labels configured for LWT.
>>>>
>>>> The mpls_route layout is changed to:
>>>>
>>>>    +----------------------+
>>>>    | mpls_route           |
>>>>    +----------------------+
>>>>    | mpls_nh 0            |
>>>>    +----------------------+
>>>>    | alignment padding    |   4 bytes for odd number of labels; 0 for
>>>> even
>>>>    +----------------------+
>>>>    | via[rt_max_alen] 0   |
>>>>    +----------------------+
>>>>    | alignment padding    |   via's aligned on sizeof(unsigned long)
>>>>    +----------------------+
>>>>    | ...                  |
>>>>
>>>> Meaning the via follows its mpls_nh providing better locality as the
>>>> number of labels increases. UDP_RR tests with namespaces shows no impact
>>>> to a modest performance increase with this layout for 1 or 2 labels and
>>>> 1 or 2 nexthops.
>>>>
>>>> The new limit is set to 12 to cover all currently known segment
>>>> routing use cases.
>>>
>>> How does this compare with running the packet a couple of times through
>>> the mpls table to get all of the desired labels applied?
>> 
>> At the moment (i.e setting output interface for a route to the loopback
>> interface) the TTL would currently be calculated incorrectly since it'll
>> be decremented each time the packet is run through the input processing.
>> If that was avoided, then the only issue would be the lower performance.
>
> We have the infrastructure to add all the labels on one pass. It does
> not make sense to recirculate the packet to get the same effect.

I was really asking what are the advantages and disadvantages of this
change rather than suggesting it was a bad idea.  The information about
ttl is useful.

Adding that this will route packets with more labels more quickly than
the recirculation method is also useful to know.

>>> I can certainly see the case in an mpls tunnel ingress where this might
>>> could be desirable.    Which is something you implement in your last
>>> patch.  However is it at all common to push lots of labels at once
>>> during routing?
>>>
>>> I am probably a bit naive but it seems absurd to push more
>>> than a handful of labels onto a packet as you are routing it.
>> 
>> From draft-ietf-spring-segment-routing-mpls-07:
>> 
>>    Note that the kind of deployment of Segment Routing may affect the
>>    depth of the MPLS label stack.  As every segment in the list is
>>    represented by an additional MPLS label, the length of the segment
>>    list directly correlates to the depth of the label stack.
>>    Implementing a long path with many explicit hops as a segment list
>>    may thus yield a deep label stack that would need to be pushed at the
>>    head of the SR tunnel.
>> 
>>    However, many use cases would need very few segments in the list.
>>    This is especially true when taking good advantage of the ECMP aware
>>    routing within each segment.  In fact most use cases need just one
>>    additional segment and thus lead to a similar label stack depth as
>>    e.g.  RSVP-based routing.
>> 
>> The summary is that when using short-path routing then the number of
>> labels needed to be pushed on will be small (2 or 3). However, if using
>> SR to implement traffic engineering through a list of explicit hops then
>> the number of labels pushed could be much greater and up to the diameter
>> of the IGP network. Traffic engineering like this is not unusual.
>
> And the thread on the FRR mailing list has other ietf references. The
> summary is that are plenty of use cases for more labels on ingress
> (ip->mpls) and route paths (mpls->mpls). I did see one comment that 12
> may not be enough for all use cases. Why not 16 or 20?
>
> This patch set bumps the number of labels and the performance impact is
> only to users that use a high label count. Other than a temporary stack
> variable for installing the routes, no memory is allocated based on the
> limit as an array size, so we could just as easily go with 16 - a nice
> round number.

Overall I like what is being accomplished by this patchset.
I especially like the fact that the forwarding path is left
> essentially unchanged, and that the struct mpls_route shrinks a little
> for the common case.

I believe we should just kill MAX_NEW_LABELS.

I think the only significant change from your patch is the removal of an
array from mpls_route_config.

With the removal of MAX_NEW_LABELS I would replace it by a sanity check
in mpls_rt_alloc that verifies that the amount we are going to allocate
for struct mpls_route is < PAGE_SIZE.  Anything larger is just
asking for trouble.

That should put our practical limit just a little bit below 32 nexthops
adding 32 labels each.

Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-28  3:08       ` Eric W. Biederman
@ 2017-03-28  9:52         ` Robert Shearman
  2017-03-28 14:39         ` David Ahern
  2017-03-29 21:20         ` David Ahern
  2 siblings, 0 replies; 20+ messages in thread
From: Robert Shearman @ 2017-03-28  9:52 UTC (permalink / raw)
  To: Eric W. Biederman, David Ahern; +Cc: netdev, roopa

On 28/03/17 04:08, Eric W. Biederman wrote:
> David Ahern <dsa@cumulusnetworks.com> writes:
>
>> On 3/27/17 4:39 AM, Robert Shearman wrote:
>>> On 25/03/17 19:15, Eric W. Biederman wrote:
>>>> David Ahern <dsa@cumulusnetworks.com> writes:
>>>>
>>>>> Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
>>>>> memory consumption in check the labels array is moved to the end of
>>>>> mpls_nh
>>>>> and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
>>>>> maximum number of labels across all nexthops in a route for LSR and the
>>>>> number of labels configured for LWT.
>>>>>
>>>>> The mpls_route layout is changed to:
>>>>>
>>>>>    +----------------------+
>>>>>    | mpls_route           |
>>>>>    +----------------------+
>>>>>    | mpls_nh 0            |
>>>>>    +----------------------+
>>>>>    | alignment padding    |   4 bytes for odd number of labels; 0 for
>>>>> even
>>>>>    +----------------------+
>>>>>    | via[rt_max_alen] 0   |
>>>>>    +----------------------+
>>>>>    | alignment padding    |   via's aligned on sizeof(unsigned long)
>>>>>    +----------------------+
>>>>>    | ...                  |
>>>>>
>>>>> Meaning the via follows its mpls_nh providing better locality as the
>>>>> number of labels increases. UDP_RR tests with namespaces shows no impact
>>>>> to a modest performance increase with this layout for 1 or 2 labels and
>>>>> 1 or 2 nexthops.
>>>>>
>>>>> The new limit is set to 12 to cover all currently known segment
>>>>> routing use cases.
>>>>
>>>> How does this compare with running the packet a couple of times through
>>>> the mpls table to get all of the desired labels applied?
>>>
>>> At the moment (i.e setting output interface for a route to the loopback
>>> interface) the TTL would currently be calculated incorrectly since it'll
>>> be decremented each time the packet is run through the input processing.
>>> If that was avoided, then the only issue would be the lower performance.
>>
>> We have the infrastructure to add all the labels on one pass. It does
>> not make sense to recirculate the packet to get the same effect.
>
> I was really asking what are the advantages and disadvantages of this
> change rather than suggesting it was a bad idea.  The information about
> ttl is useful.
>
> Adding that this will route packets with more labels more quickly than
> the recirculation method is also useful to know.

I should also add that not recirculating also avoids having to allocate 
extra local labels, which may be limited in supply in some deployments, 
and avoids the extra control plane/user complexity associated with 
managing the routes associated with the recirculation.

Thanks,
Rob

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
                   ` (5 preceding siblings ...)
  2017-03-27 22:52 ` David Miller
@ 2017-03-28  9:59 ` Robert Shearman
  6 siblings, 0 replies; 20+ messages in thread
From: Robert Shearman @ 2017-03-28  9:59 UTC (permalink / raw)
  To: David Ahern, netdev; +Cc: roopa, ebiederm

On 25/03/17 17:03, David Ahern wrote:
> Bump the maximum number of labels for MPLS routes from 2 to 12. To keep
> memory consumption in check the labels array is moved to the end of mpls_nh
> and mpls_iptunnel_encap structs as a 0-sized array. Allocations use the
> maximum number of labels across all nexthops in a route for LSR and the
> number of labels configured for LWT.
>
> The mpls_route layout is changed to:
...
> Meaning the via follows its mpls_nh providing better locality as the
> number of labels increases. UDP_RR tests with namespaces shows no impact
> to a modest performance increase with this layout for 1 or 2 labels and
> 1 or 2 nexthops.
>
> The new limit is set to 12 to cover all currently known segment
> routing use cases.
>
> David Ahern (4):
>   mpls: Convert number of nexthops to u8
>   net: mpls: change mpls_route layout
>   net: mpls: bump maximum number of labels
>   net: mpls: Increase max number of labels for lwt encap
>
>  include/net/mpls_iptunnel.h |   4 +-
>  net/mpls/af_mpls.c          | 108 ++++++++++++++++++++++++++------------------
>  net/mpls/internal.h         |  52 ++++++++++++++-------
>  net/mpls/mpls_iptunnel.c    |  13 ++++--
>  4 files changed, 112 insertions(+), 65 deletions(-)
>

Acked-by: Robert Shearman <rshearma@brocade.com>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-28  3:08       ` Eric W. Biederman
  2017-03-28  9:52         ` Robert Shearman
@ 2017-03-28 14:39         ` David Ahern
  2017-03-29 21:20         ` David Ahern
  2 siblings, 0 replies; 20+ messages in thread
From: David Ahern @ 2017-03-28 14:39 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Robert Shearman, netdev, roopa

On 3/27/17 9:08 PM, Eric W. Biederman wrote:
> 
> Overall I like what is being accomplished by this patchset.
> I especially like the fact that the forwarding path is left
> essentially unchanged, and that the struct mpls_route shrinks a little
> for the common case.
> 
> I believe we should just kill MAX_NEW_LABELS.
> 
> I think the only significant change from your patch is the removal of an
> array from mpls_route_config.
> 
> With the removal of MAX_NEW_LABELS I would replace it by a sanity check
> in mpls_rt_alloc that verifies that the amount we are going to allocate
> for struct mpls_route is < PAGE_SIZE.  Anything larger is just
> asking for trouble.

Sure. We would want a consistent API across architectures. As I recall
some of them have 64kB page sizes, so rather than page size let's limit
to the number 4096 rather than PAGE_SIZE.

> 
> That should put our practical limit just a little bit below 32 nexthops
> adding 32 labels each.

I'll send out a v2 soon.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8
  2017-03-27 22:54       ` Eric W. Biederman
@ 2017-03-28 15:25         ` David Ahern
  2017-03-28 18:39           ` Eric W. Biederman
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2017-03-28 15:25 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: netdev, roopa, rshearma

On 3/27/17 4:54 PM, Eric W. Biederman wrote:
> It is absolutely a no-brainer to change rt_nhn to a u8.  And I very much
> appreciate all work to keep mpls_route into a single cache line.  As in
> practice that is one of the most important parts to performance.
> 
> Which leads to the functions mpls_ifup, mpls_ifdown, and
> mpls_select_multipath.
> 
> To make this all work correctly we need a couple of things.
> - A big fat comment on struct mpls_route and mpls_nh about how
>   and why these structures are modified and not replaced during
>   nexthop processing.  Including the fact that all modifications
>   may only happen with rtnl_lock held.
> 
> - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses,
>   that happen after the route is installed (and is thus rcu reachable).
> 
> - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses,
>   that happen after the route is installed (and is thus rcu reachable).

For both of these, mpls_select_multipath does need to use READ_ONCE to
read the nh_flags and rt_nhn_alive. In this case it is reading a value
that could change behind its back.

The READ_ONCE is not necessary for mpls_ifdown or mpls_ifup as these are
the functions that change the values. These 2 functions only need a
WRITE_ONCE for both struct members.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8
  2017-03-28 15:25         ` David Ahern
@ 2017-03-28 18:39           ` Eric W. Biederman
  0 siblings, 0 replies; 20+ messages in thread
From: Eric W. Biederman @ 2017-03-28 18:39 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, roopa, rshearma

David Ahern <dsa@cumulusnetworks.com> writes:

> On 3/27/17 4:54 PM, Eric W. Biederman wrote:
>> It is absolutely a no-brainer to change rt_nhn to a u8.  And I very much
>> appreciate all work to keep mpls_route into a single cache line.  As in
>> practice that is one of the most important parts to performance.
>> 
>> Which leads to the functions mpls_ifup, mpls_ifdown, and
>> mpls_select_multipath.
>> 
>> To make this all work correctly we need a couple of things.
>> - A big fat comment on struct mpls_route and mpls_nh about how
>>   and why these structures are modified and not replaced during
>>   nexthop processing.  Including the fact that all modifications
>>   may only happen with rtnl_lock held.
>> 
>> - The use of READ_ONCE and WRITE_ONCE on all rt->rt_nhn_alive accesses,
>>   that happen after the route is installed (and is thus rcu reachable).
>> 
>> - The use of READ_ONCE and WRITE_ONCE on all nh->nh_flags accesses,
>>   that happen after the route is installed (and is thus rcu reachable).
>
> For both of these, mpls_select_multipath does need to use READ_ONCE to
> read the nh_flags and rt_nhn_alive. In this case it is reading a value
> that could change behind its back.
>
> The READ_ONCE is not necessary for mpls_ifdown or mpls_ifup as these are
> the functions that change the values. These 2 functions only need a
> WRITE_ONCE for both struct members.

True.  We don't need READ_ONCE under rtnl_lock which we use to protect writes.

Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route
  2017-03-28  3:08       ` Eric W. Biederman
  2017-03-28  9:52         ` Robert Shearman
  2017-03-28 14:39         ` David Ahern
@ 2017-03-29 21:20         ` David Ahern
  2 siblings, 0 replies; 20+ messages in thread
From: David Ahern @ 2017-03-29 21:20 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Robert Shearman, netdev, roopa

On 3/27/17 9:08 PM, Eric W. Biederman wrote:
> I believe we should just kill MAX_NEW_LABELS.
> 
> I think the only significant change from your patch is the removal of an
> array from mpls_route_config.
> 
> With the removal of MAX_NEW_LABELS I would replace it by a sanity check
> in mpls_rt_alloc that verifies that the amount we are going to allocate
> for struct mpls_route is < PAGE_SIZE.  Anything larger is just
> asking for trouble.
> 
> That should put our practical limit just a little bit below 32 nexthops
> adding 32 labels each.

The 4096 limit works nice for mpls_route but not for lwt encap info.
That struct is 4-bytes + the labels. Seems odd to let ip->mpls allow up
to 255 labels (max for u8) while mpls->mpls has a limit.

I'm going to send v2 soon with the 4096 limit for mpls_route total size
but keeping the MAX_NEW_LABELS with a count of 30 for both. That keeps
the two paths consistent, keeps mpls_iptunnel_encap < 128 bytes for the
max allocation and 30 labels allows plenty of options for TE and SR.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2017-03-29 21:20 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-25 17:03 [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route David Ahern
2017-03-25 17:03 ` [PATCH net-next 1/4] net: mpls: Convert number of nexthops to u8 David Ahern
2017-03-27  3:11   ` Eric W. Biederman
2017-03-27 14:43     ` David Ahern
2017-03-27 22:54       ` Eric W. Biederman
2017-03-28 15:25         ` David Ahern
2017-03-28 18:39           ` Eric W. Biederman
2017-03-25 17:03 ` [PATCH net-next 2/4] net: mpls: change mpls_route layout David Ahern
2017-03-28  0:04   ` Eric W. Biederman
2017-03-25 17:03 ` [PATCH net-next 3/4] net: mpls: bump maximum number of labels David Ahern
2017-03-25 17:03 ` [PATCH net-next 4/4] net: mpls: Increase max number of labels for lwt encap David Ahern
2017-03-25 19:15 ` [PATCH net-next 0/4] net: mpls: Allow users to configure more labels per route Eric W. Biederman
2017-03-27 10:39   ` Robert Shearman
2017-03-27 14:21     ` David Ahern
2017-03-28  3:08       ` Eric W. Biederman
2017-03-28  9:52         ` Robert Shearman
2017-03-28 14:39         ` David Ahern
2017-03-29 21:20         ` David Ahern
2017-03-27 22:52 ` David Miller
2017-03-28  9:59 ` Robert Shearman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.