netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next] net: ipv4: add support for ECMP hash policy choice
@ 2017-03-06 14:59 Nikolay Aleksandrov
  2017-03-06 16:24 ` David Ahern
                   ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-06 14:59 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, davem, roopa, dsa, Nikolay Aleksandrov

This patch adds support for ECMP hash policy choice via a new sysctl
called fib_multipath_hash_policy and also adds support for L4 hashes.
The current values for fib_multipath_hash_policy are:
 0 - layer 3
 1 - layer 4 (new default)
If there's an skb hash already set and it matches the chosen policy then it
will be used instead of being calculated. The ICMP inner IP addresses use
is removed, and we switch to L4 default for better distribution.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
I'm not happy with using an integer, but it produces the smallest churn.
Just let me know if you'd like to switch to a string sysctl.

 Documentation/networking/ip-sysctl.txt |  8 +++
 include/net/ip_fib.h                   | 14 ++---
 include/net/netns/ipv4.h               |  1 +
 include/net/route.h                    |  5 +-
 net/ipv4/fib_frontend.c                |  3 ++
 net/ipv4/fib_semantics.c               | 11 ++--
 net/ipv4/icmp.c                        | 19 +------
 net/ipv4/route.c                       | 93 ++++++++++++++++++----------------
 net/ipv4/sysctl_net_ipv4.c             |  9 ++++
 9 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index fc73eeb7b3b8..15810ca7d8b0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
 	0 - disabled
 	1 - enabled
 
+fib_multipath_hash_policy - INTEGER
+	Controls which hash policy to use for multipath routes. Only valid
+	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+	Default: 1 (Layer 4)
+	Possible values:
+	0 - Layer 3
+	1 - Layer 4
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 368bb4024b78..8ac9bec053c5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -371,17 +371,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
-extern u32 fib_multipath_secret __read_mostly;
-
-static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
-{
-	return jhash_2words((__force u32)saddr, (__force u32)daddr,
-			    fib_multipath_secret) >> 1;
-}
-
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb);
+#endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash);
+		     struct flowi4 *fl4);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 622d2da27135..70a1d4251790 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -152,6 +152,7 @@ struct netns_ipv4 {
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
+	int sysctl_fib_multipath_hash_policy;
 #endif
 
 	unsigned int	fib_seq;	/* protected by rtnl_mutex */
diff --git a/include/net/route.h b/include/net/route.h
index c0874c87c173..77a5c613a290 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -113,13 +113,12 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
-					  int mp_hash);
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp);
 
 static inline struct rtable *__ip_route_output_key(struct net *net,
 						   struct flowi4 *flp)
 {
-	return __ip_route_output_key_hash(net, flp, -1);
+	return __ip_route_output_key_hash(net, flp);
 }
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 42bfd08109dd..bba87195cbf4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1233,6 +1233,9 @@ static int __net_init ip_fib_net_init(struct net *net)
 	/* Avoid false sharing : Use at least a full cache line */
 	size = max_t(size_t, size, L1_CACHE_BYTES);
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	net->ipv4.sysctl_fib_multipath_hash_policy = 1;
+#endif
 	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
 	if (!net->ipv4.fib_table_hash)
 		return -ENOMEM;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..6601bd9744c9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-u32 fib_multipath_secret __read_mostly;
 
 #define for_nexthops(fi) {						\
 	int nhsel; const struct fib_nh *nh;				\
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
 
 		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
-
-	net_get_random_once(&fib_multipath_secret,
-			    sizeof(fib_multipath_secret));
 }
 
 static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash)
+		     struct flowi4 *fl4)
 {
 	bool oif_check;
 
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi->fib_nhs > 1 && oif_check) {
-		if (mp_hash < 0)
-			mp_hash = get_hash_from_flowi4(fl4) >> 1;
+		int h = fib_multipath_hash(res->fi, fl4, NULL);
 
-		fib_select_multipath(res, mp_hash);
+		fib_select_multipath(res, h);
 	}
 	else
 #endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc310db2708b..4929daf65e6f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	local_bh_enable();
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* Source and destination is swapped. See ip_multipath_icmp_hash */
-static int icmp_multipath_hash_skb(const struct sk_buff *skb)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-
-	return fib_multipath_hash(iph->daddr, iph->saddr);
-}
-
-#else
-
-#define icmp_multipath_hash_skb(skb) (-1)
-
-#endif
-
 static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-	rt = __ip_route_output_key_hash(net, fl4,
-					icmp_multipath_hash_skb(skb_in));
+	rt = __ip_route_output_key_hash(net, fl4);
 	if (IS_ERR(rt))
 		return rt;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8471dd116771..0193100baac4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1734,45 +1734,54 @@ static int __mkroute_input(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* To make ICMP packets follow the right flow, the multipath hash is
- * calculated from the inner IP addresses in reverse order.
- */
-static int ip_multipath_icmp_hash(struct sk_buff *skb)
-{
-	const struct iphdr *outer_iph = ip_hdr(skb);
-	struct icmphdr _icmph;
-	const struct icmphdr *icmph;
-	struct iphdr _inner_iph;
-	const struct iphdr *inner_iph;
-
-	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
-		goto standard_hash;
-
-	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
-				   &_icmph);
-	if (!icmph)
-		goto standard_hash;
-
-	if (icmph->type != ICMP_DEST_UNREACH &&
-	    icmph->type != ICMP_REDIRECT &&
-	    icmph->type != ICMP_TIME_EXCEEDED &&
-	    icmph->type != ICMP_PARAMETERPROB) {
-		goto standard_hash;
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb)
+{
+	struct net *net = fi->fib_net;
+	struct flow_keys hash_keys;
+	u32 mhash;
+
+	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+	case 0:
+		if (skb && skb_get_hash_raw(skb) && !skb->l4_hash)
+			return skb_get_hash_raw(skb) >> 1;
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		hash_keys.addrs.v4addrs.src = fl4->saddr;
+		hash_keys.addrs.v4addrs.dst = fl4->daddr;
+		break;
+	case 1:
+		/* skb is currently provided only when forwarding */
+		if (skb) {
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			struct flow_keys keys;
+
+			/* short-circuit if we already have L4 hash present */
+			if (skb->l4_hash)
+				return skb_get_hash_raw(skb) >> 1;
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			skb_flow_dissect_flow_keys(skb, &keys, flag);
+			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+			hash_keys.ports.src = keys.ports.src;
+			hash_keys.ports.dst = keys.ports.dst;
+			hash_keys.basic.ip_proto = keys.basic.ip_proto;
+		} else {
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+			hash_keys.ports.src = fl4->fl4_sport;
+			hash_keys.ports.dst = fl4->fl4_dport;
+			hash_keys.basic.ip_proto = fl4->flowi4_proto;
+		}
+		break;
 	}
+	mhash = flow_hash_from_keys(&hash_keys);
 
-	inner_iph = skb_header_pointer(skb,
-				       outer_iph->ihl * 4 + sizeof(_icmph),
-				       sizeof(_inner_iph), &_inner_iph);
-	if (!inner_iph)
-		goto standard_hash;
-
-	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
-
-standard_hash:
-	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+	return mhash >> 1;
 }
-
+EXPORT_SYMBOL_GPL(fib_multipath_hash);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
 static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1791,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1) {
-		int h;
+		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
+		int h = fib_multipath_hash(res->fi, &fl4, skb);
 
-		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
-			h = ip_multipath_icmp_hash(skb);
-		else
-			h = fib_multipath_hash(saddr, daddr);
 		fib_select_multipath(res, h);
 	}
 #endif
@@ -2202,8 +2208,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  * Major route resolver routine.
  */
 
-struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-					  int mp_hash)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2365,7 +2370,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 		goto make_route;
 	}
 
-	fib_select_path(net, &res, fl4, mp_hash);
+	fib_select_path(net, &res, fl4);
 
 	dev_out = FIB_RES_DEV(res);
 	fl4->flowi4_oif = dev_out->ifindex;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d6880a6149ee..62c4f94923e5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "fib_multipath_hash_policy",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next] net: ipv4: add support for ECMP hash policy choice
  2017-03-06 14:59 [PATCH net-next] net: ipv4: add support for ECMP hash policy choice Nikolay Aleksandrov
@ 2017-03-06 16:24 ` David Ahern
  2017-03-06 16:52   ` Nikolay Aleksandrov
  2017-03-07  6:16 ` Roopa Prabhu
  2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
  2 siblings, 1 reply; 36+ messages in thread
From: David Ahern @ 2017-03-06 16:24 UTC (permalink / raw)
  To: Nikolay Aleksandrov, netdev; +Cc: edumazet, davem, roopa

On 3/6/17 7:59 AM, Nikolay Aleksandrov wrote:
> diff --git a/include/net/route.h b/include/net/route.h
> index c0874c87c173..77a5c613a290 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -113,13 +113,12 @@ struct in_device;
>  int ip_rt_init(void);
>  void rt_cache_flush(struct net *net);
>  void rt_flush_dev(struct net_device *dev);
> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
> -					  int mp_hash);
> +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp);
>  
>  static inline struct rtable *__ip_route_output_key(struct net *net,
>  						   struct flowi4 *flp)
>  {
> -	return __ip_route_output_key_hash(net, flp, -1);
> +	return __ip_route_output_key_hash(net, flp);
>  }
>  
>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,

The "_hash" variant was added by 79a131592dbb8. If the mp_hash arg is
removed, the "_hash" wrapper should be removed and go back to
__ip_route_output_key.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next] net: ipv4: add support for ECMP hash policy choice
  2017-03-06 16:24 ` David Ahern
@ 2017-03-06 16:52   ` Nikolay Aleksandrov
  0 siblings, 0 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-06 16:52 UTC (permalink / raw)
  To: David Ahern, netdev; +Cc: edumazet, davem, roopa

On 06/03/17 18:24, David Ahern wrote:
> On 3/6/17 7:59 AM, Nikolay Aleksandrov wrote:
>> diff --git a/include/net/route.h b/include/net/route.h
>> index c0874c87c173..77a5c613a290 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -113,13 +113,12 @@ struct in_device;
>>  int ip_rt_init(void);
>>  void rt_cache_flush(struct net *net);
>>  void rt_flush_dev(struct net_device *dev);
>> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
>> -					  int mp_hash);
>> +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp);
>>  
>>  static inline struct rtable *__ip_route_output_key(struct net *net,
>>  						   struct flowi4 *flp)
>>  {
>> -	return __ip_route_output_key_hash(net, flp, -1);
>> +	return __ip_route_output_key_hash(net, flp);
>>  }
>>  
>>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
> 
> The "_hash" variant was added by 79a131592dbb8. If the mp_hash arg is
> removed, the "_hash" wrapper should be removed and go back to
> __ip_route_output_key.
> 

Ah yes, I've missed that. :-) Will remove the _hash variant when posting v2.

Thanks,
 Nik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next] net: ipv4: add support for ECMP hash policy choice
  2017-03-06 14:59 [PATCH net-next] net: ipv4: add support for ECMP hash policy choice Nikolay Aleksandrov
  2017-03-06 16:24 ` David Ahern
@ 2017-03-07  6:16 ` Roopa Prabhu
  2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
  2 siblings, 0 replies; 36+ messages in thread
From: Roopa Prabhu @ 2017-03-07  6:16 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: netdev, edumazet, davem, dsa

On 3/6/17, 6:59 AM, Nikolay Aleksandrov wrote:
> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3
>  1 - layer 4 (new default)
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated. The ICMP inner IP addresses use
> is removed, and we switch to L4 default for better distribution.
>
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> ---
> I'm not happy with using an integer, but it produces the smallest churn.
> Just let me know if you'd like to switch to a string sysctl.
>
to confirm, this makes ipv4 same as the default ecmp hashing in ipv6.
but ipv6 also has the flow label to help in some cases.

If we decide to move to l4 by default, the sysctl might not be necessary.
and, my only concern with moving to default is if it will upset any existing fragmented
flow distributions. I tried to do some quick survey on other system defaults. Most switches I have
heard of use 5-tuple by default in hw. Will look some more.

We still have the option of not making l4 hash the default (your first internal version). It will make
the ipv4 default different than ipv6 (which is today's state of things and that maybe ok if it is just a sysctl away).

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice
  2017-03-06 14:59 [PATCH net-next] net: ipv4: add support for ECMP hash policy choice Nikolay Aleksandrov
  2017-03-06 16:24 ` David Ahern
  2017-03-07  6:16 ` Roopa Prabhu
@ 2017-03-07 11:01 ` Nikolay Aleksandrov
  2017-03-08 12:05   ` Jakub Sitnicki
                     ` (2 more replies)
  2 siblings, 3 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-07 11:01 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, davem, roopa, dsa, Nikolay Aleksandrov

This patch adds support for ECMP hash policy choice via a new sysctl
called fib_multipath_hash_policy and also adds support for L4 hashes.
The current values for fib_multipath_hash_policy are:
 0 - layer 3 (default)
 1 - layer 4
If there's an skb hash already set and it matches the chosen policy then it
will be used instead of being calculated. The ICMP inner IP addresses use
is removed.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
v2:
 - removed the output_key_hash as it's not needed anymore
 - reverted to my original/internal patch with L3 as default hash

 Documentation/networking/ip-sysctl.txt |  8 +++
 include/net/ip_fib.h                   | 14 ++---
 include/net/netns/ipv4.h               |  1 +
 include/net/route.h                    |  9 +---
 net/ipv4/fib_semantics.c               | 11 ++--
 net/ipv4/icmp.c                        | 19 +------
 net/ipv4/route.c                       | 95 ++++++++++++++++++----------------
 net/ipv4/sysctl_net_ipv4.c             |  9 ++++
 8 files changed, 78 insertions(+), 88 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index fc73eeb7b3b8..558e19653106 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
 	0 - disabled
 	1 - enabled
 
+fib_multipath_hash_policy - INTEGER
+	Controls which hash policy to use for multipath routes. Only valid
+	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+	Default: 0 (Layer 3)
+	Possible values:
+	0 - Layer 3
+	1 - Layer 4
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 368bb4024b78..8ac9bec053c5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -371,17 +371,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
-extern u32 fib_multipath_secret __read_mostly;
-
-static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
-{
-	return jhash_2words((__force u32)saddr, (__force u32)daddr,
-			    fib_multipath_secret) >> 1;
-}
-
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb);
+#endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash);
+		     struct flowi4 *fl4);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 622d2da27135..70a1d4251790 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -152,6 +152,7 @@ struct netns_ipv4 {
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
+	int sysctl_fib_multipath_hash_policy;
 #endif
 
 	unsigned int	fib_seq;	/* protected by rtnl_mutex */
diff --git a/include/net/route.h b/include/net/route.h
index c0874c87c173..32412c91c222 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -113,14 +113,7 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
-					  int mp_hash);
-
-static inline struct rtable *__ip_route_output_key(struct net *net,
-						   struct flowi4 *flp)
-{
-	return __ip_route_output_key_hash(net, flp, -1);
-}
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 				    const struct sock *sk);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..6601bd9744c9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-u32 fib_multipath_secret __read_mostly;
 
 #define for_nexthops(fi) {						\
 	int nhsel; const struct fib_nh *nh;				\
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
 
 		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
-
-	net_get_random_once(&fib_multipath_secret,
-			    sizeof(fib_multipath_secret));
 }
 
 static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash)
+		     struct flowi4 *fl4)
 {
 	bool oif_check;
 
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi->fib_nhs > 1 && oif_check) {
-		if (mp_hash < 0)
-			mp_hash = get_hash_from_flowi4(fl4) >> 1;
+		int h = fib_multipath_hash(res->fi, fl4, NULL);
 
-		fib_select_multipath(res, mp_hash);
+		fib_select_multipath(res, h);
 	}
 	else
 #endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc310db2708b..46630bc30754 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	local_bh_enable();
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* Source and destination is swapped. See ip_multipath_icmp_hash */
-static int icmp_multipath_hash_skb(const struct sk_buff *skb)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-
-	return fib_multipath_hash(iph->daddr, iph->saddr);
-}
-
-#else
-
-#define icmp_multipath_hash_skb(skb) (-1)
-
-#endif
-
 static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-	rt = __ip_route_output_key_hash(net, fl4,
-					icmp_multipath_hash_skb(skb_in));
+	rt = __ip_route_output_key(net, fl4);
 	if (IS_ERR(rt))
 		return rt;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8471dd116771..cc774d927c5f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1734,45 +1734,54 @@ static int __mkroute_input(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* To make ICMP packets follow the right flow, the multipath hash is
- * calculated from the inner IP addresses in reverse order.
- */
-static int ip_multipath_icmp_hash(struct sk_buff *skb)
-{
-	const struct iphdr *outer_iph = ip_hdr(skb);
-	struct icmphdr _icmph;
-	const struct icmphdr *icmph;
-	struct iphdr _inner_iph;
-	const struct iphdr *inner_iph;
-
-	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
-		goto standard_hash;
-
-	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
-				   &_icmph);
-	if (!icmph)
-		goto standard_hash;
-
-	if (icmph->type != ICMP_DEST_UNREACH &&
-	    icmph->type != ICMP_REDIRECT &&
-	    icmph->type != ICMP_TIME_EXCEEDED &&
-	    icmph->type != ICMP_PARAMETERPROB) {
-		goto standard_hash;
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb)
+{
+	struct net *net = fi->fib_net;
+	struct flow_keys hash_keys;
+	u32 mhash;
+
+	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+	case 0:
+		if (skb && skb_get_hash_raw(skb) && !skb->l4_hash)
+			return skb_get_hash_raw(skb) >> 1;
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		hash_keys.addrs.v4addrs.src = fl4->saddr;
+		hash_keys.addrs.v4addrs.dst = fl4->daddr;
+		break;
+	case 1:
+		/* skb is currently provided only when forwarding */
+		if (skb) {
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			struct flow_keys keys;
+
+			/* short-circuit if we already have L4 hash present */
+			if (skb->l4_hash)
+				return skb_get_hash_raw(skb) >> 1;
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			skb_flow_dissect_flow_keys(skb, &keys, flag);
+			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+			hash_keys.ports.src = keys.ports.src;
+			hash_keys.ports.dst = keys.ports.dst;
+			hash_keys.basic.ip_proto = keys.basic.ip_proto;
+		} else {
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+			hash_keys.ports.src = fl4->fl4_sport;
+			hash_keys.ports.dst = fl4->fl4_dport;
+			hash_keys.basic.ip_proto = fl4->flowi4_proto;
+		}
+		break;
 	}
+	mhash = flow_hash_from_keys(&hash_keys);
 
-	inner_iph = skb_header_pointer(skb,
-				       outer_iph->ihl * 4 + sizeof(_icmph),
-				       sizeof(_inner_iph), &_inner_iph);
-	if (!inner_iph)
-		goto standard_hash;
-
-	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
-
-standard_hash:
-	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+	return mhash >> 1;
 }
-
+EXPORT_SYMBOL_GPL(fib_multipath_hash);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
 static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1791,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1) {
-		int h;
+		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
+		int h = fib_multipath_hash(res->fi, &fl4, skb);
 
-		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
-			h = ip_multipath_icmp_hash(skb);
-		else
-			h = fib_multipath_hash(saddr, daddr);
 		fib_select_multipath(res, h);
 	}
 #endif
@@ -2202,8 +2208,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  * Major route resolver routine.
  */
 
-struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-					  int mp_hash)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2365,7 +2370,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 		goto make_route;
 	}
 
-	fib_select_path(net, &res, fl4, mp_hash);
+	fib_select_path(net, &res, fl4);
 
 	dev_out = FIB_RES_DEV(res);
 	fl4->flowi4_oif = dev_out->ifindex;
@@ -2378,7 +2383,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 	rcu_read_unlock();
 	return rth;
 }
-EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
 {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d6880a6149ee..62c4f94923e5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "fib_multipath_hash_policy",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice
  2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
@ 2017-03-08 12:05   ` Jakub Sitnicki
  2017-03-08 12:43     ` Nikolay Aleksandrov
  2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
  2017-03-16 13:28   ` [PATCH net-next v4] " Nikolay Aleksandrov
  2 siblings, 1 reply; 36+ messages in thread
From: Jakub Sitnicki @ 2017-03-08 12:05 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: netdev, edumazet, davem, roopa, dsa

Hi Nikolay,

On Tue, Mar 07, 2017 at 11:01 AM GMT, Nikolay Aleksandrov wrote:
> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3 (default)
>  1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated. The ICMP inner IP addresses use
> is removed.
>
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> ---
> v2:
>  - removed the output_key_hash as it's not needed anymore
>  - reverted to my original/internal patch with L3 as default hash

What about ICMP PTB (Fragmentation Needed) forwarding that makes PMTUD
work with ECMP in setups like described in RFC7690 [1]?

  ptb -> router ecmp -> next hop L4/L7 load balancer -> destination

       router --> load balancer 1 --->
            \\--> load balancer 2 ---> load-balanced service
             \--> load balancer N --->

Removing special treatment of ICMP errors will break it, won't it?

FWIW, I gave a run to your patch (default settings, L3 hash) with a test
script [2] that simulates such a setup and it confirmed my worries - PTB
errors don't travel back to the source host any more.

-Jakub

[1] https://tools.ietf.org/html/rfc7690#section-2
[2] https://github.com/jsitnicki/tools/commit/ccb85e68421df4ffd8b7abf00f6f5fe1c6a5af76

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice
  2017-03-08 12:05   ` Jakub Sitnicki
@ 2017-03-08 12:43     ` Nikolay Aleksandrov
  2017-03-08 16:00       ` Jakub Sitnicki
  0 siblings, 1 reply; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-08 12:43 UTC (permalink / raw)
  To: Jakub Sitnicki; +Cc: netdev, edumazet, davem, roopa, dsa

On 08/03/17 14:05, Jakub Sitnicki wrote:
> Hi Nikolay,
> 
> On Tue, Mar 07, 2017 at 11:01 AM GMT, Nikolay Aleksandrov wrote:
>> This patch adds support for ECMP hash policy choice via a new sysctl
>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> The current values for fib_multipath_hash_policy are:
>>  0 - layer 3 (default)
>>  1 - layer 4
>> If there's an skb hash already set and it matches the chosen policy then it
>> will be used instead of being calculated. The ICMP inner IP addresses use
>> is removed.
>>
>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> ---
>> v2:
>>  - removed the output_key_hash as it's not needed anymore
>>  - reverted to my original/internal patch with L3 as default hash
> 
> What about ICMP PTB (Fragmentation Needed) forwarding that makes PMTUD
> work with ECMP in setups like described in RFC7690 [1]?
> 
>   ptb -> router ecmp -> next hop L4/L7 load balancer -> destination
> 
>        router --> load balancer 1 --->
>             \\--> load balancer 2 ---> load-balanced service
>              \--> load balancer N --->
> 
> Removing special treatment of ICMP errors will break it, won't it?
> 

Yes, I am aware and this decision was made with that in mind.
We'd like to use the HW hash when available and IIRC that doesn't play well with
special-casing ICMP errors for anycast as it may not match also. Another thing,
again if I remember correctly, was that this behaviour is closer to how hardware
handles ECMP.

One thing we can do is leave the current L3 behaviour with ICMP error handling
and add a new L3 mode that tries to use the skb hash when available and doesn't
care about the packet type.

What do you think ?

> FWIW, I gave a run to your patch (default settings, L3 hash) with a test
> script [2] that simulates such a setup and it confirmed my worries - PTB
> errors don't travel back to the source host any more.

Thanks for testing.

> 
> -Jakub
> 
> [1] https://tools.ietf.org/html/rfc7690#section-2
> [2] https://github.com/jsitnicki/tools/commit/ccb85e68421df4ffd8b7abf00f6f5fe1c6a5af76
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice
  2017-03-08 12:43     ` Nikolay Aleksandrov
@ 2017-03-08 16:00       ` Jakub Sitnicki
  2017-03-13  2:23         ` David Miller
  0 siblings, 1 reply; 36+ messages in thread
From: Jakub Sitnicki @ 2017-03-08 16:00 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: netdev, edumazet, davem, roopa, dsa

On Wed, Mar 08, 2017 at 12:43 PM GMT, Nikolay Aleksandrov wrote:
> On 08/03/17 14:05, Jakub Sitnicki wrote:
>> On Tue, Mar 07, 2017 at 11:01 AM GMT, Nikolay Aleksandrov wrote:
>>> This patch adds support for ECMP hash policy choice via a new sysctl
>>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>>> The current values for fib_multipath_hash_policy are:
>>>  0 - layer 3 (default)
>>>  1 - layer 4
>>> If there's an skb hash already set and it matches the chosen policy then it
>>> will be used instead of being calculated. The ICMP inner IP addresses use
>>> is removed.
>>>
>>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>>> ---
>>> v2:
>>>  - removed the output_key_hash as it's not needed anymore
>>>  - reverted to my original/internal patch with L3 as default hash
>> 
>> What about ICMP PTB (Fragmentation Needed) forwarding that makes PMTUD
>> work with ECMP in setups like described in RFC7690 [1]?
>> 
>>   ptb -> router ecmp -> next hop L4/L7 load balancer -> destination
>> 
>>        router --> load balancer 1 --->
>>             \\--> load balancer 2 ---> load-balanced service
>>              \--> load balancer N --->
>> 
>> Removing special treatment of ICMP errors will break it, won't it?
>> 
>
> Yes, I am aware and this decision was made with that in mind.
> We'd like to use the HW hash when available and IIRC that doesn't play well with
> special-casing ICMP errors for anycast as it may not match also. Another thing,
> again if I remember correctly, was that this behaviour is closer to how hardware
> handles ECMP.

OK, I wanted to make sure that is not an oversight that ECMP routing in
ipv4 stack is to be dumbed down to match the hardware behavior. I
thought that it was an advantage that we want to have over hardware
routers. (To be fair, I should mention that we don't have it in ipv6
stack ATM.)

>
> One thing we can do is leave the current L3 behaviour with ICMP error handling
> and add a new L3 mode that tries to use the skb hash when available and doesn't
> care about the packet type.
>
> What do you think ?

Sounds good to me. Would be good to hear other opinions also.

Thanks,
Jakub

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice
  2017-03-08 16:00       ` Jakub Sitnicki
@ 2017-03-13  2:23         ` David Miller
  0 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2017-03-13  2:23 UTC (permalink / raw)
  To: jkbs; +Cc: nikolay, netdev, edumazet, roopa, dsa

From: Jakub Sitnicki <jkbs@redhat.com>
Date: Wed, 08 Mar 2017 17:00:05 +0100

> On Wed, Mar 08, 2017 at 12:43 PM GMT, Nikolay Aleksandrov wrote:
>> On 08/03/17 14:05, Jakub Sitnicki wrote:
>>> On Tue, Mar 07, 2017 at 11:01 AM GMT, Nikolay Aleksandrov wrote:
>>>> This patch adds support for ECMP hash policy choice via a new sysctl
>>>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>>>> The current values for fib_multipath_hash_policy are:
>>>>  0 - layer 3 (default)
>>>>  1 - layer 4
>>>> If there's an skb hash already set and it matches the chosen policy then it
>>>> will be used instead of being calculated. The ICMP inner IP addresses use
>>>> is removed.
>>>>
>>>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>>>> ---
>>>> v2:
>>>>  - removed the output_key_hash as it's not needed anymore
>>>>  - reverted to my original/internal patch with L3 as default hash
>>> 
>>> What about ICMP PTB (Fragmentation Needed) forwarding that makes PMTUD
>>> work with ECMP in setups like described in RFC7690 [1]?
>>> 
>>>   ptb -> router ecmp -> next hop L4/L7 load balancer -> destination
>>> 
>>>        router --> load balancer 1 --->
>>>             \\--> load balancer 2 ---> load-balanced service
>>>              \--> load balancer N --->
>>> 
>>> Removing special treatment of ICMP errors will break it, won't it?
>>> 
>>
>> Yes, I am aware and this decision was made with that in mind.
>> We'd like to use the HW hash when available and IIRC that doesn't play well with
>> special-casing ICMP errors for anycast as it may not match also. Another thing,
>> again if I remember correctly, was that this behaviour is closer to how hardware
>> handles ECMP.
> 
> OK, I wanted to make sure that is not an oversight that ECMP routing in
> ipv4 stack is to be dumbed down to match the hardware behavior. I
> thought that it was an advantage that we want to have over hardware
> routers. (To be fair, I should mention that we don't have it in ipv6
> stack ATM.)
> 
>>
>> One thing we can do is leave the current L3 behaviour with ICMP error handling
>> and add a new L3 mode that tries to use the skb hash when available and doesn't
>> care about the packet type.
>>
>> What do you think ?
> 
> Sounds good to me. Would be good to hear other opinions also.

This would be so much less of an issue with symmetric hashing.  A quick glance
seems to indicate that Microsoft didn't specify the Toeplitz hash to order the
ports and the addresses so that it would be symmetric, so we can guess what
every piece of hardware out there computing a hash does :-/

We could solve this ICMP problem by using a symmetric hash (flow
dissector already supports this), but then we're back to the problem
that this behaves differently from card computed hashes and hardware
offload of ECMP.

I have to say that losing this ICMP handling makes the current code a
non-starter.  The existing code explicitly was written to handle this
case properly, so just undoing it and making it stop working is not
really something we can do.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
  2017-03-08 12:05   ` Jakub Sitnicki
@ 2017-03-14 15:36   ` Nikolay Aleksandrov
  2017-03-14 15:55     ` Stephen Hemminger
                       ` (2 more replies)
  2017-03-16 13:28   ` [PATCH net-next v4] " Nikolay Aleksandrov
  2 siblings, 3 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-14 15:36 UTC (permalink / raw)
  To: netdev; +Cc: davem, roopa, dsa, jkbs, edumazet, pch, Nikolay Aleksandrov

This patch adds support for ECMP hash policy choice via a new sysctl
called fib_multipath_hash_policy and also adds support for L4 hashes.
The current values for fib_multipath_hash_policy are:
 0 - layer 3 (default)
 1 - layer 4
If there's an skb hash already set and it matches the chosen policy then it
will be used instead of being calculated (currently only for L4).
In L3 mode we always calculate the hash due to the ICMP error special
case, the flow dissector's field consistentification should handle the
address order thus we can remove the address reversals.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
v3:
 - keep the ICMP error special handling and always calc L3 hash
   Jakub, could you please run your tests with this version ?

v2:
 - removed the output_key_hash as it's not needed anymore
 - reverted to my original/internal patch with L3 as default hash

 Documentation/networking/ip-sysctl.txt |  8 +++
 include/net/ip_fib.h                   | 14 ++----
 include/net/netns/ipv4.h               |  1 +
 include/net/route.h                    |  9 +---
 net/ipv4/fib_semantics.c               | 11 ++--
 net/ipv4/icmp.c                        | 19 +------
 net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
 net/ipv4/sysctl_net_ipv4.c             |  9 ++++
 8 files changed, 98 insertions(+), 65 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index fc73eeb7b3b8..558e19653106 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
 	0 - disabled
 	1 - enabled
 
+fib_multipath_hash_policy - INTEGER
+	Controls which hash policy to use for multipath routes. Only valid
+	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+	Default: 0 (Layer 3)
+	Possible values:
+	0 - Layer 3
+	1 - Layer 4
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index d9cee9659978..8c608a17bd9a 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
-extern u32 fib_multipath_secret __read_mostly;
-
-static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
-{
-	return jhash_2words((__force u32)saddr, (__force u32)daddr,
-			    fib_multipath_secret) >> 1;
-}
-
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb);
+#endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash);
+		     struct flowi4 *fl4);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 622d2da27135..70a1d4251790 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -152,6 +152,7 @@ struct netns_ipv4 {
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
+	int sysctl_fib_multipath_hash_policy;
 #endif
 
 	unsigned int	fib_seq;	/* protected by rtnl_mutex */
diff --git a/include/net/route.h b/include/net/route.h
index c0874c87c173..32412c91c222 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -113,14 +113,7 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
-					  int mp_hash);
-
-static inline struct rtable *__ip_route_output_key(struct net *net,
-						   struct flowi4 *flp)
-{
-	return __ip_route_output_key_hash(net, flp, -1);
-}
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 				    const struct sock *sk);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..6601bd9744c9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-u32 fib_multipath_secret __read_mostly;
 
 #define for_nexthops(fi) {						\
 	int nhsel; const struct fib_nh *nh;				\
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
 
 		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
-
-	net_get_random_once(&fib_multipath_secret,
-			    sizeof(fib_multipath_secret));
 }
 
 static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash)
+		     struct flowi4 *fl4)
 {
 	bool oif_check;
 
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi->fib_nhs > 1 && oif_check) {
-		if (mp_hash < 0)
-			mp_hash = get_hash_from_flowi4(fl4) >> 1;
+		int h = fib_multipath_hash(res->fi, fl4, NULL);
 
-		fib_select_multipath(res, mp_hash);
+		fib_select_multipath(res, h);
 	}
 	else
 #endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc310db2708b..46630bc30754 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	local_bh_enable();
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* Source and destination is swapped. See ip_multipath_icmp_hash */
-static int icmp_multipath_hash_skb(const struct sk_buff *skb)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-
-	return fib_multipath_hash(iph->daddr, iph->saddr);
-}
-
-#else
-
-#define icmp_multipath_hash_skb(skb) (-1)
-
-#endif
-
 static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-	rt = __ip_route_output_key_hash(net, fl4,
-					icmp_multipath_hash_skb(skb_in));
+	rt = __ip_route_output_key(net, fl4);
 	if (IS_ERR(rt))
 		return rt;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8471dd116771..3d7f99747285 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1734,11 +1734,11 @@ static int __mkroute_input(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 /* To make ICMP packets follow the right flow, the multipath hash is
- * calculated from the inner IP addresses in reverse order.
+ * calculated from the inner IP addresses.
  */
-static int ip_multipath_icmp_hash(struct sk_buff *skb)
+static void ip_multipath_icmp_hash(const struct sk_buff *skb,
+				   struct flowi4 *fl4)
 {
 	const struct iphdr *outer_iph = ip_hdr(skb);
 	struct icmphdr _icmph;
@@ -1746,33 +1746,85 @@ static int ip_multipath_icmp_hash(struct sk_buff *skb)
 	struct iphdr _inner_iph;
 	const struct iphdr *inner_iph;
 
+	fl4->saddr = outer_iph->saddr;
+	fl4->daddr = outer_iph->daddr;
 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
-		goto standard_hash;
+		return;
 
 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
 				   &_icmph);
 	if (!icmph)
-		goto standard_hash;
+		return;
 
 	if (icmph->type != ICMP_DEST_UNREACH &&
 	    icmph->type != ICMP_REDIRECT &&
 	    icmph->type != ICMP_TIME_EXCEEDED &&
-	    icmph->type != ICMP_PARAMETERPROB) {
-		goto standard_hash;
-	}
+	    icmph->type != ICMP_PARAMETERPROB)
+		return;
 
 	inner_iph = skb_header_pointer(skb,
 				       outer_iph->ihl * 4 + sizeof(_icmph),
 				       sizeof(_inner_iph), &_inner_iph);
 	if (!inner_iph)
-		goto standard_hash;
+		return;
+	fl4->saddr = inner_iph->saddr;
+	fl4->daddr = inner_iph->daddr;
+}
 
-	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb)
+{
+	struct net *net = fi->fib_net;
+	struct flow_keys hash_keys;
+	u32 mhash;
 
-standard_hash:
-	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
-}
+	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+	case 0:
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		if (skb && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			struct flowi4 _fl4;
 
+			ip_multipath_icmp_hash(skb, &_fl4);
+			hash_keys.addrs.v4addrs.src = _fl4.saddr;
+			hash_keys.addrs.v4addrs.dst = _fl4.daddr;
+		} else {
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+		}
+		break;
+	case 1:
+		/* skb is currently provided only when forwarding */
+		if (skb) {
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			struct flow_keys keys;
+
+			/* short-circuit if we already have L4 hash present */
+			if (skb->l4_hash)
+				return skb_get_hash_raw(skb) >> 1;
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			skb_flow_dissect_flow_keys(skb, &keys, flag);
+			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+			hash_keys.ports.src = keys.ports.src;
+			hash_keys.ports.dst = keys.ports.dst;
+			hash_keys.basic.ip_proto = keys.basic.ip_proto;
+		} else {
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+			hash_keys.ports.src = fl4->fl4_sport;
+			hash_keys.ports.dst = fl4->fl4_dport;
+			hash_keys.basic.ip_proto = fl4->flowi4_proto;
+		}
+		break;
+	}
+	mhash = flow_hash_from_keys(&hash_keys);
+
+	return mhash >> 1;
+}
+EXPORT_SYMBOL_GPL(fib_multipath_hash);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
 static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1834,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1) {
-		int h;
+		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
+		int h = fib_multipath_hash(res->fi, &fl4, skb);
 
-		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
-			h = ip_multipath_icmp_hash(skb);
-		else
-			h = fib_multipath_hash(saddr, daddr);
 		fib_select_multipath(res, h);
 	}
 #endif
@@ -2202,8 +2251,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  * Major route resolver routine.
  */
 
-struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-					  int mp_hash)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 		goto make_route;
 	}
 
-	fib_select_path(net, &res, fl4, mp_hash);
+	fib_select_path(net, &res, fl4);
 
 	dev_out = FIB_RES_DEV(res);
 	fl4->flowi4_oif = dev_out->ifindex;
@@ -2378,7 +2426,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 	rcu_read_unlock();
 	return rth;
 }
-EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
 {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d6880a6149ee..62c4f94923e5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "fib_multipath_hash_policy",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
@ 2017-03-14 15:55     ` Stephen Hemminger
  2017-03-14 15:58       ` Nikolay Aleksandrov
  2017-03-14 18:55     ` Nikolay Aleksandrov
  2017-03-15 11:32     ` Jakub Sitnicki
  2 siblings, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2017-03-14 15:55 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: netdev, davem, roopa, dsa, jkbs, edumazet, pch

On Tue, 14 Mar 2017 17:36:15 +0200
Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:

> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3 (default)
>  1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated (currently only for L4).
> In L3 mode we always calculate the hash due to the ICMP error special
> case, the flow dissector's field consistentification should handle the
> address order thus we can remove the address reversals.
> 
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

It is good to see ECMP come back from the grave.
Linux used to support it long ago but was abandoned after it was unstable
and removed from iproute2 in 2012.

The old API was through route attributes which makes more sense than
doing it with sysctl. It makes more sense to use netlink instead.
Therefore please go back and do something like the old API rather than doing it through
sysctl.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 15:55     ` Stephen Hemminger
@ 2017-03-14 15:58       ` Nikolay Aleksandrov
  2017-03-14 18:48         ` David Miller
  0 siblings, 1 reply; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-14 15:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, davem, roopa, dsa, jkbs, edumazet, pch

On 14/03/17 17:55, Stephen Hemminger wrote:
> On Tue, 14 Mar 2017 17:36:15 +0200
> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
> 
>> This patch adds support for ECMP hash policy choice via a new sysctl
>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> The current values for fib_multipath_hash_policy are:
>>  0 - layer 3 (default)
>>  1 - layer 4
>> If there's an skb hash already set and it matches the chosen policy then it
>> will be used instead of being calculated (currently only for L4).
>> In L3 mode we always calculate the hash due to the ICMP error special
>> case, the flow dissector's field consistentification should handle the
>> address order thus we can remove the address reversals.
>>
>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> 
> It is good to see ECMP come back from the grave.
> Linux used to support it long ago but was abandoned after it was unstable
> and removed from iproute2 in 2012.
> 
> The old API was through route attributes which makes more sense than
> doing it with sysctl. It makes more sense to use netlink instead.
> Therefore please go back and do something like the old API rather than doing it through
> sysctl.
> 

That's what my initial version did, but this was discussed during NetConf in Seville
and it was decided that it's best to make a global sysctl, thus the change.

Thanks,
 Nik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 15:58       ` Nikolay Aleksandrov
@ 2017-03-14 18:48         ` David Miller
  2017-03-14 20:25           ` Stephen Hemminger
  0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2017-03-14 18:48 UTC (permalink / raw)
  To: nikolay; +Cc: stephen, netdev, roopa, dsa, jkbs, edumazet, pch

From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 14 Mar 2017 17:58:46 +0200

> On 14/03/17 17:55, Stephen Hemminger wrote:
>> On Tue, 14 Mar 2017 17:36:15 +0200
>> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
>> 
>>> This patch adds support for ECMP hash policy choice via a new sysctl
>>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>>> The current values for fib_multipath_hash_policy are:
>>>  0 - layer 3 (default)
>>>  1 - layer 4
>>> If there's an skb hash already set and it matches the chosen policy then it
>>> will be used instead of being calculated (currently only for L4).
>>> In L3 mode we always calculate the hash due to the ICMP error special
>>> case, the flow dissector's field consistentification should handle the
>>> address order thus we can remove the address reversals.
>>>
>>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> 
>> It is good to see ECMP come back from the grave.
>> Linux used to support it long ago but was abandoned after it was unstable
>> and removed from iproute2 in 2012.
>> 
>> The old API was through route attributes which makes more sense than
>> doing it with sysctl. It makes more sense to use netlink instead.
>> Therefore please go back and do something like the old API rather than doing it through
>> sysctl.
>> 
> 
> That's what my initial version did, but this was discussed during NetConf in Seville
> and it was decided that it's best to make a global sysctl, thus the change.

Correct, we discussed this, and we all agreed to only have a sysctl for now.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
  2017-03-14 15:55     ` Stephen Hemminger
@ 2017-03-14 18:55     ` Nikolay Aleksandrov
  2017-03-15 11:32     ` Jakub Sitnicki
  2 siblings, 0 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-14 18:55 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Roopa Prabhu, David Ahern, jkbs, edumazet, pch


> On Mar 14, 2017, at 5:36 PM, Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
> 
> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
> 0 - layer 3 (default)
> 1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated (currently only for L4).
> In L3 mode we always calculate the hash due to the ICMP error special
> case, the flow dissector's field consistentification should handle the
> address order thus we can remove the address reversals.
> 
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> ---
> v3:
> - keep the ICMP error special handling and always calc L3 hash
>   Jakub, could you please run your tests with this version ?
> 
> v2:
> - removed the output_key_hash as it's not needed anymore
> - reverted to my original/internal patch with L3 as default hash
> 
> Documentation/networking/ip-sysctl.txt |  8 +++
> include/net/ip_fib.h                   | 14 ++----
> include/net/netns/ipv4.h               |  1 +
> include/net/route.h                    |  9 +---
> net/ipv4/fib_semantics.c               | 11 ++--
> net/ipv4/icmp.c                        | 19 +------
> net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
> net/ipv4/sysctl_net_ipv4.c             |  9 ++++
> 8 files changed, 98 insertions(+), 65 deletions(-)
> 
[snip]
> /* To make ICMP packets follow the right flow, the multipath hash is
> - * calculated from the inner IP addresses in reverse order.
> + * calculated from the inner IP addresses.
>  */
> -static int ip_multipath_icmp_hash(struct sk_buff *skb)
> +static void ip_multipath_icmp_hash(const struct sk_buff *skb,
> +				   struct flowi4 *fl4)
> {
> 	const struct iphdr *outer_iph = ip_hdr(skb);
> 	struct icmphdr _icmph;
> @@ -1746,33 +1746,85 @@ static int ip_multipath_icmp_hash(struct sk_buff *skb)
> 	struct iphdr _inner_iph;
> 	const struct iphdr *inner_iph;
> 
> +	fl4->saddr = outer_iph->saddr;
> +	fl4->daddr = outer_iph->daddr;
> 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
> -		goto standard_hash;
> +		return;
> 
> 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
> 				   &_icmph);
> 	if (!icmph)
> -		goto standard_hash;
> +		return;
> 
> 	if (icmph->type != ICMP_DEST_UNREACH &&
> 	    icmph->type != ICMP_REDIRECT &&
> 	    icmph->type != ICMP_TIME_EXCEEDED &&
> -	    icmph->type != ICMP_PARAMETERPROB) {
> -		goto standard_hash;
> -	}
> +	    icmph->type != ICMP_PARAMETERPROB)
> +		return;
> 
> 	inner_iph = skb_header_pointer(skb,
> 				       outer_iph->ihl * 4 + sizeof(_icmph),
> 				       sizeof(_inner_iph), &_inner_iph);
> 	if (!inner_iph)
> -		goto standard_hash;
> +		return;
> +	fl4->saddr = inner_iph->saddr;
> +	fl4->daddr = inner_iph->daddr;
> +}
> 
> -	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
> +		       const struct sk_buff *skb)
> +{
> +	struct net *net = fi->fib_net;
> +	struct flow_keys hash_keys;
> +	u32 mhash;
> 
> -standard_hash:
> -	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
> -}
> +	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
> +	case 0:
> +		memset(&hash_keys, 0, sizeof(hash_keys));
> +		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +		if (skb && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
> +			struct flowi4 _fl4;
> 
> +			ip_multipath_icmp_hash(skb, &_fl4);

Ugh, obviously I could’ve just passed hash_keys here, will  change this in v4 but I’ll wait
to see if there aren’t any other comments or issues.

Thanks,
 Nik

> +			hash_keys.addrs.v4addrs.src = _fl4.saddr;
> +			hash_keys.addrs.v4addrs.dst = _fl4.daddr;
> +		} else {
> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
> +		}
> +		break;
> +	case 1:
> +		/* skb is currently provided only when forwarding */
> +		if (skb) {
> +			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
> +			struct flow_keys keys;
> +
> +			/* short-circuit if we already have L4 hash present */
> +			if (skb->l4_hash)
> +				return skb_get_hash_raw(skb) >> 1;
> +			memset(&hash_keys, 0, sizeof(hash_keys));
> +			skb_flow_dissect_flow_keys(skb, &keys, flag);
> +			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
> +			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
> +			hash_keys.ports.src = keys.ports.src;
> +			hash_keys.ports.dst = keys.ports.dst;
> +			hash_keys.basic.ip_proto = keys.basic.ip_proto;
> +		} else {
> +			memset(&hash_keys, 0, sizeof(hash_keys));
> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
> +			hash_keys.ports.src = fl4->fl4_sport;
> +			hash_keys.ports.dst = fl4->fl4_dport;
> +			hash_keys.basic.ip_proto = fl4->flowi4_proto;
> +		}
> +		break;
> +	}
> +	mhash = flow_hash_from_keys(&hash_keys);
> +
> +	return mhash >> 1;
> +}
> +EXPORT_SYMBOL_GPL(fib_multipath_hash);
> #endif /* CONFIG_IP_ROUTE_MULTIPATH */

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 18:48         ` David Miller
@ 2017-03-14 20:25           ` Stephen Hemminger
  2017-03-14 21:10             ` Roopa Prabhu
  2017-03-15  0:24             ` David Miller
  0 siblings, 2 replies; 36+ messages in thread
From: Stephen Hemminger @ 2017-03-14 20:25 UTC (permalink / raw)
  To: David Miller; +Cc: nikolay, netdev, roopa, dsa, jkbs, edumazet, pch

On Tue, 14 Mar 2017 11:48:37 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> Date: Tue, 14 Mar 2017 17:58:46 +0200
> 
> > On 14/03/17 17:55, Stephen Hemminger wrote:  
> >> On Tue, 14 Mar 2017 17:36:15 +0200
> >> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
> >>   
> >>> This patch adds support for ECMP hash policy choice via a new sysctl
> >>> called fib_multipath_hash_policy and also adds support for L4 hashes.
> >>> The current values for fib_multipath_hash_policy are:
> >>>  0 - layer 3 (default)
> >>>  1 - layer 4
> >>> If there's an skb hash already set and it matches the chosen policy then it
> >>> will be used instead of being calculated (currently only for L4).
> >>> In L3 mode we always calculate the hash due to the ICMP error special
> >>> case, the flow dissector's field consistentification should handle the
> >>> address order thus we can remove the address reversals.
> >>>
> >>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>  
> >> 
> >> It is good to see ECMP come back from the grave.
> >> Linux used to support it long ago but was abandoned after it was unstable
> >> and removed from iproute2 in 2012.
> >> 
> >> The old API was through route attributes which makes more sense than
> >> doing it with sysctl. It makes more sense to use netlink instead.
> >> Therefore please go back and do something like the old API rather than doing it through
> >> sysctl.
> >>   
> > 
> > That's what my initial version did, but this was discussed during NetConf in Seville
> > and it was decided that it's best to make a global sysctl, thus the change.  
> 
> Correct, we discussed this, and we all agreed to only have a sysctl for now.

Why? If you are going to have private discussions please post the rationale
in public.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 20:25           ` Stephen Hemminger
@ 2017-03-14 21:10             ` Roopa Prabhu
  2017-03-14 21:42               ` Stephen Hemminger
  2017-03-15  0:24             ` David Miller
  1 sibling, 1 reply; 36+ messages in thread
From: Roopa Prabhu @ 2017-03-14 21:10 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Nikolay Aleksandrov, netdev, David Ahern, jkbs,
	Eric Dumazet, Peter Christensen

On Tue, Mar 14, 2017 at 1:25 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 14 Mar 2017 11:48:37 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
>
>> From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> Date: Tue, 14 Mar 2017 17:58:46 +0200
>>
>> > On 14/03/17 17:55, Stephen Hemminger wrote:
>> >> On Tue, 14 Mar 2017 17:36:15 +0200
>> >> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
>> >>
>> >>> This patch adds support for ECMP hash policy choice via a new sysctl
>> >>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> >>> The current values for fib_multipath_hash_policy are:
>> >>>  0 - layer 3 (default)
>> >>>  1 - layer 4
>> >>> If there's an skb hash already set and it matches the chosen policy then it
>> >>> will be used instead of being calculated (currently only for L4).
>> >>> In L3 mode we always calculate the hash due to the ICMP error special
>> >>> case, the flow dissector's field consistentification should handle the
>> >>> address order thus we can remove the address reversals.
>> >>>
>> >>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> >>
>> >> It is good to see ECMP come back from the grave.
>> >> Linux used to support it long ago but was abandoned after it was unstable
>> >> and removed from iproute2 in 2012.
>> >>
>> >> The old API was through route attributes which makes more sense than
>> >> doing it with sysctl. It makes more sense to use netlink instead.
>> >> Therefore please go back and do something like the old API rather than doing it through
>> >> sysctl.
>> >>
>> >
>> > That's what my initial version did, but this was discussed during NetConf in Seville
>> > and it was decided that it's best to make a global sysctl, thus the change.
>>
>> Correct, we discussed this, and we all agreed to only have a sysctl for now.
>
> Why? If you are going to have private discussions please post the rationale
> in public.

Stephen, is there any reason to have a per ecmp route multipath algo
selection ?.
All platforms have a global multipath selection algo. I also don't see
routing daemons ready or willing to specify a per ecmp route multipath
selection algo attribute.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 21:10             ` Roopa Prabhu
@ 2017-03-14 21:42               ` Stephen Hemminger
  2017-03-14 22:38                 ` Roopa Prabhu
  0 siblings, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2017-03-14 21:42 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: David Miller, Nikolay Aleksandrov, netdev, David Ahern, jkbs,
	Eric Dumazet, Peter Christensen

On Tue, 14 Mar 2017 14:10:22 -0700
Roopa Prabhu <roopa@cumulusnetworks.com> wrote:

> On Tue, Mar 14, 2017 at 1:25 PM, Stephen Hemminger
> <stephen@networkplumber.org> wrote:
> > On Tue, 14 Mar 2017 11:48:37 -0700 (PDT)
> > David Miller <davem@davemloft.net> wrote:
> >  
> >> From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> >> Date: Tue, 14 Mar 2017 17:58:46 +0200
> >>  
> >> > On 14/03/17 17:55, Stephen Hemminger wrote:  
> >> >> On Tue, 14 Mar 2017 17:36:15 +0200
> >> >> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
> >> >>  
> >> >>> This patch adds support for ECMP hash policy choice via a new sysctl
> >> >>> called fib_multipath_hash_policy and also adds support for L4 hashes.
> >> >>> The current values for fib_multipath_hash_policy are:
> >> >>>  0 - layer 3 (default)
> >> >>>  1 - layer 4
> >> >>> If there's an skb hash already set and it matches the chosen policy then it
> >> >>> will be used instead of being calculated (currently only for L4).
> >> >>> In L3 mode we always calculate the hash due to the ICMP error special
> >> >>> case, the flow dissector's field consistentification should handle the
> >> >>> address order thus we can remove the address reversals.
> >> >>>
> >> >>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>  
> >> >>
> >> >> It is good to see ECMP come back from the grave.
> >> >> Linux used to support it long ago but was abandoned after it was unstable
> >> >> and removed from iproute2 in 2012.
> >> >>
> >> >> The old API was through route attributes which makes more sense than
> >> >> doing it with sysctl. It makes more sense to use netlink instead.
> >> >> Therefore please go back and do something like the old API rather than doing it through
> >> >> sysctl.
> >> >>  
> >> >
> >> > That's what my initial version did, but this was discussed during NetConf in Seville
> >> > and it was decided that it's best to make a global sysctl, thus the change.  
> >>
> >> Correct, we discussed this, and we all agreed to only have a sysctl for now.  
> >
> > Why? If you are going to have private discussions please post the rationale
> > in public.  
> 
> Stephen, is there any reason to have a per ecmp route multipath algo
> selection ?.
> All platforms have a global multipath selection algo. I also don't see
> routing daemons ready or willing to specify a per ecmp route multipath
> selection algo attribute.

There is no compelling reason to make the attribute per route. But the
issue is more that configuration through sysctl's is problematic. It doesn't
fit into the standard API paradigm. Sysctl's are like routing patches not
part of the real CLI. Trying to trap sysctl's for things like switchedev
offload is particularly problematic. I can see the case for either way,
and don't have a fixed opinion.

The bigger discussion is trying to keep a record of the rationale for decisions
such that there isn't buried tribal knowledge. This is why Dave has always been
quite insistent on having discussions on the mailing list. There doesn't seem to
be a good long term record other than Documentation/networking or commit logs.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 21:42               ` Stephen Hemminger
@ 2017-03-14 22:38                 ` Roopa Prabhu
  2017-03-14 23:27                   ` Stephen Hemminger
  0 siblings, 1 reply; 36+ messages in thread
From: Roopa Prabhu @ 2017-03-14 22:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Nikolay Aleksandrov, netdev, David Ahern, jkbs,
	Eric Dumazet, Peter Christensen

On Tue, Mar 14, 2017 at 2:42 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 14 Mar 2017 14:10:22 -0700
> Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>
>> On Tue, Mar 14, 2017 at 1:25 PM, Stephen Hemminger
>> <stephen@networkplumber.org> wrote:
>> > On Tue, 14 Mar 2017 11:48:37 -0700 (PDT)
>> > David Miller <davem@davemloft.net> wrote:
>> >
>> >> From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

[snip]


>> >> > That's what my initial version did, but this was discussed during NetConf in Seville
>> >> > and it was decided that it's best to make a global sysctl, thus the change.
>> >>
>> >> Correct, we discussed this, and we all agreed to only have a sysctl for now.
>> >
>> > Why? If you are going to have private discussions please post the rationale
>> > in public.
>>
>> Stephen, is there any reason to have a per ecmp route multipath algo
>> selection ?.
>> All platforms have a global multipath selection algo. I also don't see
>> routing daemons ready or willing to specify a per ecmp route multipath
>> selection algo attribute.
>
> There is no compelling reason to make the attribute per route. But the
> issue is more that configuration through sysctl's is problematic. It doesn't
> fit into the standard API paradigm. Sysctl's are like routing patches not
> part of the real CLI. Trying to trap sysctl's for things like switchedev
> offload is particularly problematic. I can see the case for either way,
> and don't have a fixed opinion.

ok. understand the switchdev offload part. It was that way in the past...but
today you can listen to sysctl updates on the netconf netlink channel.
it works pretty well.

>
> The bigger discussion is trying to keep a record of the rationale for decisions
> such that there isn't buried tribal knowledge. This is why Dave has always been
> quite insistent on having discussions on the mailing list. There doesn't seem to
> be a good long term record other than Documentation/networking or commit logs.
>

agree. Most of the discussion around this has happened on the netdev
mailing list so far.
The previous set that updated ecmp algo tried to add a per route
attribute and after review on this list was moved to
on by default. Nikolay's first version included a per route
attribute...to follow the previous ecmp work.
Before Nikolay posted the second version, my feedback was to make it
global. And this feedback was only re-iterated at
last netconf/netdev. so, we can continue the discussion here if there
are other opinions.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 22:38                 ` Roopa Prabhu
@ 2017-03-14 23:27                   ` Stephen Hemminger
  2017-03-14 23:45                     ` David Ahern
  0 siblings, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2017-03-14 23:27 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: David Miller, Nikolay Aleksandrov, netdev, David Ahern, jkbs,
	Eric Dumazet, Peter Christensen

On Tue, 14 Mar 2017 15:38:40 -0700
Roopa Prabhu <roopa@cumulusnetworks.com> wrote:

> >> >> > That's what my initial version did, but this was discussed during NetConf in Seville
> >> >> > and it was decided that it's best to make a global sysctl, thus the change.  
> >> >>
> >> >> Correct, we discussed this, and we all agreed to only have a sysctl for now.  
> >> >
> >> > Why? If you are going to have private discussions please post the rationale
> >> > in public.  
> >>
> >> Stephen, is there any reason to have a per ecmp route multipath algo
> >> selection ?.
> >> All platforms have a global multipath selection algo. I also don't see
> >> routing daemons ready or willing to specify a per ecmp route multipath
> >> selection algo attribute.  
> >
> > There is no compelling reason to make the attribute per route. But the
> > issue is more that configuration through sysctl's is problematic. It doesn't
> > fit into the standard API paradigm. Sysctl's are like routing patches not
> > part of the real CLI. Trying to trap sysctl's for things like switchedev
> > offload is particularly problematic. I can see the case for either way,
> > and don't have a fixed opinion.  
> 
> ok. understand the switchdev offload part. It was that way in the past...but
> today you can listen to sysctl updates on the netconf netlink channel.
> it works pretty well.

Is there another patch to add the NETCONFA_ECMP support?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 23:27                   ` Stephen Hemminger
@ 2017-03-14 23:45                     ` David Ahern
  2017-03-15  9:17                       ` Nicolas Dichtel
  0 siblings, 1 reply; 36+ messages in thread
From: David Ahern @ 2017-03-14 23:45 UTC (permalink / raw)
  To: Stephen Hemminger, Roopa Prabhu
  Cc: David Miller, Nikolay Aleksandrov, netdev, jkbs, Eric Dumazet,
	Peter Christensen

On 3/14/17 5:27 PM, Stephen Hemminger wrote:
> On Tue, 14 Mar 2017 15:38:40 -0700
> Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
> 
>>>>>>> That's what my initial version did, but this was discussed during NetConf in Seville
>>>>>>> and it was decided that it's best to make a global sysctl, thus the change.  
>>>>>>
>>>>>> Correct, we discussed this, and we all agreed to only have a sysctl for now.  
>>>>>
>>>>> Why? If you are going to have private discussions please post the rationale
>>>>> in public.  
>>>>
>>>> Stephen, is there any reason to have a per ecmp route multipath algo
>>>> selection ?.
>>>> All platforms have a global multipath selection algo. I also don't see
>>>> routing daemons ready or willing to specify a per ecmp route multipath
>>>> selection algo attribute.  
>>>
>>> There is no compelling reason to make the attribute per route. But the
>>> issue is more that configuration through sysctl's is problematic. It doesn't
>>> fit into the standard API paradigm. Sysctl's are like routing patches not
>>> part of the real CLI. Trying to trap sysctl's for things like switchedev
>>> offload is particularly problematic. I can see the case for either way,
>>> and don't have a fixed opinion.  
>>
>> ok. understand the switchdev offload part. It was that way in the past...but
>> today you can listen to sysctl updates on the netconf netlink channel.
>> it works pretty well.
> 
> Is there another patch to add the NETCONFA_ECMP support?
> 

does userspace care?

switchdev uses notifiers for in kernel notification of FIB changes
(routes and rules).

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 20:25           ` Stephen Hemminger
  2017-03-14 21:10             ` Roopa Prabhu
@ 2017-03-15  0:24             ` David Miller
  2017-03-15  2:30               ` Tom Herbert
  1 sibling, 1 reply; 36+ messages in thread
From: David Miller @ 2017-03-15  0:24 UTC (permalink / raw)
  To: stephen; +Cc: nikolay, netdev, roopa, dsa, jkbs, edumazet, pch

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 14 Mar 2017 13:25:06 -0700

> On Tue, 14 Mar 2017 11:48:37 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
>> From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> Date: Tue, 14 Mar 2017 17:58:46 +0200
>> 
>> > On 14/03/17 17:55, Stephen Hemminger wrote:  
>> >> On Tue, 14 Mar 2017 17:36:15 +0200
>> >> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
>> >>   
>> >>> This patch adds support for ECMP hash policy choice via a new sysctl
>> >>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> >>> The current values for fib_multipath_hash_policy are:
>> >>>  0 - layer 3 (default)
>> >>>  1 - layer 4
>> >>> If there's an skb hash already set and it matches the chosen policy then it
>> >>> will be used instead of being calculated (currently only for L4).
>> >>> In L3 mode we always calculate the hash due to the ICMP error special
>> >>> case, the flow dissector's field consistentification should handle the
>> >>> address order thus we can remove the address reversals.
>> >>>
>> >>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>  
>> >> 
>> >> It is good to see ECMP come back from the grave.
>> >> Linux used to support it long ago but was abandoned after it was unstable
>> >> and removed from iproute2 in 2012.
>> >> 
>> >> The old API was through route attributes which makes more sense than
>> >> doing it with sysctl. It makes more sense to use netlink instead.
>> >> Therefore please go back and do something like the old API rather than doing it through
>> >> sysctl.
>> >>   
>> > 
>> > That's what my initial version did, but this was discussed during NetConf in Seville
>> > and it was decided that it's best to make a global sysctl, thus the change.  
>> 
>> Correct, we discussed this, and we all agreed to only have a sysctl for now.
> 
> Why? If you are going to have private discussions please post the rationale
> in public.

The idea is that we couldn't come up with an immediate use case, and if one
came up we could easily add the per-route or per-fib-table attribute.

Most people want the entire system to behave a certain way wrt. ECMP, rather
than have fine granularity.  For example, the case being discussed here is
to simply have software's behavior match that of hardware offloads.

We shouldn't add things until there is a real demonstrated and requested need.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15  0:24             ` David Miller
@ 2017-03-15  2:30               ` Tom Herbert
  2017-03-17  3:36                 ` David Miller
  0 siblings, 1 reply; 36+ messages in thread
From: Tom Herbert @ 2017-03-15  2:30 UTC (permalink / raw)
  To: David Miller
  Cc: Stephen Hemminger, Nikolay Aleksandrov,
	Linux Kernel Network Developers, roopa, David Ahern,
	Jakub Sitnicki, Eric Dumazet, Peter Christensen

On Tue, Mar 14, 2017 at 5:24 PM, David Miller <davem@davemloft.net> wrote:
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Tue, 14 Mar 2017 13:25:06 -0700
>
>> On Tue, 14 Mar 2017 11:48:37 -0700 (PDT)
>> David Miller <davem@davemloft.net> wrote:
>>
>>> From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>>> Date: Tue, 14 Mar 2017 17:58:46 +0200
>>>
>>> > On 14/03/17 17:55, Stephen Hemminger wrote:
>>> >> On Tue, 14 Mar 2017 17:36:15 +0200
>>> >> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
>>> >>
>>> >>> This patch adds support for ECMP hash policy choice via a new sysctl
>>> >>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>>> >>> The current values for fib_multipath_hash_policy are:
>>> >>>  0 - layer 3 (default)
>>> >>>  1 - layer 4
>>> >>> If there's an skb hash already set and it matches the chosen policy then it
>>> >>> will be used instead of being calculated (currently only for L4).
>>> >>> In L3 mode we always calculate the hash due to the ICMP error special
>>> >>> case, the flow dissector's field consistentification should handle the
>>> >>> address order thus we can remove the address reversals.
>>> >>>
>>> >>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>>> >>
>>> >> It is good to see ECMP come back from the grave.
>>> >> Linux used to support it long ago but was abandoned after it was unstable
>>> >> and removed from iproute2 in 2012.
>>> >>
>>> >> The old API was through route attributes which makes more sense than
>>> >> doing it with sysctl. It makes more sense to use netlink instead.
>>> >> Therefore please go back and do something like the old API rather than doing it through
>>> >> sysctl.
>>> >>
>>> >
>>> > That's what my initial version did, but this was discussed during NetConf in Seville
>>> > and it was decided that it's best to make a global sysctl, thus the change.
>>>
>>> Correct, we discussed this, and we all agreed to only have a sysctl for now.
>>
>> Why? If you are going to have private discussions please post the rationale
>> in public.
>
> The idea is that we couldn't come up with an immediate use case, and if one
> came up we could easily add the per-route or per-fib-table attribute.
>
> Most people want the entire system to behave a certain way wrt. ECMP, rather
> than have fine granularity.  For example, the case being discussed here is
> to simply have software's behavior match that of hardware offloads.
>
Agreed, but then why do we even need any complexity here by that
argument? RSS is specifically defined to do 5-tuple hashing for TCP
(and UDP), and 3-tuple. No one has ever complained that doing per flow
RSS for TCP is bad thing AFAIK. We followed that same model for RPS,
RFS, and XPS via state in the connection context. The skb_hash is
often given to us for free, whereas in order to do a 3-tuple we have
to actually do more work and do at least an extra jhash. I suppose the
argument is probably that switches allow this configuration and
somehow we want to have feature parity, but it would be very
interesting to know if anyone is not doing per flow ECMP in real life
and why...

Tom

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 23:45                     ` David Ahern
@ 2017-03-15  9:17                       ` Nicolas Dichtel
  2017-03-15 10:46                         ` Nikolay Aleksandrov
  2017-03-15 15:01                         ` David Ahern
  0 siblings, 2 replies; 36+ messages in thread
From: Nicolas Dichtel @ 2017-03-15  9:17 UTC (permalink / raw)
  To: David Ahern, Stephen Hemminger, Roopa Prabhu
  Cc: David Miller, Nikolay Aleksandrov, netdev, jkbs, Eric Dumazet,
	Peter Christensen

Le 15/03/2017 à 00:45, David Ahern a écrit :
> On 3/14/17 5:27 PM, Stephen Hemminger wrote:
>> On Tue, 14 Mar 2017 15:38:40 -0700
>> Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>>
>>>>>>>> That's what my initial version did, but this was discussed during NetConf in Seville
>>>>>>>> and it was decided that it's best to make a global sysctl, thus the change.  
>>>>>>>
>>>>>>> Correct, we discussed this, and we all agreed to only have a sysctl for now.  
>>>>>>
>>>>>> Why? If you are going to have private discussions please post the rationale
>>>>>> in public.  
>>>>>
>>>>> Stephen, is there any reason to have a per ecmp route multipath algo
>>>>> selection ?.
>>>>> All platforms have a global multipath selection algo. I also don't see
>>>>> routing daemons ready or willing to specify a per ecmp route multipath
>>>>> selection algo attribute.  
>>>>
>>>> There is no compelling reason to make the attribute per route. But the
>>>> issue is more that configuration through sysctl's is problematic. It doesn't
>>>> fit into the standard API paradigm. Sysctl's are like routing patches not
>>>> part of the real CLI. Trying to trap sysctl's for things like switchedev
>>>> offload is particularly problematic. I can see the case for either way,
>>>> and don't have a fixed opinion.  
>>>
>>> ok. understand the switchdev offload part. It was that way in the past...but
>>> today you can listen to sysctl updates on the netconf netlink channel.
>>> it works pretty well.
>>
>> Is there another patch to add the NETCONFA_ECMP support?
>>
> 
> does userspace care?
Yes, I think it is needed so that userspace can correctly monitor this behavior.
It also enables to check this parameter through netlink.


Regards,
Nicolas

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15  9:17                       ` Nicolas Dichtel
@ 2017-03-15 10:46                         ` Nikolay Aleksandrov
  2017-03-15 11:18                           ` Nicolas Dichtel
  2017-03-15 15:01                         ` David Ahern
  1 sibling, 1 reply; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-15 10:46 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: David Ahern, Stephen Hemminger, Roopa Prabhu, David Miller,
	netdev, jkbs, Eric Dumazet, Peter Christensen


> On Mar 15, 2017, at 11:17 AM, Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:
> 
> Le 15/03/2017 à 00:45, David Ahern a écrit :
>> On 3/14/17 5:27 PM, Stephen Hemminger wrote:
>>> On Tue, 14 Mar 2017 15:38:40 -0700
>>> Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>>> 
>>>>>>>>> That's what my initial version did, but this was discussed during NetConf in Seville
>>>>>>>>> and it was decided that it's best to make a global sysctl, thus the change.  
>>>>>>>> 
>>>>>>>> Correct, we discussed this, and we all agreed to only have a sysctl for now.  
>>>>>>> 
>>>>>>> Why? If you are going to have private discussions please post the rationale
>>>>>>> in public.  
>>>>>> 
>>>>>> Stephen, is there any reason to have a per ecmp route multipath algo
>>>>>> selection ?.
>>>>>> All platforms have a global multipath selection algo. I also don't see
>>>>>> routing daemons ready or willing to specify a per ecmp route multipath
>>>>>> selection algo attribute.  
>>>>> 
>>>>> There is no compelling reason to make the attribute per route. But the
>>>>> issue is more that configuration through sysctl's is problematic. It doesn't
>>>>> fit into the standard API paradigm. Sysctl's are like routing patches not
>>>>> part of the real CLI. Trying to trap sysctl's for things like switchedev
>>>>> offload is particularly problematic. I can see the case for either way,
>>>>> and don't have a fixed opinion.  
>>>> 
>>>> ok. understand the switchdev offload part. It was that way in the past...but
>>>> today you can listen to sysctl updates on the netconf netlink channel.
>>>> it works pretty well.
>>> 
>>> Is there another patch to add the NETCONFA_ECMP support?
>>> 
>> 
>> does userspace care?
> Yes, I think it is needed so that userspace can correctly monitor this behavior.
> It also enables to check this parameter through netlink.
> 
> 
> Regards,
> Nicolas

This doesn’t fit the NETCONFA model well, there is no “all”, “default” or per iface option to be set, also
the other multipath sysctl which affects behaviour doesn’t have a notification.

Thanks,
 Nik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15 10:46                         ` Nikolay Aleksandrov
@ 2017-03-15 11:18                           ` Nicolas Dichtel
  2017-03-15 11:27                             ` Nikolay Aleksandrov
  0 siblings, 1 reply; 36+ messages in thread
From: Nicolas Dichtel @ 2017-03-15 11:18 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: David Ahern, Stephen Hemminger, Roopa Prabhu, David Miller,
	netdev, jkbs, Eric Dumazet, Peter Christensen

Le 15/03/2017 à 11:46, Nikolay Aleksandrov a écrit :
[snip]
> This doesn’t fit the NETCONFA model well, there is no “all”, “default” or per iface option to be set, also
> the other multipath sysctl which affects behaviour doesn’t have a notification.
Right. Maybe adding a "system-wide" option can help, but it's probably another
patch to add netconf support for other ecmp properties.


Thank you,
Nicolas

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15 11:18                           ` Nicolas Dichtel
@ 2017-03-15 11:27                             ` Nikolay Aleksandrov
  0 siblings, 0 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-15 11:27 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: David Ahern, Stephen Hemminger, Roopa Prabhu, David Miller,
	netdev, jkbs, Eric Dumazet, Peter Christensen


> On Mar 15, 2017, at 1:18 PM, Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:
> 
> Le 15/03/2017 à 11:46, Nikolay Aleksandrov a écrit :
> [snip]
>> This doesn’t fit the NETCONFA model well, there is no “all”, “default” or per iface option to be set, also
>> the other multipath sysctl which affects behaviour doesn’t have a notification.
> Right. Maybe adding a "system-wide" option can help, but it's probably another
> patch to add netconf support for other ecmp properties.
> 
> 
> Thank you,
> Nicolas

Yep, sounds good.
I’ll add the netlink support for system-wide NETCONFA and ecmp as follow up patches so we can
have a separate discussion on the design there.

Thanks,
 Nik

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
  2017-03-14 15:55     ` Stephen Hemminger
  2017-03-14 18:55     ` Nikolay Aleksandrov
@ 2017-03-15 11:32     ` Jakub Sitnicki
  2017-03-15 12:10       ` Nikolay Aleksandrov
  2 siblings, 1 reply; 36+ messages in thread
From: Jakub Sitnicki @ 2017-03-15 11:32 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: netdev, davem, roopa, dsa, edumazet, pch

On Tue, Mar 14, 2017 at 03:36 PM GMT, Nikolay Aleksandrov wrote:
> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3 (default)
>  1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated (currently only for L4).
> In L3 mode we always calculate the hash due to the ICMP error special
> case, the flow dissector's field consistentification should handle the
> address order thus we can remove the address reversals.
>
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> ---
> v3:
>  - keep the ICMP error special handling and always calc L3 hash
>    Jakub, could you please run your tests with this version ?
>
> v2:
>  - removed the output_key_hash as it's not needed anymore
>  - reverted to my original/internal patch with L3 as default hash
>
>  Documentation/networking/ip-sysctl.txt |  8 +++
>  include/net/ip_fib.h                   | 14 ++----
>  include/net/netns/ipv4.h               |  1 +
>  include/net/route.h                    |  9 +---
>  net/ipv4/fib_semantics.c               | 11 ++--
>  net/ipv4/icmp.c                        | 19 +------
>  net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
>  net/ipv4/sysctl_net_ipv4.c             |  9 ++++
>  8 files changed, 98 insertions(+), 65 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index fc73eeb7b3b8..558e19653106 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
>  	0 - disabled
>  	1 - enabled
>  
> +fib_multipath_hash_policy - INTEGER
> +	Controls which hash policy to use for multipath routes. Only valid
> +	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
> +	Default: 0 (Layer 3)
> +	Possible values:
> +	0 - Layer 3
> +	1 - Layer 4
> +
>  route/max_size - INTEGER
>  	Maximum number of routes allowed in the kernel.  Increase
>  	this when using large numbers of interfaces and/or routes.
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index d9cee9659978..8c608a17bd9a 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
>  int fib_sync_down_addr(struct net_device *dev, __be32 local);
>  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>  
> -extern u32 fib_multipath_secret __read_mostly;
> -
> -static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
> -{
> -	return jhash_2words((__force u32)saddr, (__force u32)daddr,
> -			    fib_multipath_secret) >> 1;
> -}
> -
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
> +		       const struct sk_buff *skb);
> +#endif
>  void fib_select_multipath(struct fib_result *res, int hash);
>  void fib_select_path(struct net *net, struct fib_result *res,
> -		     struct flowi4 *fl4, int mp_hash);
> +		     struct flowi4 *fl4);
>  
>  /* Exported by fib_trie.c */
>  void fib_trie_init(void);
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 622d2da27135..70a1d4251790 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -152,6 +152,7 @@ struct netns_ipv4 {
>  #endif
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>  	int sysctl_fib_multipath_use_neigh;
> +	int sysctl_fib_multipath_hash_policy;
>  #endif
>  
>  	unsigned int	fib_seq;	/* protected by rtnl_mutex */
> diff --git a/include/net/route.h b/include/net/route.h
> index c0874c87c173..32412c91c222 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -113,14 +113,7 @@ struct in_device;
>  int ip_rt_init(void);
>  void rt_cache_flush(struct net *net);
>  void rt_flush_dev(struct net_device *dev);
> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
> -					  int mp_hash);
> -
> -static inline struct rtable *__ip_route_output_key(struct net *net,
> -						   struct flowi4 *flp)
> -{
> -	return __ip_route_output_key_hash(net, flp, -1);
> -}
> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
>  
>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
>  				    const struct sock *sk);
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 317026a39cfa..6601bd9744c9 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
>  static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
>  
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -u32 fib_multipath_secret __read_mostly;
>  
>  #define for_nexthops(fi) {						\
>  	int nhsel; const struct fib_nh *nh;				\
> @@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
>  
>  		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
>  	} endfor_nexthops(fi);
> -
> -	net_get_random_once(&fib_multipath_secret,
> -			    sizeof(fib_multipath_secret));
>  }
>  
>  static inline void fib_add_weight(struct fib_info *fi,
> @@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
>  #endif
>  
>  void fib_select_path(struct net *net, struct fib_result *res,
> -		     struct flowi4 *fl4, int mp_hash)
> +		     struct flowi4 *fl4)
>  {
>  	bool oif_check;
>  
> @@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
>  
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>  	if (res->fi->fib_nhs > 1 && oif_check) {
> -		if (mp_hash < 0)
> -			mp_hash = get_hash_from_flowi4(fl4) >> 1;
> +		int h = fib_multipath_hash(res->fi, fl4, NULL);
>  
> -		fib_select_multipath(res, mp_hash);
> +		fib_select_multipath(res, h);
>  	}
>  	else
>  #endif
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index fc310db2708b..46630bc30754 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
>  	local_bh_enable();
>  }
>  
> -#ifdef CONFIG_IP_ROUTE_MULTIPATH
> -
> -/* Source and destination is swapped. See ip_multipath_icmp_hash */
> -static int icmp_multipath_hash_skb(const struct sk_buff *skb)
> -{
> -	const struct iphdr *iph = ip_hdr(skb);
> -
> -	return fib_multipath_hash(iph->daddr, iph->saddr);
> -}
> -
> -#else
> -
> -#define icmp_multipath_hash_skb(skb) (-1)
> -
> -#endif
> -
>  static struct rtable *icmp_route_lookup(struct net *net,
>  					struct flowi4 *fl4,
>  					struct sk_buff *skb_in,
> @@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
>  	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
>  
>  	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
> -	rt = __ip_route_output_key_hash(net, fl4,
> -					icmp_multipath_hash_skb(skb_in));
> +	rt = __ip_route_output_key(net, fl4);
>  	if (IS_ERR(rt))
>  		return rt;
>  

I'm observing that the above hunk changes things because saddr passed to
icmp_route_lookup doesn't always remain set to ip_hdr(skb_in)->daddr.

This happens when generating an ICMP error in response to a datagram
which destination address is not a local one, that is from ip_forward.

-Jakub

> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 8471dd116771..3d7f99747285 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1734,11 +1734,11 @@ static int __mkroute_input(struct sk_buff *skb,
>  }
>  
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -
>  /* To make ICMP packets follow the right flow, the multipath hash is
> - * calculated from the inner IP addresses in reverse order.
> + * calculated from the inner IP addresses.
>   */
> -static int ip_multipath_icmp_hash(struct sk_buff *skb)
> +static void ip_multipath_icmp_hash(const struct sk_buff *skb,
> +				   struct flowi4 *fl4)
>  {
>  	const struct iphdr *outer_iph = ip_hdr(skb);
>  	struct icmphdr _icmph;
> @@ -1746,33 +1746,85 @@ static int ip_multipath_icmp_hash(struct sk_buff *skb)
>  	struct iphdr _inner_iph;
>  	const struct iphdr *inner_iph;
>  
> +	fl4->saddr = outer_iph->saddr;
> +	fl4->daddr = outer_iph->daddr;
>  	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
> -		goto standard_hash;
> +		return;
>  
>  	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
>  				   &_icmph);
>  	if (!icmph)
> -		goto standard_hash;
> +		return;
>  
>  	if (icmph->type != ICMP_DEST_UNREACH &&
>  	    icmph->type != ICMP_REDIRECT &&
>  	    icmph->type != ICMP_TIME_EXCEEDED &&
> -	    icmph->type != ICMP_PARAMETERPROB) {
> -		goto standard_hash;
> -	}
> +	    icmph->type != ICMP_PARAMETERPROB)
> +		return;
>  
>  	inner_iph = skb_header_pointer(skb,
>  				       outer_iph->ihl * 4 + sizeof(_icmph),
>  				       sizeof(_inner_iph), &_inner_iph);
>  	if (!inner_iph)
> -		goto standard_hash;
> +		return;
> +	fl4->saddr = inner_iph->saddr;
> +	fl4->daddr = inner_iph->daddr;
> +}
>  
> -	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
> +		       const struct sk_buff *skb)
> +{
> +	struct net *net = fi->fib_net;
> +	struct flow_keys hash_keys;
> +	u32 mhash;
>  
> -standard_hash:
> -	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
> -}
> +	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
> +	case 0:
> +		memset(&hash_keys, 0, sizeof(hash_keys));
> +		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +		if (skb && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
> +			struct flowi4 _fl4;
>  
> +			ip_multipath_icmp_hash(skb, &_fl4);
> +			hash_keys.addrs.v4addrs.src = _fl4.saddr;
> +			hash_keys.addrs.v4addrs.dst = _fl4.daddr;
> +		} else {
> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
> +		}
> +		break;
> +	case 1:
> +		/* skb is currently provided only when forwarding */
> +		if (skb) {
> +			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
> +			struct flow_keys keys;
> +
> +			/* short-circuit if we already have L4 hash present */
> +			if (skb->l4_hash)
> +				return skb_get_hash_raw(skb) >> 1;
> +			memset(&hash_keys, 0, sizeof(hash_keys));
> +			skb_flow_dissect_flow_keys(skb, &keys, flag);
> +			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
> +			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
> +			hash_keys.ports.src = keys.ports.src;
> +			hash_keys.ports.dst = keys.ports.dst;
> +			hash_keys.basic.ip_proto = keys.basic.ip_proto;
> +		} else {
> +			memset(&hash_keys, 0, sizeof(hash_keys));
> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
> +			hash_keys.ports.src = fl4->fl4_sport;
> +			hash_keys.ports.dst = fl4->fl4_dport;
> +			hash_keys.basic.ip_proto = fl4->flowi4_proto;
> +		}
> +		break;
> +	}
> +	mhash = flow_hash_from_keys(&hash_keys);
> +
> +	return mhash >> 1;
> +}
> +EXPORT_SYMBOL_GPL(fib_multipath_hash);
>  #endif /* CONFIG_IP_ROUTE_MULTIPATH */
>  
>  static int ip_mkroute_input(struct sk_buff *skb,
> @@ -1782,12 +1834,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
>  {
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>  	if (res->fi && res->fi->fib_nhs > 1) {
> -		int h;
> +		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
> +		int h = fib_multipath_hash(res->fi, &fl4, skb);
>  
> -		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
> -			h = ip_multipath_icmp_hash(skb);
> -		else
> -			h = fib_multipath_hash(saddr, daddr);
>  		fib_select_multipath(res, h);
>  	}
>  #endif
> @@ -2202,8 +2251,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>   * Major route resolver routine.
>   */
>  
> -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
> -					  int mp_hash)
> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>  {
>  	struct net_device *dev_out = NULL;
>  	__u8 tos = RT_FL_TOS(fl4);
> @@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>  		goto make_route;
>  	}
>  
> -	fib_select_path(net, &res, fl4, mp_hash);
> +	fib_select_path(net, &res, fl4);
>  
>  	dev_out = FIB_RES_DEV(res);
>  	fl4->flowi4_oif = dev_out->ifindex;
> @@ -2378,7 +2426,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>  	rcu_read_unlock();
>  	return rth;
>  }
> -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
> +EXPORT_SYMBOL_GPL(__ip_route_output_key);
>  
>  static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
>  {
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index d6880a6149ee..62c4f94923e5 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>  		.extra1		= &zero,
>  		.extra2		= &one,
>  	},
> +	{
> +		.procname	= "fib_multipath_hash_policy",
> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
>  #endif
>  	{
>  		.procname	= "ip_unprivileged_port_start",

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15 11:32     ` Jakub Sitnicki
@ 2017-03-15 12:10       ` Nikolay Aleksandrov
  0 siblings, 0 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-15 12:10 UTC (permalink / raw)
  To: Jakub Sitnicki; +Cc: netdev, davem, roopa, dsa, edumazet, pch

On 15/03/17 13:32, Jakub Sitnicki wrote:
> On Tue, Mar 14, 2017 at 03:36 PM GMT, Nikolay Aleksandrov wrote:
>> This patch adds support for ECMP hash policy choice via a new sysctl
>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> The current values for fib_multipath_hash_policy are:
>>  0 - layer 3 (default)
>>  1 - layer 4
>> If there's an skb hash already set and it matches the chosen policy then it
>> will be used instead of being calculated (currently only for L4).
>> In L3 mode we always calculate the hash due to the ICMP error special
>> case, the flow dissector's field consistentification should handle the
>> address order thus we can remove the address reversals.
>>
>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> ---
>> v3:
>>  - keep the ICMP error special handling and always calc L3 hash
>>    Jakub, could you please run your tests with this version ?
>>
>> v2:
>>  - removed the output_key_hash as it's not needed anymore
>>  - reverted to my original/internal patch with L3 as default hash
>>
>>  Documentation/networking/ip-sysctl.txt |  8 +++
>>  include/net/ip_fib.h                   | 14 ++----
>>  include/net/netns/ipv4.h               |  1 +
>>  include/net/route.h                    |  9 +---
>>  net/ipv4/fib_semantics.c               | 11 ++--
>>  net/ipv4/icmp.c                        | 19 +------
>>  net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
>>  net/ipv4/sysctl_net_ipv4.c             |  9 ++++
>>  8 files changed, 98 insertions(+), 65 deletions(-)
>>
>> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
>> index fc73eeb7b3b8..558e19653106 100644
>> --- a/Documentation/networking/ip-sysctl.txt
>> +++ b/Documentation/networking/ip-sysctl.txt
>> @@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
>>  	0 - disabled
>>  	1 - enabled
>>  
>> +fib_multipath_hash_policy - INTEGER
>> +	Controls which hash policy to use for multipath routes. Only valid
>> +	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
>> +	Default: 0 (Layer 3)
>> +	Possible values:
>> +	0 - Layer 3
>> +	1 - Layer 4
>> +
>>  route/max_size - INTEGER
>>  	Maximum number of routes allowed in the kernel.  Increase
>>  	this when using large numbers of interfaces and/or routes.
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index d9cee9659978..8c608a17bd9a 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
>>  int fib_sync_down_addr(struct net_device *dev, __be32 local);
>>  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>>  
>> -extern u32 fib_multipath_secret __read_mostly;
>> -
>> -static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
>> -{
>> -	return jhash_2words((__force u32)saddr, (__force u32)daddr,
>> -			    fib_multipath_secret) >> 1;
>> -}
>> -
>> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
>> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>> +		       const struct sk_buff *skb);
>> +#endif
>>  void fib_select_multipath(struct fib_result *res, int hash);
>>  void fib_select_path(struct net *net, struct fib_result *res,
>> -		     struct flowi4 *fl4, int mp_hash);
>> +		     struct flowi4 *fl4);
>>  
>>  /* Exported by fib_trie.c */
>>  void fib_trie_init(void);
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index 622d2da27135..70a1d4251790 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -152,6 +152,7 @@ struct netns_ipv4 {
>>  #endif
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	int sysctl_fib_multipath_use_neigh;
>> +	int sysctl_fib_multipath_hash_policy;
>>  #endif
>>  
>>  	unsigned int	fib_seq;	/* protected by rtnl_mutex */
>> diff --git a/include/net/route.h b/include/net/route.h
>> index c0874c87c173..32412c91c222 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -113,14 +113,7 @@ struct in_device;
>>  int ip_rt_init(void);
>>  void rt_cache_flush(struct net *net);
>>  void rt_flush_dev(struct net_device *dev);
>> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
>> -					  int mp_hash);
>> -
>> -static inline struct rtable *__ip_route_output_key(struct net *net,
>> -						   struct flowi4 *flp)
>> -{
>> -	return __ip_route_output_key_hash(net, flp, -1);
>> -}
>> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
>>  
>>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
>>  				    const struct sock *sk);
>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>> index 317026a39cfa..6601bd9744c9 100644
>> --- a/net/ipv4/fib_semantics.c
>> +++ b/net/ipv4/fib_semantics.c
>> @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
>>  static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -u32 fib_multipath_secret __read_mostly;
>>  
>>  #define for_nexthops(fi) {						\
>>  	int nhsel; const struct fib_nh *nh;				\
>> @@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
>>  
>>  		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
>>  	} endfor_nexthops(fi);
>> -
>> -	net_get_random_once(&fib_multipath_secret,
>> -			    sizeof(fib_multipath_secret));
>>  }
>>  
>>  static inline void fib_add_weight(struct fib_info *fi,
>> @@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
>>  #endif
>>  
>>  void fib_select_path(struct net *net, struct fib_result *res,
>> -		     struct flowi4 *fl4, int mp_hash)
>> +		     struct flowi4 *fl4)
>>  {
>>  	bool oif_check;
>>  
>> @@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	if (res->fi->fib_nhs > 1 && oif_check) {
>> -		if (mp_hash < 0)
>> -			mp_hash = get_hash_from_flowi4(fl4) >> 1;
>> +		int h = fib_multipath_hash(res->fi, fl4, NULL);
>>  
>> -		fib_select_multipath(res, mp_hash);
>> +		fib_select_multipath(res, h);
>>  	}
>>  	else
>>  #endif
>> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
>> index fc310db2708b..46630bc30754 100644
>> --- a/net/ipv4/icmp.c
>> +++ b/net/ipv4/icmp.c
>> @@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
>>  	local_bh_enable();
>>  }
>>  
>> -#ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -
>> -/* Source and destination is swapped. See ip_multipath_icmp_hash */
>> -static int icmp_multipath_hash_skb(const struct sk_buff *skb)
>> -{
>> -	const struct iphdr *iph = ip_hdr(skb);
>> -
>> -	return fib_multipath_hash(iph->daddr, iph->saddr);
>> -}
>> -
>> -#else
>> -
>> -#define icmp_multipath_hash_skb(skb) (-1)
>> -
>> -#endif
>> -
>>  static struct rtable *icmp_route_lookup(struct net *net,
>>  					struct flowi4 *fl4,
>>  					struct sk_buff *skb_in,
>> @@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
>>  	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
>>  
>>  	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
>> -	rt = __ip_route_output_key_hash(net, fl4,
>> -					icmp_multipath_hash_skb(skb_in));
>> +	rt = __ip_route_output_key(net, fl4);
>>  	if (IS_ERR(rt))
>>  		return rt;
>>  
> 
> I'm observing that the above hunk changes things because saddr passed to
> icmp_route_lookup doesn't always remain set to ip_hdr(skb_in)->daddr.
> 
> This happens when generating an ICMP error in response to a datagram
> which destination address is not a local one, that is from ip_forward.
> 
> -Jakub

Oh, of course. Good catch, I'll fix it in v4.

Thank you,
 Nik

> 
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index 8471dd116771..3d7f99747285 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -1734,11 +1734,11 @@ static int __mkroute_input(struct sk_buff *skb,
>>  }
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -
>>  /* To make ICMP packets follow the right flow, the multipath hash is
>> - * calculated from the inner IP addresses in reverse order.
>> + * calculated from the inner IP addresses.
>>   */
>> -static int ip_multipath_icmp_hash(struct sk_buff *skb)
>> +static void ip_multipath_icmp_hash(const struct sk_buff *skb,
>> +				   struct flowi4 *fl4)
>>  {
>>  	const struct iphdr *outer_iph = ip_hdr(skb);
>>  	struct icmphdr _icmph;
>> @@ -1746,33 +1746,85 @@ static int ip_multipath_icmp_hash(struct sk_buff *skb)
>>  	struct iphdr _inner_iph;
>>  	const struct iphdr *inner_iph;
>>  
>> +	fl4->saddr = outer_iph->saddr;
>> +	fl4->daddr = outer_iph->daddr;
>>  	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
>> -		goto standard_hash;
>> +		return;
>>  
>>  	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
>>  				   &_icmph);
>>  	if (!icmph)
>> -		goto standard_hash;
>> +		return;
>>  
>>  	if (icmph->type != ICMP_DEST_UNREACH &&
>>  	    icmph->type != ICMP_REDIRECT &&
>>  	    icmph->type != ICMP_TIME_EXCEEDED &&
>> -	    icmph->type != ICMP_PARAMETERPROB) {
>> -		goto standard_hash;
>> -	}
>> +	    icmph->type != ICMP_PARAMETERPROB)
>> +		return;
>>  
>>  	inner_iph = skb_header_pointer(skb,
>>  				       outer_iph->ihl * 4 + sizeof(_icmph),
>>  				       sizeof(_inner_iph), &_inner_iph);
>>  	if (!inner_iph)
>> -		goto standard_hash;
>> +		return;
>> +	fl4->saddr = inner_iph->saddr;
>> +	fl4->daddr = inner_iph->daddr;
>> +}
>>  
>> -	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
>> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>> +		       const struct sk_buff *skb)
>> +{
>> +	struct net *net = fi->fib_net;
>> +	struct flow_keys hash_keys;
>> +	u32 mhash;
>>  
>> -standard_hash:
>> -	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
>> -}
>> +	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
>> +	case 0:
>> +		memset(&hash_keys, 0, sizeof(hash_keys));
>> +		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>> +		if (skb && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
>> +			struct flowi4 _fl4;
>>  
>> +			ip_multipath_icmp_hash(skb, &_fl4);
>> +			hash_keys.addrs.v4addrs.src = _fl4.saddr;
>> +			hash_keys.addrs.v4addrs.dst = _fl4.daddr;
>> +		} else {
>> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
>> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
>> +		}
>> +		break;
>> +	case 1:
>> +		/* skb is currently provided only when forwarding */
>> +		if (skb) {
>> +			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
>> +			struct flow_keys keys;
>> +
>> +			/* short-circuit if we already have L4 hash present */
>> +			if (skb->l4_hash)
>> +				return skb_get_hash_raw(skb) >> 1;
>> +			memset(&hash_keys, 0, sizeof(hash_keys));
>> +			skb_flow_dissect_flow_keys(skb, &keys, flag);
>> +			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
>> +			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
>> +			hash_keys.ports.src = keys.ports.src;
>> +			hash_keys.ports.dst = keys.ports.dst;
>> +			hash_keys.basic.ip_proto = keys.basic.ip_proto;
>> +		} else {
>> +			memset(&hash_keys, 0, sizeof(hash_keys));
>> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
>> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
>> +			hash_keys.ports.src = fl4->fl4_sport;
>> +			hash_keys.ports.dst = fl4->fl4_dport;
>> +			hash_keys.basic.ip_proto = fl4->flowi4_proto;
>> +		}
>> +		break;
>> +	}
>> +	mhash = flow_hash_from_keys(&hash_keys);
>> +
>> +	return mhash >> 1;
>> +}
>> +EXPORT_SYMBOL_GPL(fib_multipath_hash);
>>  #endif /* CONFIG_IP_ROUTE_MULTIPATH */
>>  
>>  static int ip_mkroute_input(struct sk_buff *skb,
>> @@ -1782,12 +1834,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
>>  {
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	if (res->fi && res->fi->fib_nhs > 1) {
>> -		int h;
>> +		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
>> +		int h = fib_multipath_hash(res->fi, &fl4, skb);
>>  
>> -		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
>> -			h = ip_multipath_icmp_hash(skb);
>> -		else
>> -			h = fib_multipath_hash(saddr, daddr);
>>  		fib_select_multipath(res, h);
>>  	}
>>  #endif
>> @@ -2202,8 +2251,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>>   * Major route resolver routine.
>>   */
>>  
>> -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>> -					  int mp_hash)
>> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>>  {
>>  	struct net_device *dev_out = NULL;
>>  	__u8 tos = RT_FL_TOS(fl4);
>> @@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>  		goto make_route;
>>  	}
>>  
>> -	fib_select_path(net, &res, fl4, mp_hash);
>> +	fib_select_path(net, &res, fl4);
>>  
>>  	dev_out = FIB_RES_DEV(res);
>>  	fl4->flowi4_oif = dev_out->ifindex;
>> @@ -2378,7 +2426,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>  	rcu_read_unlock();
>>  	return rth;
>>  }
>> -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
>> +EXPORT_SYMBOL_GPL(__ip_route_output_key);
>>  
>>  static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
>>  {
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index d6880a6149ee..62c4f94923e5 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>>  		.extra1		= &zero,
>>  		.extra2		= &one,
>>  	},
>> +	{
>> +		.procname	= "fib_multipath_hash_policy",
>> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= &zero,
>> +		.extra2		= &one,
>> +	},
>>  #endif
>>  	{
>>  		.procname	= "ip_unprivileged_port_start",
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15  9:17                       ` Nicolas Dichtel
  2017-03-15 10:46                         ` Nikolay Aleksandrov
@ 2017-03-15 15:01                         ` David Ahern
  2017-03-15 15:20                           ` Stephen Hemminger
  1 sibling, 1 reply; 36+ messages in thread
From: David Ahern @ 2017-03-15 15:01 UTC (permalink / raw)
  To: nicolas.dichtel, Stephen Hemminger, Roopa Prabhu
  Cc: David Miller, Nikolay Aleksandrov, netdev, jkbs, Eric Dumazet,
	Peter Christensen

On 3/15/17 3:17 AM, Nicolas Dichtel wrote:
>>> Is there another patch to add the NETCONFA_ECMP support?
>>>
>>
>> does userspace care?
> Yes, I think it is needed so that userspace can correctly monitor this behavior.
> It also enables to check this parameter through netlink.
> 

I don't understand why userspace cares. What can userspace do with that
information?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15 15:01                         ` David Ahern
@ 2017-03-15 15:20                           ` Stephen Hemminger
  0 siblings, 0 replies; 36+ messages in thread
From: Stephen Hemminger @ 2017-03-15 15:20 UTC (permalink / raw)
  To: David Ahern
  Cc: nicolas.dichtel, Roopa Prabhu, David Miller, Nikolay Aleksandrov,
	netdev, jkbs, Eric Dumazet, Peter Christensen

On Wed, 15 Mar 2017 09:01:16 -0600
David Ahern <dsa@cumulusnetworks.com> wrote:

> On 3/15/17 3:17 AM, Nicolas Dichtel wrote:
> >>> Is there another patch to add the NETCONFA_ECMP support?
> >>>  
> >>
> >> does userspace care?  
> > Yes, I think it is needed so that userspace can correctly monitor this behavior.
> > It also enables to check this parameter through netlink.
> >   
> 
> I don't understand why userspace cares. What can userspace do with that
> information?

There are cases such as routing or switch control from userspace where the desire
is to track kernel behavior in an external program. I know of at least 3 cases where
this is done.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH net-next v4] net: ipv4: add support for ECMP hash policy choice
  2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
  2017-03-08 12:05   ` Jakub Sitnicki
  2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
@ 2017-03-16 13:28   ` Nikolay Aleksandrov
  2017-03-16 16:41     ` Stephen Hemminger
  2017-03-21 22:28     ` David Miller
  2 siblings, 2 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-16 13:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, roopa, dsa, jkbs, edumazet, pch, stephen, nicolas.dichtel,
	Nikolay Aleksandrov

This patch adds support for ECMP hash policy choice via a new sysctl
called fib_multipath_hash_policy and also adds support for L4 hashes.
The current values for fib_multipath_hash_policy are:
 0 - layer 3 (default)
 1 - layer 4
If there's an skb hash already set and it matches the chosen policy then it
will be used instead of being calculated (currently only for L4).
In L3 mode we always calculate the hash due to the ICMP error special
case, the flow dissector's field consistentification should handle the
address order thus we can remove the address reversals.
If the skb is provided we always use it for the hash calculation,
otherwise we fallback to fl4, that is if skb is NULL fl4 has to be set.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
v4:
 - fix the ICMP send ip_forward case to use the skb_in addresses
 - use hash_keys directly when extracting ICMP headers
 - always use the skb when available

v3:
 - keep the ICMP error special handling and always calc L3 hash

v2:
 - removed the output_key_hash as it's not needed anymore
 - reverted to my original/internal patch with L3 as default hash

 Documentation/networking/ip-sysctl.txt |  8 +++
 include/net/ip_fib.h                   | 14 ++----
 include/net/netns/ipv4.h               |  1 +
 include/net/route.h                    |  6 +--
 net/ipv4/fib_semantics.c               | 11 ++--
 net/ipv4/icmp.c                        | 19 +------
 net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
 net/ipv4/sysctl_net_ipv4.c             |  9 ++++
 8 files changed, 100 insertions(+), 60 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index fc73eeb7b3b8..558e19653106 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
 	0 - disabled
 	1 - enabled
 
+fib_multipath_hash_policy - INTEGER
+	Controls which hash policy to use for multipath routes. Only valid
+	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+	Default: 0 (Layer 3)
+	Possible values:
+	0 - Layer 3
+	1 - Layer 4
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index d9cee9659978..706705f7ef23 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
-extern u32 fib_multipath_secret __read_mostly;
-
-static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
-{
-	return jhash_2words((__force u32)saddr, (__force u32)daddr,
-			    fib_multipath_secret) >> 1;
-}
-
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb);
+#endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash);
+		     struct flowi4 *fl4, const struct sk_buff *skb);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 622d2da27135..70a1d4251790 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -152,6 +152,7 @@ struct netns_ipv4 {
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
+	int sysctl_fib_multipath_hash_policy;
 #endif
 
 	unsigned int	fib_seq;	/* protected by rtnl_mutex */
diff --git a/include/net/route.h b/include/net/route.h
index c0874c87c173..2cc0e14c6359 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -113,13 +113,13 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
-					  int mp_hash);
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
+					  const struct sk_buff *skb);
 
 static inline struct rtable *__ip_route_output_key(struct net *net,
 						   struct flowi4 *flp)
 {
-	return __ip_route_output_key_hash(net, flp, -1);
+	return __ip_route_output_key_hash(net, flp, NULL);
 }
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..da449ddb8cc1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-u32 fib_multipath_secret __read_mostly;
 
 #define for_nexthops(fi) {						\
 	int nhsel; const struct fib_nh *nh;				\
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
 
 		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
-
-	net_get_random_once(&fib_multipath_secret,
-			    sizeof(fib_multipath_secret));
 }
 
 static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, int mp_hash)
+		     struct flowi4 *fl4, const struct sk_buff *skb)
 {
 	bool oif_check;
 
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi->fib_nhs > 1 && oif_check) {
-		if (mp_hash < 0)
-			mp_hash = get_hash_from_flowi4(fl4) >> 1;
+		int h = fib_multipath_hash(res->fi, fl4, skb);
 
-		fib_select_multipath(res, mp_hash);
+		fib_select_multipath(res, h);
 	}
 	else
 #endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc310db2708b..43318b5f5647 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	local_bh_enable();
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* Source and destination is swapped. See ip_multipath_icmp_hash */
-static int icmp_multipath_hash_skb(const struct sk_buff *skb)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-
-	return fib_multipath_hash(iph->daddr, iph->saddr);
-}
-
-#else
-
-#define icmp_multipath_hash_skb(skb) (-1)
-
-#endif
-
 static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-	rt = __ip_route_output_key_hash(net, fl4,
-					icmp_multipath_hash_skb(skb_in));
+	rt = __ip_route_output_key_hash(net, fl4, skb_in);
 	if (IS_ERR(rt))
 		return rt;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8471dd116771..5dda1ef81c7e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1734,45 +1734,97 @@ static int __mkroute_input(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 /* To make ICMP packets follow the right flow, the multipath hash is
- * calculated from the inner IP addresses in reverse order.
+ * calculated from the inner IP addresses.
  */
-static int ip_multipath_icmp_hash(struct sk_buff *skb)
+static void ip_multipath_l3_keys(const struct sk_buff *skb,
+				 struct flow_keys *hash_keys)
 {
 	const struct iphdr *outer_iph = ip_hdr(skb);
-	struct icmphdr _icmph;
+	const struct iphdr *inner_iph;
 	const struct icmphdr *icmph;
 	struct iphdr _inner_iph;
-	const struct iphdr *inner_iph;
+	struct icmphdr _icmph;
+
+	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
+	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
+	if (likely(outer_iph->protocol != IPPROTO_ICMP))
+		return;
 
 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
-		goto standard_hash;
+		return;
 
 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
 				   &_icmph);
 	if (!icmph)
-		goto standard_hash;
+		return;
 
 	if (icmph->type != ICMP_DEST_UNREACH &&
 	    icmph->type != ICMP_REDIRECT &&
 	    icmph->type != ICMP_TIME_EXCEEDED &&
-	    icmph->type != ICMP_PARAMETERPROB) {
-		goto standard_hash;
-	}
+	    icmph->type != ICMP_PARAMETERPROB)
+		return;
 
 	inner_iph = skb_header_pointer(skb,
 				       outer_iph->ihl * 4 + sizeof(_icmph),
 				       sizeof(_inner_iph), &_inner_iph);
 	if (!inner_iph)
-		goto standard_hash;
+		return;
+	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
+	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
+}
 
-	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+/* if skb is set it will be used and fl4 can be NULL */
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+		       const struct sk_buff *skb)
+{
+	struct net *net = fi->fib_net;
+	struct flow_keys hash_keys;
+	u32 mhash;
 
-standard_hash:
-	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
-}
+	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+	case 0:
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		if (skb) {
+			ip_multipath_l3_keys(skb, &hash_keys);
+		} else {
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+		}
+		break;
+	case 1:
+		/* skb is currently provided only when forwarding */
+		if (skb) {
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			struct flow_keys keys;
+
+			/* short-circuit if we already have L4 hash present */
+			if (skb->l4_hash)
+				return skb_get_hash_raw(skb) >> 1;
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			skb_flow_dissect_flow_keys(skb, &keys, flag);
+			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+			hash_keys.ports.src = keys.ports.src;
+			hash_keys.ports.dst = keys.ports.dst;
+			hash_keys.basic.ip_proto = keys.basic.ip_proto;
+		} else {
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+			hash_keys.ports.src = fl4->fl4_sport;
+			hash_keys.ports.dst = fl4->fl4_dport;
+			hash_keys.basic.ip_proto = fl4->flowi4_proto;
+		}
+		break;
+	}
+	mhash = flow_hash_from_keys(&hash_keys);
 
+	return mhash >> 1;
+}
+EXPORT_SYMBOL_GPL(fib_multipath_hash);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
 static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1834,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1) {
-		int h;
+		int h = fib_multipath_hash(res->fi, NULL, skb);
 
-		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
-			h = ip_multipath_icmp_hash(skb);
-		else
-			h = fib_multipath_hash(saddr, daddr);
 		fib_select_multipath(res, h);
 	}
 #endif
@@ -2203,7 +2251,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  */
 
 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-					  int mp_hash)
+					  const struct sk_buff *skb)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 		goto make_route;
 	}
 
-	fib_select_path(net, &res, fl4, mp_hash);
+	fib_select_path(net, &res, fl4, skb);
 
 	dev_out = FIB_RES_DEV(res);
 	fl4->flowi4_oif = dev_out->ifindex;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d6880a6149ee..62c4f94923e5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "fib_multipath_hash_policy",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v4] net: ipv4: add support for ECMP hash policy choice
  2017-03-16 13:28   ` [PATCH net-next v4] " Nikolay Aleksandrov
@ 2017-03-16 16:41     ` Stephen Hemminger
  2017-03-16 16:49       ` Nikolay Aleksandrov
  2017-03-21 22:28     ` David Miller
  1 sibling, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2017-03-16 16:41 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: netdev, davem, roopa, dsa, jkbs, edumazet, pch, nicolas.dichtel

On Thu, 16 Mar 2017 15:28:00 +0200
Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:

> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index d6880a6149ee..62c4f94923e5 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>  		.extra1		= &zero,
>  		.extra2		= &one,
>  	},
> +	{
> +		.procname	= "fib_multipath_hash_policy",
> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	

Rather than having magic integer values, it would be better to use
strings (like TCP congestion control).  Especially if you want to support
more values in the future.

Also what about IPv6?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v4] net: ipv4: add support for ECMP hash policy choice
  2017-03-16 16:41     ` Stephen Hemminger
@ 2017-03-16 16:49       ` Nikolay Aleksandrov
  2017-03-17 10:06         ` Nikolay Aleksandrov
  0 siblings, 1 reply; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-16 16:49 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, davem, roopa, dsa, jkbs, edumazet, pch, nicolas.dichtel

On 16/03/17 18:41, Stephen Hemminger wrote:
> On Thu, 16 Mar 2017 15:28:00 +0200
> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
> 
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index d6880a6149ee..62c4f94923e5 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>>  		.extra1		= &zero,
>>  		.extra2		= &one,
>>  	},
>> +	{
>> +		.procname	= "fib_multipath_hash_policy",
>> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= &zero,
>> +		.extra2		= &one,
>> +	
> 
> Rather than having magic integer values, it would be better to use
> strings (like TCP congestion control).  Especially if you want to support
> more values in the future.

With strings we'll need two sysctls - one to export the available ones, and one to set.
I too am not happy with plain integers as I've said in v1 of this patch but if people
are okay with having two new sysctls exported then I don't mind reworking it with strings.
Or any other ideas are welcome.

> 
> Also what about IPv6?
> 

Patches are welcome, too. :-)
We can update it at any time, IPv4 and 6 have been different for a very long time
and it's not the point of this patch to bring them closer, though IPv6 already
does L4 by default.

 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
  2017-03-15  2:30               ` Tom Herbert
@ 2017-03-17  3:36                 ` David Miller
  0 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2017-03-17  3:36 UTC (permalink / raw)
  To: tom; +Cc: stephen, nikolay, netdev, roopa, dsa, jkbs, edumazet, pch

From: Tom Herbert <tom@herbertland.com>
Date: Tue, 14 Mar 2017 19:30:47 -0700

> Agreed, but then why do we even need any complexity here by that
> argument? RSS is specifically defined to do 5-tuple hashing for TCP
> (and UDP), and 3-tuple. No one has ever complained that doing per flow
> RSS for TCP is bad thing AFAIK. We followed that same model for RPS,
> RFS, and XPS via state in the connection context. The skb_hash is
> often given to us for free, whereas in order to do a 3-tuple we have
> to actually do more work and do at least an extra jhash. I suppose the
> argument is probably that switches allow this configuration and
> somehow we want to have feature parity, but it would be very
> interesting to know if anyone is not doing per flow ECMP in real life
> and why...

Anyone doing any kind of hardware offload work requires the
possibility of getting feature parity on the software side.  It's the
only way to properly test and debug things.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v4] net: ipv4: add support for ECMP hash policy choice
  2017-03-16 16:49       ` Nikolay Aleksandrov
@ 2017-03-17 10:06         ` Nikolay Aleksandrov
  0 siblings, 0 replies; 36+ messages in thread
From: Nikolay Aleksandrov @ 2017-03-17 10:06 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, davem, roopa, dsa, jkbs, edumazet, pch, nicolas.dichtel

On 16/03/17 18:49, Nikolay Aleksandrov wrote:
> On 16/03/17 18:41, Stephen Hemminger wrote:
>> On Thu, 16 Mar 2017 15:28:00 +0200
>> Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
>>
>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>> index d6880a6149ee..62c4f94923e5 100644
>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>>>  		.extra1		= &zero,
>>>  		.extra2		= &one,
>>>  	},
>>> +	{
>>> +		.procname	= "fib_multipath_hash_policy",
>>> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
>>> +		.maxlen		= sizeof(int),
>>> +		.mode		= 0644,
>>> +		.proc_handler	= proc_dointvec_minmax,
>>> +		.extra1		= &zero,
>>> +		.extra2		= &one,
>>> +	
>>
>> Rather than having magic integer values, it would be better to use
>> strings (like TCP congestion control).  Especially if you want to support
>> more values in the future.
> 
> With strings we'll need two sysctls - one to export the available ones, and one to set.
> I too am not happy with plain integers as I've said in v1 of this patch but if people
> are okay with having two new sysctls exported then I don't mind reworking it with strings.
> Or any other ideas are welcome.
> 
>>
>> Also what about IPv6?
>>
> 
> Patches are welcome, too. :-)
> We can update it at any time, IPv4 and 6 have been different for a very long time
> and it's not the point of this patch to bring them closer, though IPv6 already
> does L4 by default.

Just to clarify, we're interested in getting IPv4 and 6 on the same level and will
work towards that but let's take it a step at a time.

 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH net-next v4] net: ipv4: add support for ECMP hash policy choice
  2017-03-16 13:28   ` [PATCH net-next v4] " Nikolay Aleksandrov
  2017-03-16 16:41     ` Stephen Hemminger
@ 2017-03-21 22:28     ` David Miller
  1 sibling, 0 replies; 36+ messages in thread
From: David Miller @ 2017-03-21 22:28 UTC (permalink / raw)
  To: nikolay; +Cc: netdev, roopa, dsa, jkbs, edumazet, pch, stephen, nicolas.dichtel

From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 16 Mar 2017 15:28:00 +0200

> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3 (default)
>  1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated (currently only for L4).
> In L3 mode we always calculate the hash due to the ICMP error special
> case, the flow dissector's field consistentification should handle the
> address order thus we can remove the address reversals.
> If the skb is provided we always use it for the hash calculation,
> otherwise we fallback to fl4, that is if skb is NULL fl4 has to be set.
> 
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

Applied, thanks Nikolay.

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2017-03-21 22:29 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-06 14:59 [PATCH net-next] net: ipv4: add support for ECMP hash policy choice Nikolay Aleksandrov
2017-03-06 16:24 ` David Ahern
2017-03-06 16:52   ` Nikolay Aleksandrov
2017-03-07  6:16 ` Roopa Prabhu
2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
2017-03-08 12:05   ` Jakub Sitnicki
2017-03-08 12:43     ` Nikolay Aleksandrov
2017-03-08 16:00       ` Jakub Sitnicki
2017-03-13  2:23         ` David Miller
2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
2017-03-14 15:55     ` Stephen Hemminger
2017-03-14 15:58       ` Nikolay Aleksandrov
2017-03-14 18:48         ` David Miller
2017-03-14 20:25           ` Stephen Hemminger
2017-03-14 21:10             ` Roopa Prabhu
2017-03-14 21:42               ` Stephen Hemminger
2017-03-14 22:38                 ` Roopa Prabhu
2017-03-14 23:27                   ` Stephen Hemminger
2017-03-14 23:45                     ` David Ahern
2017-03-15  9:17                       ` Nicolas Dichtel
2017-03-15 10:46                         ` Nikolay Aleksandrov
2017-03-15 11:18                           ` Nicolas Dichtel
2017-03-15 11:27                             ` Nikolay Aleksandrov
2017-03-15 15:01                         ` David Ahern
2017-03-15 15:20                           ` Stephen Hemminger
2017-03-15  0:24             ` David Miller
2017-03-15  2:30               ` Tom Herbert
2017-03-17  3:36                 ` David Miller
2017-03-14 18:55     ` Nikolay Aleksandrov
2017-03-15 11:32     ` Jakub Sitnicki
2017-03-15 12:10       ` Nikolay Aleksandrov
2017-03-16 13:28   ` [PATCH net-next v4] " Nikolay Aleksandrov
2017-03-16 16:41     ` Stephen Hemminger
2017-03-16 16:49       ` Nikolay Aleksandrov
2017-03-17 10:06         ` Nikolay Aleksandrov
2017-03-21 22:28     ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).