All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
To: Jakub Sitnicki <jkbs@redhat.com>
Cc: netdev@vger.kernel.org, davem@davemloft.net,
	roopa@cumulusnetworks.com, dsa@cumulusnetworks.com,
	edumazet@google.com, pch@ordbogen.com
Subject: Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice
Date: Wed, 15 Mar 2017 14:10:08 +0200	[thread overview]
Message-ID: <b1bf5cfc-22c1-5640-6f71-be25647bebb3@cumulusnetworks.com> (raw)
In-Reply-To: <87h92u22te.fsf@redhat.com>

On 15/03/17 13:32, Jakub Sitnicki wrote:
> On Tue, Mar 14, 2017 at 03:36 PM GMT, Nikolay Aleksandrov wrote:
>> This patch adds support for ECMP hash policy choice via a new sysctl
>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> The current values for fib_multipath_hash_policy are:
>>  0 - layer 3 (default)
>>  1 - layer 4
>> If there's an skb hash already set and it matches the chosen policy then it
>> will be used instead of being calculated (currently only for L4).
>> In L3 mode we always calculate the hash due to the ICMP error special
>> case, the flow dissector's field consistentification should handle the
>> address order thus we can remove the address reversals.
>>
>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> ---
>> v3:
>>  - keep the ICMP error special handling and always calc L3 hash
>>    Jakub, could you please run your tests with this version ?
>>
>> v2:
>>  - removed the output_key_hash as it's not needed anymore
>>  - reverted to my original/internal patch with L3 as default hash
>>
>>  Documentation/networking/ip-sysctl.txt |  8 +++
>>  include/net/ip_fib.h                   | 14 ++----
>>  include/net/netns/ipv4.h               |  1 +
>>  include/net/route.h                    |  9 +---
>>  net/ipv4/fib_semantics.c               | 11 ++--
>>  net/ipv4/icmp.c                        | 19 +------
>>  net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
>>  net/ipv4/sysctl_net_ipv4.c             |  9 ++++
>>  8 files changed, 98 insertions(+), 65 deletions(-)
>>
>> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
>> index fc73eeb7b3b8..558e19653106 100644
>> --- a/Documentation/networking/ip-sysctl.txt
>> +++ b/Documentation/networking/ip-sysctl.txt
>> @@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
>>  	0 - disabled
>>  	1 - enabled
>>  
>> +fib_multipath_hash_policy - INTEGER
>> +	Controls which hash policy to use for multipath routes. Only valid
>> +	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
>> +	Default: 0 (Layer 3)
>> +	Possible values:
>> +	0 - Layer 3
>> +	1 - Layer 4
>> +
>>  route/max_size - INTEGER
>>  	Maximum number of routes allowed in the kernel.  Increase
>>  	this when using large numbers of interfaces and/or routes.
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index d9cee9659978..8c608a17bd9a 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
>>  int fib_sync_down_addr(struct net_device *dev, __be32 local);
>>  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>>  
>> -extern u32 fib_multipath_secret __read_mostly;
>> -
>> -static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
>> -{
>> -	return jhash_2words((__force u32)saddr, (__force u32)daddr,
>> -			    fib_multipath_secret) >> 1;
>> -}
>> -
>> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
>> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>> +		       const struct sk_buff *skb);
>> +#endif
>>  void fib_select_multipath(struct fib_result *res, int hash);
>>  void fib_select_path(struct net *net, struct fib_result *res,
>> -		     struct flowi4 *fl4, int mp_hash);
>> +		     struct flowi4 *fl4);
>>  
>>  /* Exported by fib_trie.c */
>>  void fib_trie_init(void);
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index 622d2da27135..70a1d4251790 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -152,6 +152,7 @@ struct netns_ipv4 {
>>  #endif
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	int sysctl_fib_multipath_use_neigh;
>> +	int sysctl_fib_multipath_hash_policy;
>>  #endif
>>  
>>  	unsigned int	fib_seq;	/* protected by rtnl_mutex */
>> diff --git a/include/net/route.h b/include/net/route.h
>> index c0874c87c173..32412c91c222 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -113,14 +113,7 @@ struct in_device;
>>  int ip_rt_init(void);
>>  void rt_cache_flush(struct net *net);
>>  void rt_flush_dev(struct net_device *dev);
>> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
>> -					  int mp_hash);
>> -
>> -static inline struct rtable *__ip_route_output_key(struct net *net,
>> -						   struct flowi4 *flp)
>> -{
>> -	return __ip_route_output_key_hash(net, flp, -1);
>> -}
>> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
>>  
>>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
>>  				    const struct sock *sk);
>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>> index 317026a39cfa..6601bd9744c9 100644
>> --- a/net/ipv4/fib_semantics.c
>> +++ b/net/ipv4/fib_semantics.c
>> @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
>>  static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -u32 fib_multipath_secret __read_mostly;
>>  
>>  #define for_nexthops(fi) {						\
>>  	int nhsel; const struct fib_nh *nh;				\
>> @@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
>>  
>>  		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
>>  	} endfor_nexthops(fi);
>> -
>> -	net_get_random_once(&fib_multipath_secret,
>> -			    sizeof(fib_multipath_secret));
>>  }
>>  
>>  static inline void fib_add_weight(struct fib_info *fi,
>> @@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
>>  #endif
>>  
>>  void fib_select_path(struct net *net, struct fib_result *res,
>> -		     struct flowi4 *fl4, int mp_hash)
>> +		     struct flowi4 *fl4)
>>  {
>>  	bool oif_check;
>>  
>> @@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	if (res->fi->fib_nhs > 1 && oif_check) {
>> -		if (mp_hash < 0)
>> -			mp_hash = get_hash_from_flowi4(fl4) >> 1;
>> +		int h = fib_multipath_hash(res->fi, fl4, NULL);
>>  
>> -		fib_select_multipath(res, mp_hash);
>> +		fib_select_multipath(res, h);
>>  	}
>>  	else
>>  #endif
>> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
>> index fc310db2708b..46630bc30754 100644
>> --- a/net/ipv4/icmp.c
>> +++ b/net/ipv4/icmp.c
>> @@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
>>  	local_bh_enable();
>>  }
>>  
>> -#ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -
>> -/* Source and destination is swapped. See ip_multipath_icmp_hash */
>> -static int icmp_multipath_hash_skb(const struct sk_buff *skb)
>> -{
>> -	const struct iphdr *iph = ip_hdr(skb);
>> -
>> -	return fib_multipath_hash(iph->daddr, iph->saddr);
>> -}
>> -
>> -#else
>> -
>> -#define icmp_multipath_hash_skb(skb) (-1)
>> -
>> -#endif
>> -
>>  static struct rtable *icmp_route_lookup(struct net *net,
>>  					struct flowi4 *fl4,
>>  					struct sk_buff *skb_in,
>> @@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
>>  	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
>>  
>>  	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
>> -	rt = __ip_route_output_key_hash(net, fl4,
>> -					icmp_multipath_hash_skb(skb_in));
>> +	rt = __ip_route_output_key(net, fl4);
>>  	if (IS_ERR(rt))
>>  		return rt;
>>  
> 
> I'm observing that the above hunk changes things because saddr passed to
> icmp_route_lookup doesn't always remain set to ip_hdr(skb_in)->daddr.
> 
> This happens when generating an ICMP error in response to a datagram
> which destination address is not a local one, that is from ip_forward.
> 
> -Jakub

Oh, of course. Good catch, I'll fix it in v4.

Thank you,
 Nik

> 
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index 8471dd116771..3d7f99747285 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -1734,11 +1734,11 @@ static int __mkroute_input(struct sk_buff *skb,
>>  }
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -
>>  /* To make ICMP packets follow the right flow, the multipath hash is
>> - * calculated from the inner IP addresses in reverse order.
>> + * calculated from the inner IP addresses.
>>   */
>> -static int ip_multipath_icmp_hash(struct sk_buff *skb)
>> +static void ip_multipath_icmp_hash(const struct sk_buff *skb,
>> +				   struct flowi4 *fl4)
>>  {
>>  	const struct iphdr *outer_iph = ip_hdr(skb);
>>  	struct icmphdr _icmph;
>> @@ -1746,33 +1746,85 @@ static int ip_multipath_icmp_hash(struct sk_buff *skb)
>>  	struct iphdr _inner_iph;
>>  	const struct iphdr *inner_iph;
>>  
>> +	fl4->saddr = outer_iph->saddr;
>> +	fl4->daddr = outer_iph->daddr;
>>  	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
>> -		goto standard_hash;
>> +		return;
>>  
>>  	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
>>  				   &_icmph);
>>  	if (!icmph)
>> -		goto standard_hash;
>> +		return;
>>  
>>  	if (icmph->type != ICMP_DEST_UNREACH &&
>>  	    icmph->type != ICMP_REDIRECT &&
>>  	    icmph->type != ICMP_TIME_EXCEEDED &&
>> -	    icmph->type != ICMP_PARAMETERPROB) {
>> -		goto standard_hash;
>> -	}
>> +	    icmph->type != ICMP_PARAMETERPROB)
>> +		return;
>>  
>>  	inner_iph = skb_header_pointer(skb,
>>  				       outer_iph->ihl * 4 + sizeof(_icmph),
>>  				       sizeof(_inner_iph), &_inner_iph);
>>  	if (!inner_iph)
>> -		goto standard_hash;
>> +		return;
>> +	fl4->saddr = inner_iph->saddr;
>> +	fl4->daddr = inner_iph->daddr;
>> +}
>>  
>> -	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
>> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>> +		       const struct sk_buff *skb)
>> +{
>> +	struct net *net = fi->fib_net;
>> +	struct flow_keys hash_keys;
>> +	u32 mhash;
>>  
>> -standard_hash:
>> -	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
>> -}
>> +	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
>> +	case 0:
>> +		memset(&hash_keys, 0, sizeof(hash_keys));
>> +		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>> +		if (skb && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
>> +			struct flowi4 _fl4;
>>  
>> +			ip_multipath_icmp_hash(skb, &_fl4);
>> +			hash_keys.addrs.v4addrs.src = _fl4.saddr;
>> +			hash_keys.addrs.v4addrs.dst = _fl4.daddr;
>> +		} else {
>> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
>> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
>> +		}
>> +		break;
>> +	case 1:
>> +		/* skb is currently provided only when forwarding */
>> +		if (skb) {
>> +			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
>> +			struct flow_keys keys;
>> +
>> +			/* short-circuit if we already have L4 hash present */
>> +			if (skb->l4_hash)
>> +				return skb_get_hash_raw(skb) >> 1;
>> +			memset(&hash_keys, 0, sizeof(hash_keys));
>> +			skb_flow_dissect_flow_keys(skb, &keys, flag);
>> +			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
>> +			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
>> +			hash_keys.ports.src = keys.ports.src;
>> +			hash_keys.ports.dst = keys.ports.dst;
>> +			hash_keys.basic.ip_proto = keys.basic.ip_proto;
>> +		} else {
>> +			memset(&hash_keys, 0, sizeof(hash_keys));
>> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
>> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
>> +			hash_keys.ports.src = fl4->fl4_sport;
>> +			hash_keys.ports.dst = fl4->fl4_dport;
>> +			hash_keys.basic.ip_proto = fl4->flowi4_proto;
>> +		}
>> +		break;
>> +	}
>> +	mhash = flow_hash_from_keys(&hash_keys);
>> +
>> +	return mhash >> 1;
>> +}
>> +EXPORT_SYMBOL_GPL(fib_multipath_hash);
>>  #endif /* CONFIG_IP_ROUTE_MULTIPATH */
>>  
>>  static int ip_mkroute_input(struct sk_buff *skb,
>> @@ -1782,12 +1834,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
>>  {
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	if (res->fi && res->fi->fib_nhs > 1) {
>> -		int h;
>> +		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
>> +		int h = fib_multipath_hash(res->fi, &fl4, skb);
>>  
>> -		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
>> -			h = ip_multipath_icmp_hash(skb);
>> -		else
>> -			h = fib_multipath_hash(saddr, daddr);
>>  		fib_select_multipath(res, h);
>>  	}
>>  #endif
>> @@ -2202,8 +2251,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>>   * Major route resolver routine.
>>   */
>>  
>> -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>> -					  int mp_hash)
>> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>>  {
>>  	struct net_device *dev_out = NULL;
>>  	__u8 tos = RT_FL_TOS(fl4);
>> @@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>  		goto make_route;
>>  	}
>>  
>> -	fib_select_path(net, &res, fl4, mp_hash);
>> +	fib_select_path(net, &res, fl4);
>>  
>>  	dev_out = FIB_RES_DEV(res);
>>  	fl4->flowi4_oif = dev_out->ifindex;
>> @@ -2378,7 +2426,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>  	rcu_read_unlock();
>>  	return rth;
>>  }
>> -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
>> +EXPORT_SYMBOL_GPL(__ip_route_output_key);
>>  
>>  static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
>>  {
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index d6880a6149ee..62c4f94923e5 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>>  		.extra1		= &zero,
>>  		.extra2		= &one,
>>  	},
>> +	{
>> +		.procname	= "fib_multipath_hash_policy",
>> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= &zero,
>> +		.extra2		= &one,
>> +	},
>>  #endif
>>  	{
>>  		.procname	= "ip_unprivileged_port_start",
> 

  reply	other threads:[~2017-03-15 12:10 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-03-06 14:59 [PATCH net-next] net: ipv4: add support for ECMP hash policy choice Nikolay Aleksandrov
2017-03-06 16:24 ` David Ahern
2017-03-06 16:52   ` Nikolay Aleksandrov
2017-03-07  6:16 ` Roopa Prabhu
2017-03-07 11:01 ` [PATCH net-next v2] " Nikolay Aleksandrov
2017-03-08 12:05   ` Jakub Sitnicki
2017-03-08 12:43     ` Nikolay Aleksandrov
2017-03-08 16:00       ` Jakub Sitnicki
2017-03-13  2:23         ` David Miller
2017-03-14 15:36   ` [PATCH net-next v3] " Nikolay Aleksandrov
2017-03-14 15:55     ` Stephen Hemminger
2017-03-14 15:58       ` Nikolay Aleksandrov
2017-03-14 18:48         ` David Miller
2017-03-14 20:25           ` Stephen Hemminger
2017-03-14 21:10             ` Roopa Prabhu
2017-03-14 21:42               ` Stephen Hemminger
2017-03-14 22:38                 ` Roopa Prabhu
2017-03-14 23:27                   ` Stephen Hemminger
2017-03-14 23:45                     ` David Ahern
2017-03-15  9:17                       ` Nicolas Dichtel
2017-03-15 10:46                         ` Nikolay Aleksandrov
2017-03-15 11:18                           ` Nicolas Dichtel
2017-03-15 11:27                             ` Nikolay Aleksandrov
2017-03-15 15:01                         ` David Ahern
2017-03-15 15:20                           ` Stephen Hemminger
2017-03-15  0:24             ` David Miller
2017-03-15  2:30               ` Tom Herbert
2017-03-17  3:36                 ` David Miller
2017-03-14 18:55     ` Nikolay Aleksandrov
2017-03-15 11:32     ` Jakub Sitnicki
2017-03-15 12:10       ` Nikolay Aleksandrov [this message]
2017-03-16 13:28   ` [PATCH net-next v4] " Nikolay Aleksandrov
2017-03-16 16:41     ` Stephen Hemminger
2017-03-16 16:49       ` Nikolay Aleksandrov
2017-03-17 10:06         ` Nikolay Aleksandrov
2017-03-21 22:28     ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b1bf5cfc-22c1-5640-6f71-be25647bebb3@cumulusnetworks.com \
    --to=nikolay@cumulusnetworks.com \
    --cc=davem@davemloft.net \
    --cc=dsa@cumulusnetworks.com \
    --cc=edumazet@google.com \
    --cc=jkbs@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=pch@ordbogen.com \
    --cc=roopa@cumulusnetworks.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.