From mboxrd@z Thu Jan 1 00:00:00 1970 From: Eric Dumazet Subject: Re: [PATCH net v2] net/ipv6: Add anycast addresses to a global hashtable Date: Tue, 23 Oct 2018 20:12:10 -0700 Message-ID: References: <20181024015832.16607-1-0xeffeff@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Cc: davem@davemloft.net, kuznet@ms2.inr.ac.ru, yoshfuji@linux-ipv6.org To: Jeff Barnhill <0xeffeff@gmail.com>, netdev@vger.kernel.org Return-path: Received: from mail-pg1-f196.google.com ([209.85.215.196]:43222 "EHLO mail-pg1-f196.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1725998AbeJXLiT (ORCPT ); Wed, 24 Oct 2018 07:38:19 -0400 Received: by mail-pg1-f196.google.com with SMTP id n10-v6so893253pgv.10 for ; Tue, 23 Oct 2018 20:12:13 -0700 (PDT) In-Reply-To: <20181024015832.16607-1-0xeffeff@gmail.com> Content-Language: en-US Sender: netdev-owner@vger.kernel.org List-ID: On 10/23/2018 06:58 PM, Jeff Barnhill wrote: > icmp6_send() function is expensive on systems with a large number of > interfaces. Every time it’s called, it has to verify that the source > address does not correspond to an existing anycast address by looping > through every device and every anycast address on the device. This can > result in significant delays for a CPU when there are a large number of > neighbors and ND timers are frequently timing out and calling > neigh_invalidate(). > > Add anycast addresses to a global hashtable to allow quick searching for > matching anycast addresses. This is based on inet6_addr_lst in addrconf.c. > > Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> > --- > include/net/addrconf.h | 2 + > include/net/if_inet6.h | 8 +++ > net/ipv6/af_inet6.c | 5 ++ > net/ipv6/anycast.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++- > 4 files changed, 145 insertions(+), 2 deletions(-) > > diff --git a/include/net/addrconf.h b/include/net/addrconf.h > index 6def0351bcc3..0cee3f99c41d 100644 > --- a/include/net/addrconf.h > +++ b/include/net/addrconf.h > @@ -312,6 +312,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, > const struct in6_addr *addr); > bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, > const struct in6_addr *addr); > +int anycast_init(void); > +void anycast_cleanup(void); > > /* Device notifier */ > int register_inet6addr_notifier(struct notifier_block *nb); > diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h > index d7578cf49c3a..55a4a1d8cebc 100644 > --- a/include/net/if_inet6.h > +++ b/include/net/if_inet6.h > @@ -142,6 +142,14 @@ struct ipv6_ac_socklist { > struct ipv6_ac_socklist *acl_next; > }; > > +struct ipv6_ac_addrlist { > + struct in6_addr acal_addr; > + possible_net_t acal_pnet; > + int acal_users; That would be a refcount_t acal_users; so that CONFIG_REFCOUNT_FULL brings debugging for free. > + struct hlist_node acal_lst; /* inet6_acaddr_lst */ > + struct rcu_head rcu; > +}; > + > struct ifacaddr6 { > struct in6_addr aca_addr; > struct fib6_info *aca_rt; > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c > index 9a4261e50272..971a05fdd3bd 100644 > --- a/net/ipv6/af_inet6.c > +++ b/net/ipv6/af_inet6.c > @@ -1001,6 +1001,9 @@ static int __init inet6_init(void) > err = ip6_flowlabel_init(); > if (err) > goto ip6_flowlabel_fail; > + err = anycast_init(); > + if (err) > + goto anycast_fail; > err = addrconf_init(); > if (err) > goto addrconf_fail; > @@ -1091,6 +1094,8 @@ static int __init inet6_init(void) > ipv6_exthdrs_fail: > addrconf_cleanup(); > addrconf_fail: > + anycast_cleanup(); > +anycast_fail: > ip6_flowlabel_cleanup(); > ip6_flowlabel_fail: > ndisc_late_cleanup(); > diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c > index 4e0ff7031edd..def1e156d857 100644 > --- a/net/ipv6/anycast.c > +++ b/net/ipv6/anycast.c > @@ -44,8 +44,22 @@ > > #include > > +#define IN6_ADDR_HSIZE_SHIFT 8 > +#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) > +/* anycast address hash table > + */ > +static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; > +static DEFINE_SPINLOCK(acaddr_hash_lock); > + > static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); > > +static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) > +{ > + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); > + > + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); > +} > + > /* > * socket join an anycast group > */ > @@ -204,6 +218,83 @@ void ipv6_sock_ac_close(struct sock *sk) > rtnl_unlock(); > } > > +static struct ipv6_ac_addrlist *acal_alloc(struct net *net, > + const struct in6_addr *addr) > +{ > + struct ipv6_ac_addrlist *acal; > + > + acal = kzalloc(sizeof(*acal), GFP_ATOMIC); > + if (!acal) > + return NULL; > + > + acal->acal_addr = *addr; > + write_pnet(&acal->acal_pnet, get_net(net)); I am not sure why you grab a reference on the netns. The ipv6 address will be freed at some point before the netns disappears. It would automatically remove the associated struct ipv6_ac_addrlist. > + acal->acal_users = 1; > + INIT_HLIST_NODE(&acal->acal_lst); > + > + return acal; > +} > + > +static void acal_free_rcu(struct rcu_head *h) > +{ > + struct ipv6_ac_addrlist *acal; > + > + acal = container_of(h, struct ipv6_ac_addrlist, rcu); > + WARN_ON(acal->acal_users); Not needed with refcount_t debugging infra. > + put_net(read_pnet(&acal->acal_pnet)); > + kfree(acal); So this could use kfree_rcu() in the caller, and get rid of acal_free_rcu() completely. > +} > + > +static int ipv6_add_acaddr_hash(struct net *net, const struct in6_addr *addr) > +{ > + unsigned int hash = inet6_acaddr_hash(net, addr); > + struct ipv6_ac_addrlist *acal; > + int err = 0; > + > + spin_lock(&acaddr_hash_lock); > + hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) { > + if (!net_eq(read_pnet(&acal->acal_pnet), net)) > + continue; > + if (ipv6_addr_equal(&acal->acal_addr, addr)) { > + acal->acal_users++; > + goto out; > + } > + } > + > + acal = acal_alloc(net, addr); > + if (!acal) { > + err = -ENOMEM; > + goto out; > + } > + > + hlist_add_head_rcu(&acal->acal_lst, &inet6_acaddr_lst[hash]); > + > +out: > + spin_unlock(&acaddr_hash_lock); > + return err; > +} > + > +static void ipv6_del_acaddr_hash(struct net *net, const struct in6_addr *addr) > +{ > + unsigned int hash = inet6_acaddr_hash(net, addr); > + struct ipv6_ac_addrlist *acal; > + > + spin_lock(&acaddr_hash_lock); > + hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) { > + if (!net_eq(read_pnet(&acal->acal_pnet), net)) > + continue; > + if (ipv6_addr_equal(&acal->acal_addr, addr)) { > + if (--acal->acal_users < 1) { > + hlist_del_init_rcu(&acal->acal_lst); > + call_rcu(&acal->rcu, acal_free_rcu); > + } > + spin_unlock(&acaddr_hash_lock); > + return; > + } > + } > + spin_unlock(&acaddr_hash_lock); > +} > + > static void aca_get(struct ifacaddr6 *aca) > { > refcount_inc(&aca->aca_refcnt); > @@ -275,6 +366,13 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) > err = -ENOMEM; > goto out; > } > + err = ipv6_add_acaddr_hash(dev_net(idev->dev), addr); > + if (err) { > + fib6_info_release(f6i); > + fib6_info_release(f6i); Double call to fib6_info_release() ? Why ? > + kfree(aca); > + goto out; > + } > > aca->aca_next = idev->ac_list; > idev->ac_list = aca; > @@ -324,6 +422,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) > prev_aca->aca_next = aca->aca_next; > else > idev->ac_list = aca->aca_next; > + ipv6_del_acaddr_hash(dev_net(idev->dev), &aca->aca_addr); > write_unlock_bh(&idev->lock); > addrconf_leave_solict(idev, &aca->aca_addr); > > @@ -350,6 +449,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) > write_lock_bh(&idev->lock); > while ((aca = idev->ac_list) != NULL) { > idev->ac_list = aca->aca_next; > + ipv6_del_acaddr_hash(dev_net(idev->dev), &aca->aca_addr); > + > write_unlock_bh(&idev->lock); > > addrconf_leave_solict(idev, &aca->aca_addr); > @@ -391,16 +492,22 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, > const struct in6_addr *addr) > { > bool found = false; > + unsigned int hash = inet6_acaddr_hash(net, addr); > + struct ipv6_ac_addrlist *acal; Reorder variable declaration in longest to shortest (reverse xmas tree), per David Miller request :) > > rcu_read_lock(); > if (dev) > found = ipv6_chk_acast_dev(dev, addr); > else > - for_each_netdev_rcu(net, dev) > - if (ipv6_chk_acast_dev(dev, addr)) { > + hlist_for_each_entry_rcu(acal, &inet6_acaddr_lst[hash], > + acal_lst) { > + if (!net_eq(read_pnet(&acal->acal_pnet), net)) > + continue; > + if (ipv6_addr_equal(&acal->acal_addr, addr)) { > found = true; > break; > } > + } > rcu_read_unlock(); > return found; > } > @@ -539,4 +646,25 @@ void ac6_proc_exit(struct net *net) > { > remove_proc_entry("anycast6", net->proc_net); > } > + > +/* Init / cleanup code > + */ > +int __init anycast_init(void) > +{ > + int i; > + > + for (i = 0; i < IN6_ADDR_HSIZE; i++) > + INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); > + return 0; > +} > + > +void anycast_cleanup(void) > +{ > + int i; > + > + spin_lock(&acaddr_hash_lock); > + for (i = 0; i < IN6_ADDR_HSIZE; i++) > + WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); > + spin_unlock(&acaddr_hash_lock); > +} > #endif >