* [RFC] net: ipv4 -- Introduce ifa limit per net
@ 2016-03-04 21:39 Cyrill Gorcunov
  2016-03-04 22:50 ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-04 21:39 UTC (permalink / raw)
  To: NETDEV
  Cc: Solar Designer, Vasily Averin, Andrey Vagin, Pavel Emelianov,
	Vladimir Davydov, Konstantin Khorenko, David Miller,
	Eric Dumazet

Currently all kernels (including vanilla) free the ifa
list with rtnl_lock() taken, which takes a huge amount of
time to release all entries when we stop a container.
Moreover, an unlimited number of addresses may be created
from inside a net namespace if CAP_NET_ADMIN is granted
(which is common for containers).

Let's introduce a per-net limit (4096 by default) on the
number of addresses, which can be tuned via the sysctl
entry /proc/sys/net/ipv4/ifa_limit.

Reported-by: Solar Designer <solar@openwall.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
CC: Vasily Averin <vvs@virtuozzo.com>
CC: Andrey Vagin <avagin@virtuozzo.com>
CC: Pavel Emelianov <xemul@virtuozzo.com>
CC: Vladimir Davydov <vdavydov@virtuozzo.com>
CC: Konstantin Khorenko <khorenko@virtuozzo.com>
CC: David Miller <davem@davemloft.net>
CC: Eric Dumazet <eric.dumazet@gmail.com>
---

Please share your thoughts if some more elegant way exists
to fix this problem; maybe I'm missing something obvious. Thanks!

 include/net/netns/ipv4.h   |    3 +++
 net/ipv4/devinet.c         |   34 +++++++++++++++++++---------------
 net/ipv4/sysctl_net_ipv4.c |    8 ++++++++
 3 files changed, 30 insertions(+), 15 deletions(-)

Index: linux-ml.git/include/net/netns/ipv4.h
===================================================================
--- linux-ml.git.orig/include/net/netns/ipv4.h
+++ linux-ml.git/include/net/netns/ipv4.h
@@ -77,6 +77,8 @@ struct netns_ipv4 {
 
 	struct local_ports ip_local_ports;
 
+	int sysctl_ifa_limit;
+
 	int sysctl_tcp_ecn;
 	int sysctl_tcp_ecn_fallback;
 
@@ -101,6 +103,7 @@ struct netns_ipv4 {
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
+	atomic_t ifa_nr;
 
 #ifdef CONFIG_SYSCTL
 	unsigned long *sysctl_local_reserved_ports;
Index: linux-ml.git/net/ipv4/devinet.c
===================================================================
--- linux-ml.git.orig/net/ipv4/devinet.c
+++ linux-ml.git/net/ipv4/devinet.c
@@ -194,8 +194,11 @@ static void devinet_sysctl_unregister(st
 
 /* Locks all the inet devices. */
 
-static struct in_ifaddr *inet_alloc_ifa(void)
+static struct in_ifaddr *inet_alloc_ifa(struct net *net)
 {
+	if (atomic_add_return(1, &net->ipv4.ifa_nr) >
+	    net->ipv4.sysctl_ifa_limit)
+		return NULL;
 	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
 }
 
@@ -207,8 +210,9 @@ static void inet_rcu_free_ifa(struct rcu
 	kfree(ifa);
 }
 
-static void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct net *net, struct in_ifaddr *ifa)
 {
+	atomic_dec(&net->ipv4.ifa_nr);
 	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
 
@@ -296,7 +300,7 @@ static void inetdev_destroy(struct in_de
 
 	while ((ifa = in_dev->ifa_list) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
-		inet_free_ifa(ifa);
+		inet_free_ifa(dev_net(dev), ifa);
 	}
 
 	RCU_INIT_POINTER(dev->ip_ptr, NULL);
@@ -361,7 +365,7 @@ static void __inet_del_ifa(struct in_dev
 				rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
 				blocking_notifier_call_chain(&inetaddr_chain,
 						NETDEV_DOWN, ifa);
-				inet_free_ifa(ifa);
+				inet_free_ifa(dev_net(in_dev->dev), ifa);
 			} else {
 				promote = ifa;
 				break;
@@ -420,7 +424,7 @@ static void __inet_del_ifa(struct in_dev
 
 	}
 	if (destroy)
-		inet_free_ifa(ifa1);
+		inet_free_ifa(dev_net(in_dev->dev), ifa1);
 }
 
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
@@ -442,7 +446,7 @@ static int __inet_insert_ifa(struct in_i
 	ASSERT_RTNL();
 
 	if (!ifa->ifa_local) {
-		inet_free_ifa(ifa);
+		inet_free_ifa(dev_net(in_dev->dev), ifa);
 		return 0;
 	}
 
@@ -457,11 +461,11 @@ static int __inet_insert_ifa(struct in_i
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa)) {
 			if (ifa1->ifa_local == ifa->ifa_local) {
-				inet_free_ifa(ifa);
+				inet_free_ifa(dev_net(in_dev->dev), ifa);
 				return -EEXIST;
 			}
 			if (ifa1->ifa_scope != ifa->ifa_scope) {
-				inet_free_ifa(ifa);
+				inet_free_ifa(dev_net(in_dev->dev), ifa);
 				return -EINVAL;
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -502,7 +506,7 @@ static int inet_set_ifa(struct net_devic
 	ASSERT_RTNL();
 
 	if (!in_dev) {
-		inet_free_ifa(ifa);
+		inet_free_ifa(dev_net(dev), ifa);
 		return -ENOBUFS;
 	}
 	ipv4_devconf_setall(in_dev);
@@ -768,7 +772,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s
 	if (!in_dev)
 		goto errout;
 
-	ifa = inet_alloc_ifa();
+	ifa = inet_alloc_ifa(net);
 	if (!ifa)
 		/*
 		 * A potential indev allocation can be left alive, it stays
@@ -817,7 +821,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s
 	return ifa;
 
 errout_free:
-	inet_free_ifa(ifa);
+	inet_free_ifa(net, ifa);
 errout:
 	return ERR_PTR(err);
 }
@@ -865,13 +869,13 @@ static int inet_rtm_newaddr(struct sk_bu
 					       true, ifa);
 
 			if (ret < 0) {
-				inet_free_ifa(ifa);
+				inet_free_ifa(net, ifa);
 				return ret;
 			}
 		}
 		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
 	} else {
-		inet_free_ifa(ifa);
+		inet_free_ifa(net, ifa);
 
 		if (nlh->nlmsg_flags & NLM_F_EXCL ||
 		    !(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1055,7 +1059,7 @@ int devinet_ioctl(struct net *net, unsig
 
 		if (!ifa) {
 			ret = -ENOBUFS;
-			ifa = inet_alloc_ifa();
+			ifa = inet_alloc_ifa(net);
 			if (!ifa)
 				break;
 			INIT_HLIST_NODE(&ifa->hash);
@@ -1408,7 +1412,7 @@ static int inetdev_event(struct notifier
 		if (!inetdev_valid_mtu(dev->mtu))
 			break;
 		if (dev->flags & IFF_LOOPBACK) {
-			struct in_ifaddr *ifa = inet_alloc_ifa();
+			struct in_ifaddr *ifa = inet_alloc_ifa(dev_net(dev));
 
 			if (ifa) {
 				INIT_HLIST_NODE(&ifa->hash);
Index: linux-ml.git/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-ml.git.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-ml.git/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,13 @@ static struct ctl_table ipv4_net_table[]
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "ifa_limit",
+		.data		= &init_net.ipv4.sysctl_ifa_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
@@ -988,6 +995,7 @@ static __net_init int ipv4_sysctl_init_n
 	if (!net->ipv4.sysctl_local_reserved_ports)
 		goto err_ports;
 
+	net->ipv4.sysctl_ifa_limit = 4096;
 	return 0;
 
 err_ports:


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-04 21:39 [RFC] net: ipv4 -- Introduce ifa limit per net Cyrill Gorcunov
@ 2016-03-04 22:50 ` David Miller
  2016-03-05  0:08   ` Eric Dumazet
  2016-03-05  6:58   ` Cyrill Gorcunov
  0 siblings, 2 replies; 50+ messages in thread
From: David Miller @ 2016-03-04 22:50 UTC (permalink / raw)
  To: gorcunov
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko, eric.dumazet

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 5 Mar 2016 00:39:20 +0300

> Currently all kernels (including vanilla) free the ifa
> list with rtnl_lock() taken, which takes a huge amount of
> time to release all entries when we stop a container.
> Moreover, an unlimited number of addresses may be created
> from inside a net namespace if CAP_NET_ADMIN is granted
> (which is common for containers).
> 
> Let's introduce a per-net limit (4096 by default) on the
> number of addresses, which can be tuned via the sysctl
> entry /proc/sys/net/ipv4/ifa_limit.
> 
> Reported-by: Solar Designer <solar@openwall.com>
> Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>

Arbitrary limits are... arbitrary.

If the freeing loop is the issue, splice the list at teardown and
process that list asynchronously via a workqueue or similar.
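
A rough, untested sketch of that idea (hypothetical names; it uses
vanilla's one-argument inet_free_ifa() and ignores the per-address
notifications and locking details):

	struct ifa_free_batch {
		struct work_struct work;
		struct in_ifaddr *head;		/* detached ifa_list */
	};

	static void ifa_free_batch_work(struct work_struct *work)
	{
		struct ifa_free_batch *batch =
			container_of(work, struct ifa_free_batch, work);
		struct in_ifaddr *ifa, *next;

		/* Process context, rtnl_lock() is not held here. */
		for (ifa = batch->head; ifa; ifa = next) {
			next = ifa->ifa_next;
			inet_free_ifa(ifa);	/* queues the RCU free */
		}
		kfree(batch);
	}

	/* Called from inetdev_destroy() instead of freeing one by
	 * one under RTNL: detach the whole list and defer it.
	 */
	static void inetdev_splice_ifa_list(struct in_device *in_dev)
	{
		struct ifa_free_batch *batch;

		batch = kmalloc(sizeof(*batch), GFP_KERNEL);
		if (!batch)
			return;	/* fall back to the synchronous loop */
		batch->head = in_dev->ifa_list;
		in_dev->ifa_list = NULL;
		INIT_WORK(&batch->work, ifa_free_batch_work);
		schedule_work(&batch->work);
	}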

Thanks.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-04 22:50 ` David Miller
@ 2016-03-05  0:08   ` Eric Dumazet
  2016-03-05  4:11     ` David Miller
  2016-03-05  6:58   ` Cyrill Gorcunov
  1 sibling, 1 reply; 50+ messages in thread
From: Eric Dumazet @ 2016-03-05  0:08 UTC (permalink / raw)
  To: David Miller
  Cc: gorcunov, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Fri., 2016-03-04 at 17:50 -0500, David Miller wrote:
> From: Cyrill Gorcunov <gorcunov@gmail.com>
> Date: Sat, 5 Mar 2016 00:39:20 +0300
> 
> > Currently all kernels (including vanilla) free the ifa
> > list with rtnl_lock() taken, which takes a huge amount of
> > time to release all entries when we stop a container.
> > Moreover, an unlimited number of addresses may be created
> > from inside a net namespace if CAP_NET_ADMIN is granted
> > (which is common for containers).
> > 
> > Let's introduce a per-net limit (4096 by default) on the
> > number of addresses, which can be tuned via the sysctl
> > entry /proc/sys/net/ipv4/ifa_limit.
> > 
> > Reported-by: Solar Designer <solar@openwall.com>
> > Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> 
> Arbitrary limits are... arbitrary.
> 
> If the freeing loop is the issue, splice the list at teardown and
> process that list asynchronously via a workqueue or similar.
> 
> Thanks.

Also I wonder if the problem is not a quadratic behavior -- deleting
each of N addresses rescans the remaining address list, so a full
teardown costs O(N^2).

__inet_del_ifa() should probably take into account in_dev->dead  (no
promotion, no list scan...)


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05  0:08   ` Eric Dumazet
@ 2016-03-05  4:11     ` David Miller
  2016-03-05  7:18       ` Cyrill Gorcunov
  2016-03-05 15:57       ` Cyrill Gorcunov
  0 siblings, 2 replies; 50+ messages in thread
From: David Miller @ 2016-03-05  4:11 UTC (permalink / raw)
  To: eric.dumazet
  Cc: gorcunov, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 04 Mar 2016 16:08:30 -0800

> __inet_del_ifa() should probably take into account in_dev->dead (no
> promotion, no list scan...)

Indeed, that is the real problem:

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8c3df2c..7412feb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -334,6 +334,12 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 
 	ASSERT_RTNL();
 
+	/* None of these potentially quadratic scans matter if the
+	 * device is being destroyed.
+	 */
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* 1. Deleting primary ifaddr forces deletion all secondaries
 	 * unless alias promotion is set
 	 **/
@@ -380,6 +386,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			fib_del_ifaddr(ifa, ifa1);
 	}
 
+no_promotions:
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-04 22:50 ` David Miller
  2016-03-05  0:08   ` Eric Dumazet
@ 2016-03-05  6:58   ` Cyrill Gorcunov
  1 sibling, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-05  6:58 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko, eric.dumazet

On Fri, Mar 04, 2016 at 05:50:32PM -0500, David Miller wrote:
> 
> Arbitrary limits are... arbitrary.
> 
> If the freeing loop is the issue, splice the list at teardown and
> process that list asynchronously via a workqueue or similar.

Thanks! I'll try.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05  4:11     ` David Miller
@ 2016-03-05  7:18       ` Cyrill Gorcunov
  2016-03-05 15:57       ` Cyrill Gorcunov
  1 sibling, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-05  7:18 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Fri, Mar 04, 2016 at 11:11:09PM -0500, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 04 Mar 2016 16:08:30 -0800
> 
> > __inet_del_ifa() should probably take into account in_dev->dead (no
> > promotion, no list scan...)
> 
> Indeed, that is the real problem:

Oh, this email dropped into my inbox way later than I wrote
my reply. I'll test it; thanks, David and Eric!


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05  4:11     ` David Miller
  2016-03-05  7:18       ` Cyrill Gorcunov
@ 2016-03-05 15:57       ` Cyrill Gorcunov
  2016-03-05 16:33         ` David Miller
  1 sibling, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-05 15:57 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Fri, Mar 04, 2016 at 11:11:09PM -0500, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 04 Mar 2016 16:08:30 -0800
> 
> > __inet_del_ifa() should probably take into account in_dev->dead (no
> > promotion, no list scan...)
> 
> Indeed, that is the real problem:

Well, I tried it out. Indeed it partially relieved the contention,
but with the patch applied I'm stuck with

Samples: 20K of event 'cpu-clock', Event count (approx.): 4647374938
Overhead  Shared Object        Symbol
  19.27%  [kernel]             [k] __local_bh_enable_ip
  15.97%  [kernel]             [k] lock_acquire
  15.12%  [kernel]             [k] fib_del_ifaddr
  11.66%  [kernel]             [k] lock_release
   7.57%  [kernel]             [k] lock_is_held
   5.35%  [kernel]             [k] lock_acquired
   3.26%  [kernel]             [k] _raw_spin_unlock_irqrestore
   3.04%  [kernel]             [k] __local_bh_disable_ip
   2.10%  [kernel]             [k] _raw_spin_unlock_irq
   1.54%  [kernel]             [k] native_save_fl
   1.37%  [kernel]             [k] ___might_sleep
   0.90%  [kernel]             [k] do_raw_spin_trylock
   0.83%  [kernel]             [k] nf_ct_iterate_cleanup
   0.77%  [kernel]             [k] debug_lockdep_rcu_enabled
   0.62%  [kernel]             [k] tick_nohz_idle_enter
   0.61%  [kernel]             [k] _raw_spin_lock
   0.58%  [kernel]             [k] __slab_alloc.isra.43.constprop.47
   0.42%  [kernel]             [k] get_parent_ip
   0.40%  [kernel]             [k] preempt_count_sub
   0.36%  [kernel]             [k] native_save_fl
   0.34%  [kernel]             [k] _raw_spin_unlock
   0.31%  [kernel]             [k] do_raw_spin_unlock

and until everything got cleaned up I couldn't connect
to the node via ssh. I'll continue playing with the patch;
maybe I'll find some other optimization paths. Thanks!


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05 15:57       ` Cyrill Gorcunov
@ 2016-03-05 16:33         ` David Miller
  2016-03-05 17:00           ` Cyrill Gorcunov
  2016-03-05 18:44           ` Cyrill Gorcunov
  0 siblings, 2 replies; 50+ messages in thread
From: David Miller @ 2016-03-05 16:33 UTC (permalink / raw)
  To: gorcunov
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 5 Mar 2016 18:57:14 +0300

> On Fri, Mar 04, 2016 at 11:11:09PM -0500, David Miller wrote:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Fri, 04 Mar 2016 16:08:30 -0800
>> 
>> > __inet_del_ifa() should probably take into account in_dev->dead (no
>> > promotion, no list scan...)
>> 
>> Indeed, that is the real problem:
> 
> Well, I tried it out. Indeed it partially relieved the contention,
> but with the patch applied I'm stuck with
 ...
> and until everything got cleaned up I couldn't connect
> to the node via ssh. I'll continue playing with the patch;
> maybe I'll find some other optimization paths. Thanks!

What is the order of magnitude of the delay, as a function of the
number of IP aliases installed, compared to before the patch?

The remaining cost you are seeing comes of course from the route
deletion, whose path is:

	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
		fib_inetaddr_event()
			fib_del_ifaddr(ifa, NULL);

Which does another full list scan trying to handle primaries and
secondaries.

Probably the same optimization can be applied there, see patch below.
And if that doesn't do it, there is a really easy way to batch the
delete by scanning the FIB tree in one go and deleting every entry
that points to "in_dev".  But I suspect we really won't need that.

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4734475..21add55 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -922,6 +922,9 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 		subnet = 1;
 	}
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* Deletion is more complicated than add.
 	 * We should take care of not to delete too much :-)
 	 *
@@ -997,6 +1000,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 		}
 	}
 
+no_promotions:
 	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 	if (subnet && ifa->ifa_prefixlen < 31) {


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05 16:33         ` David Miller
@ 2016-03-05 17:00           ` Cyrill Gorcunov
  2016-03-05 18:44           ` Cyrill Gorcunov
  1 sibling, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-05 17:00 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Sat, Mar 05, 2016 at 11:33:12AM -0500, David Miller wrote:
> > and until everything got cleaned up I couldn't connect
> > to the node via ssh. I'll continue playing with the patch;
> > maybe I'll find some other optimization paths. Thanks!
> 
> What is the order of magnitude of the delay, as a function of the
> number of IP aliases installed, compared to before the patch?

You know, I didn't measure precise numbers. The script (which
I of course forgot to attach in the first report) creates 65025
addresses, and on exit it takes ~10 minutes (it also depends
on the load on the host because I've been testing inside a VM).

I'll create some kind of graph for that if you're interested;
should I?

> The remaining cost you are seeing comes of course from the route
> deletion, whose path is:
> 
> 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
> 		fib_inetaddr_event()
> 			fib_del_ifaddr(ifa, NULL);
> 
> Which does another full list scan trying to handle primaries and
> secondaries.
> 
> Probably the same optimization can be applied there, see patch below.
> And if that doesn't do it, there is a really easy way to batch the
> delete by scanning the FIB tree in one go and deleting every entry
> that points to "in_dev".  But I suspect we really won't need that.

I'll test it, David, in a couple of hours I hope, and report the
result.

	Cyrill


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05 16:33         ` David Miller
  2016-03-05 17:00           ` Cyrill Gorcunov
@ 2016-03-05 18:44           ` Cyrill Gorcunov
  2016-03-06 10:09             ` Cyrill Gorcunov
  1 sibling, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-05 18:44 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Sat, Mar 05, 2016 at 11:33:12AM -0500, David Miller wrote:
...
> 
> Probably the same optimization can be applied there, see patch below.
> And if that doesn't do it, there is a really easy way to batch the
> delete by scanning the FIB tree in one go and deleting every entry
> that points to "in_dev".  But I suspect we really won't need that.

It made things work faster, but still for 10000 addresses it takes
~3-4 minutes to become alive again.

David, give me some time; I'll prepare tests and report the
results on the patched and unpatched versions. And a huge thanks
for both patches!

	Cyrill


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-05 18:44           ` Cyrill Gorcunov
@ 2016-03-06 10:09             ` Cyrill Gorcunov
  2016-03-06 16:23               ` Eric Dumazet
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-06 10:09 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Sat, Mar 05, 2016 at 09:44:59PM +0300, Cyrill Gorcunov wrote:
> On Sat, Mar 05, 2016 at 11:33:12AM -0500, David Miller wrote:
> ...
> > 
> > Probably the same optimization can be applied there, see patch below.
> > And if that doesn't do it, there is a really easy way to batch the
> > delete by scanning the FIB tree in one go and deleting every entry
> > that points to "in_dev".  But I suspect we really won't need that.
> 
> It made things work faster, but still for 10000 addresses it takes
> ~3-4 minutes to become alive again.
> 
> David, give me some time; I'll prepare tests and report the
> results on the patched and unpatched versions. And a huge thanks
> for both patches!

Hi David! So I tried both the patched and unpatched versions, and the
results didn't vary much.

Unpatched
=========

[root@pcs7 ~]# ./exploit.sh
START 4 addresses STOP 1457255479 1457255480		-> 1
START 144 addresses STOP 1457255481 1457255482		-> 1
START 484 addresses STOP 1457255485 1457255490		-> 5
START 1024 addresses STOP 1457255496 1457255506		-> 10
START 1764 addresses STOP 1457255516 1457255532		-> 16
START 2704 addresses STOP 1457255548 1457255574		-> 26
START 3844 addresses STOP 1457255597 1457255633		-> 36
START 5184 addresses STOP 1457255665 1457255714		-> 49
START 6724 addresses STOP 1457255755 1457255819		-> 64
START 8464 addresses STOP 1457255872 1457255952		-> 80

Patched
=======

[root@pcs7 ~]# ./exploit.sh
START 4 addresses STOP 1457256166 1457256167		-> 1
START 144 addresses STOP 1457256168 1457256170		-> 2
START 484 addresses STOP 1457256173 1457256178		-> 5
START 1024 addresses STOP 1457256184 1457256194		-> 10
START 1764 addresses STOP 1457256206 1457256225		-> 19
START 2704 addresses STOP 1457256243 1457256272		-> 29
START 3844 addresses STOP 1457256303 1457256343		-> 40
START 5184 addresses STOP 1457256377 1457256427		-> 50
START 6724 addresses STOP 1457256472 1457256538		-> 66
START 8464 addresses STOP 1457256609 1457256697		-> 88

The script itself I've been using is the following
---
#!/bin/sh

if [ -z "$1" ]; then
	# Outer pass: create the addresses in a throwaway net namespace,
	# let its teardown start when unshare exits, then probe over ssh
	# to see when the host becomes responsive again.
	for x in `seq 1 10 100`; do
		echo -n "START "
		(unshare -n /bin/sh exploit.sh $x)
		echo -n " "
		ssh -q -t root@localhost "exit"
		echo `date +%s`
	done
else
	# Inner pass (inside the namespace): create ($1+1)^2 addresses on lo.
	for x in `seq 0 $1`; do
		for y in `seq 0 $1`; do
			ip a a 127.1.$x.$y dev lo
		done
	done
	num=`ip a l dev lo | grep -c "inet "`
	echo -n "$num addresses "
	echo -n "STOP "
	echo -n `date +%s`
	exit
fi
---

It's strange that on the patched version it took even longer,
but I think this is due to the fact that the test is run
on a VM instead of real hardware.

Anyway, I then ran this script with 255 as the parameter
in one pass, which generates requests to create 65025 addresses,
and the kernel started complaining:

Perf output
-----------
  24.95%  [kernel]                      [k] __local_bh_enable_ip
  21.52%  [kernel]                      [k] lock_acquire
  15.54%  [kernel]                      [k] lock_release
   9.84%  [kernel]                      [k] lock_is_held
   7.47%  [kernel]                      [k] lock_acquired
   4.08%  [kernel]                      [k] __local_bh_disable_ip
   1.86%  [kernel]                      [k] native_save_fl
   1.74%  [kernel]                      [k] ___might_sleep
   1.34%  [kernel]                      [k] _raw_spin_unlock_irqrestore
   1.10%  [kernel]                      [k] do_raw_spin_trylock
   0.98%  [kernel]                      [k] __slab_alloc.isra.43.constprop.47
   0.97%  [kernel]                      [k] debug_lockdep_rcu_enabled
   0.93%  [kernel]                      [k] nf_ct_iterate_cleanup
   0.90%  [kernel]                      [k] _raw_spin_lock
   0.54%  [kernel]                      [k] __do_softirq
   0.49%  [kernel]                      [k] get_parent_ip
   0.48%  [kernel]                      [k] _raw_spin_unlock
   0.46%  [kernel]                      [k] preempt_count_sub
   0.42%  [kernel]                      [k] native_save_fl
   0.40%  [kernel]                      [k] preempt_count_add
   0.39%  [kernel]                      [k] do_raw_spin_unlock
   0.36%  [kernel]                      [k] in_lock_functions
   0.35%  [kernel]                      [k] arch_local_irq_save
   0.22%  [kernel]                      [k] _raw_spin_unlock_irq
   0.19%  [kernel]                      [k] nf_conntrack_lock
   0.18%  [kernel]                      [k] local_bh_enable
   0.16%  [kernel]                      [k] trace_preempt_off
   0.16%  [kernel]                      [k] arch_local_irq_save
   0.14%  [kernel]                      [k] console_unlock
   0.14%  [kernel]                      [k] preempt_trace
   0.12%  [kernel]                      [k] local_bh_disable
   0.12%  [kernel]                      [k] _cond_resched
   0.06%  [kernel]                      [k] read_seqcount_begin.constprop.22
   0.06%  perf                          [.] dso__find_symbol
   0.05%  [kernel]                      [k] acpi_pm_read
---

dmesg output
------------

[ 1680.436091] INFO: task kworker/0:2:6888 blocked for more than 120 seconds.
[ 1680.437270]       Tainted: G        W       4.5.0-rc6-dirty #18
[ 1680.438310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1680.439911] kworker/0:2     D ffff8800a2c6bca8     0  6888      2 0x00080080
[ 1680.441137] Workqueue: ipv6_addrconf addrconf_verify_work
[ 1680.442136]  ffff8800a2c6bca8 00ff8800a3c28000 00000000001d5f40 ffff88013a7d5f40
[ 1680.444679]  ffff8800a3c28000 ffff8800a2c6c000 0000000000000246 ffff8800a3c28000
[ 1680.446352]  ffffffff81f31e28 ffffffff81683614 ffff8800a2c6bcc0 ffffffff818365a6
[ 1680.449354] Call Trace:
[ 1680.450039]  [<ffffffff81683614>] ? rtnl_lock+0x17/0x19
[ 1680.450972]  [<ffffffff818365a6>] schedule+0x8b/0xa3
[ 1680.451882]  [<ffffffff8183675a>] schedule_preempt_disabled+0x18/0x24
[ 1680.452948]  [<ffffffff818373eb>] mutex_lock_nested+0x1f1/0x3f1
[ 1680.453959]  [<ffffffff81683614>] rtnl_lock+0x17/0x19
[ 1680.454887]  [<ffffffff81683614>] ? rtnl_lock+0x17/0x19
[ 1680.455858]  [<ffffffff81779fc1>] addrconf_verify_work+0xe/0x1a
[ 1680.456868]  [<ffffffff8109e41a>] process_one_work+0x264/0x4d7
[ 1680.457868]  [<ffffffff8109eb9f>] worker_thread+0x209/0x2c2
[ 1680.458840]  [<ffffffff81136d55>] ? trace_preempt_on+0x9/0x1d
[ 1680.459828]  [<ffffffff8109e996>] ? rescuer_thread+0x2d6/0x2d6
[ 1680.460828]  [<ffffffff810a41a8>] kthread+0xd4/0xdc
[ 1680.461722]  [<ffffffff810a40d4>] ? kthread_parkme+0x24/0x24
[ 1680.462713]  [<ffffffff8183b63f>] ret_from_fork+0x3f/0x70
[ 1680.463703]  [<ffffffff810a40d4>] ? kthread_parkme+0x24/0x24
[ 1680.466805] 3 locks held by kworker/0:2/6888:
[ 1680.467684]  #0:  ("%s"("ipv6_addrconf")){.+.+..}, at: [<ffffffff8109e319>] process_one_work+0x163/0x4d7
[ 1680.469592]  #1:  ((addr_chk_work).work){+.+...}, at: [<ffffffff8109e319>] process_one_work+0x163/0x4d7
[ 1680.471517]  #2:  (rtnl_mutex){+.+.+.}, at: [<ffffffff81683614>] rtnl_lock+0x17/0x19
[ 1680.473664] INFO: task sshd:30767 blocked for more than 120 seconds.
[ 1680.474722]       Tainted: G        W       4.5.0-rc6-dirty #18
[ 1680.475723] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1680.478599] sshd            D ffff880137cdfc58     0 30767   1423 0x00080080
[ 1680.479812]  ffff880137cdfc58 00ff8800ba334000 00000000001d5f40 ffff88013a9d5f40
[ 1680.481532]  ffff8800ba334000 ffff880137ce0000 0000000000000246 ffff8800ba334000
[ 1680.483163]  ffffffff81f31e28 ffffffff81683614 ffff880137cdfc70 ffffffff818365a6
[ 1680.484796] Call Trace:
[ 1680.485437]  [<ffffffff81683614>] ? rtnl_lock+0x17/0x19
[ 1680.486375]  [<ffffffff818365a6>] schedule+0x8b/0xa3
[ 1680.487310]  [<ffffffff8183675a>] schedule_preempt_disabled+0x18/0x24
[ 1680.488362]  [<ffffffff818373eb>] mutex_lock_nested+0x1f1/0x3f1
[ 1680.489359]  [<ffffffff81683614>] rtnl_lock+0x17/0x19
[ 1680.490275]  [<ffffffff81683614>] ? rtnl_lock+0x17/0x19
[ 1680.491202]  [<ffffffff81684365>] rtnetlink_rcv+0x13/0x2a
[ 1680.492147]  [<ffffffff816c8396>] netlink_unicast+0x138/0x1c6
[ 1680.493124]  [<ffffffff816c86bd>] netlink_sendmsg+0x299/0x2e1
[ 1680.494117]  [<ffffffff8165b089>] sock_sendmsg_nosec+0x12/0x1d
[ 1680.495138]  [<ffffffff8165cb18>] SYSC_sendto+0x100/0x142
[ 1680.496088]  [<ffffffff81119e7c>] ? __audit_syscall_entry+0xc0/0xe4
[ 1680.497125]  [<ffffffff8100161c>] ? do_audit_syscall_entry+0x60/0x62
[ 1680.498238]  [<ffffffff810017dd>] ? syscall_trace_enter_phase1+0x10e/0x12f
[ 1680.499338]  [<ffffffff81001017>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 1680.500395]  [<ffffffff8165d18a>] SyS_sendto+0xe/0x10
[ 1680.501308]  [<ffffffff8183b2d7>] entry_SYSCALL_64_fastpath+0x12/0x6f
[ 1680.505284] 1 lock held by sshd/30767:
[ 1680.506080]  #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff81683614>] rtnl_lock+0x17/0x19

And of course it's not accessible via sshd anymore until everything is cleaned up.
On the other hand, I think these are expected results -- 65 thousand addresses
is a big number, and unfortunately there is no way yet to somehow prevent the
net admins in containers from creating them. I didn't look yet; maybe it's
possible to do so via the memory cgroup.

IOW, a huge thanks (!) for both patches; I think it's worth having them
both in the -net tree 'cause they are definitely needed. At the same time I'll
have to investigate this problem more deeply on Wednesday on the real testing
machine, and will check whether the memory cgroup may help us here to limit the
resources.

	Cyrill


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-06 10:09             ` Cyrill Gorcunov
@ 2016-03-06 16:23               ` Eric Dumazet
  2016-03-06 17:06                 ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Dumazet @ 2016-03-06 16:23 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: David Miller, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Sun., 2016-03-06 at 13:09 +0300, Cyrill Gorcunov wrote:

> Anyway, I then ran this script with 255 as the parameter
> in one pass, which generates requests to create 65025 addresses,
> and the kernel started complaining:
> 
> Perf output
> -----------
>   24.95%  [kernel]                      [k] __local_bh_enable_ip
>   21.52%  [kernel]                      [k] lock_acquire
>   15.54%  [kernel]                      [k] lock_release
>    9.84%  [kernel]                      [k] lock_is_held
>    7.47%  [kernel]                      [k] lock_acquired
>    4.08%  [kernel]                      [k] __local_bh_disable_ip

Well, this looks like a LOCKDEP kernel. Are you really running LOCKDEP on
production kernels?


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-06 16:23               ` Eric Dumazet
@ 2016-03-06 17:06                 ` Cyrill Gorcunov
  2016-03-09 16:39                   ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-06 17:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Sun, Mar 06, 2016 at 08:23:12AM -0800, Eric Dumazet wrote:
> On Sun., 2016-03-06 at 13:09 +0300, Cyrill Gorcunov wrote:
> 
> > Anyway, I then ran this script with 255 as the parameter
> > in one pass, which generates requests to create 65025 addresses,
> > and the kernel started complaining:
> > 
> > Perf output
> > -----------
> >   24.95%  [kernel]                      [k] __local_bh_enable_ip
> >   21.52%  [kernel]                      [k] lock_acquire
> >   15.54%  [kernel]                      [k] lock_release
> >    9.84%  [kernel]                      [k] lock_is_held
> >    7.47%  [kernel]                      [k] lock_acquired
> >    4.08%  [kernel]                      [k] __local_bh_disable_ip
> 
> Well, this looks like a LOCKDEP kernel. Are you really running LOCKDEP on
> production kernels?

This is a vanilla kernel with most of the debug features turned on,
not our production one. IIRC in production we don't use
lockdep. I can run tests with lockdep turned off. Still,
I think I have to run the tests on real hardware instead of a
VM to provide you back some sane numbers, and once we
get back from holidays (which will be on Wednesday) I'm
going to build vanilla, run it on a real machine
with both of David's patches, and measure the latency.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-06 17:06                 ` Cyrill Gorcunov
@ 2016-03-09 16:39                   ` Cyrill Gorcunov
  2016-03-09 16:51                     ` Cyrill Gorcunov
                                       ` (2 more replies)
  0 siblings, 3 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 16:39 UTC (permalink / raw)
  To: Eric Dumazet, David Miller
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Sun, Mar 06, 2016 at 08:06:41PM +0300, Cyrill Gorcunov wrote:
> > 
> > Well, this looks like a LOCKDEP kernel. Are you really running LOCKDEP on
> > production kernels?
> 

Hi Eric, David. Sorry for the delay. I've finally measured the
latency on the hardware. It's an i7-2600 CPU with 16G of memory. Here
are the collected data.

---
Unpatched vanilla
=================

commit 7f02bf6b5f5de90b7a331759b5364e41c0f39bf9
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue Mar 8 09:41:20 2016 -0800

 Creating new addresses
 ----------------------
  19.26%  [kernel]                      [k] check_lifetime
  13.88%  [kernel]                      [k] __inet_insert_ifa
  13.01%  [kernel]                      [k] inet_rtm_newaddr

 Release
 -------
  20.96%  [kernel]                    [k] _raw_spin_lock
  17.79%  [kernel]                    [k] preempt_count_add
  14.79%  [kernel]                    [k] __local_bh_enable_ip
  13.08%  [kernel]                    [k] preempt_count_sub
   9.21%  [kernel]                    [k] nf_ct_iterate_cleanup
   3.15%  [kernel]                    [k] _raw_spin_unlock
   2.80%  [kernel]                    [k] nf_conntrack_lock
   2.67%  [kernel]                    [k] in_lock_functions
   2.63%  [kernel]                    [k] get_parent_ip
   2.26%  [kernel]                    [k] __inet_del_ifa
   2.17%  [kernel]                    [k] fib_del_ifaddr
   1.77%  [kernel]                    [k] _cond_resched

[root@s125 ~]# ./exploit.sh
START 4		addresses STOP 1457537580 1457537581
START 2704	addresses STOP 1457537584 1457537589
START 10404	addresses STOP 1457537602 1457537622
START 23104	addresses STOP 1457537657 1457537702
START 40804	addresses STOP 1457537784 1457537867
START 63504	addresses STOP 1457538048 1457538187

Patched (David's two patches)
=============================

 Creating new addresses
 ----------------------
  21.63%  [kernel]                    [k] check_lifetime
  14.31%  [kernel]                    [k] __inet_insert_ifa
  13.47%  [kernel]                    [k] inet_rtm_newaddr
   1.53%  [kernel]                    [k] check_preemption_disabled
   1.38%  [kernel]                    [k] page_fault
   1.27%  [kernel]                    [k] unmap_page_range

 Release
 -------
  24.26%  [kernel]                    [k] _raw_spin_lock
  17.55%  [kernel]                    [k] preempt_count_add
  14.81%  [kernel]                    [k] __local_bh_enable_ip
  14.17%  [kernel]                    [k] preempt_count_sub
  10.10%  [kernel]                    [k] nf_ct_iterate_cleanup
   3.00%  [kernel]                    [k] _raw_spin_unlock
   2.95%  [kernel]                    [k] nf_conntrack_lock
   2.86%  [kernel]                    [k] in_lock_functions
   2.73%  [kernel]                    [k] get_parent_ip
   1.91%  [kernel]                    [k] _cond_resched
   0.39%  [kernel]                    [k] task_tick_fair
   0.27%  [kernel]                    [k] native_write_msr_safe
   0.22%  [kernel]                    [k] rcu_check_callbacks
   0.20%  [kernel]                    [k] check_lifetime
   0.18%  [kernel]                    [k] check_preemption_disabled
   0.16%  [kernel]                    [k] hrtimer_active
   0.13%  [kernel]                    [k] __inet_insert_ifa
   0.13%  [kernel]                    [k] __memmove
   0.13%  [kernel]                    [k] inet_rtm_newaddr

[root@s125 ~]# ./exploit.sh
START 4		addresses STOP 1457539863 1457539864
START 2704	addresses STOP 1457539867 1457539872
START 10404	addresses STOP 1457539885 1457539905
START 23104	addresses STOP 1457539938 1457539980
START 40804	addresses STOP 1457540058 1457540132
START 63504	addresses STOP 1457540305 1457540418
---

Lockdep is turned off. And the script itself is
---
[root@s125 ~]# cat ./exploit.sh 
#!/bin/sh

if [ -z "$1" ]; then
	# Outer pass: create the addresses in a throwaway net namespace,
	# then time 100 "ip r" invocations (each takes rtnl_lock) to see
	# how long the namespace teardown keeps the kernel busy.
	for x in `seq 1 50 255`; do
		echo -n "START "
		(unshare -n /bin/sh exploit.sh $x)
		sleep 1
		for j in `seq 0 100`; do
			ip r > /dev/null
		done
		echo -n " "
		echo `date +%s`
	done
else
	# Inner pass (inside the namespace): create ($1+1)^2 addresses on lo.
	for x in `seq 0 $1`; do
		for y in `seq 0 $1`; do
			ip a a 127.1.$x.$y dev lo
		done
	done
	num=`ip a l dev lo | grep -c "inet "`
	echo -n "$num addresses "
	echo -n "STOP "
	echo -n `date +%s`
	exit
fi
---

Note I run ip r in a cycle and added a sleep before it. On an idle
machine this cycle takes ~1 second, but when run while the kernel
is cleaning up the net namespace it takes way longer.

Also here is a graph of the collected data (blue line: unpatched
version, red: patched). Of course with the patched version things
become way better, but the node still hangs.

https://docs.google.com/spreadsheets/d/1eyQDxjuZY2DHKYksGACpHDDcV1Bd92e-ZiY8ywPKshA/edit?usp=sharing

The perf output earlier shows "perf top" while the addresses
are created and while they are being released.

The main problem, I still think, is that we allow requesting
as many inet addresses as free memory permits, and of course
the kernel can't handle them all in O(1) time; all resources
must be released, so there will always be some lagging moment. Thus
maybe introducing limits would be a good idea for sysadmins.

	Cyrill


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 16:39                   ` Cyrill Gorcunov
@ 2016-03-09 16:51                     ` Cyrill Gorcunov
  2016-03-09 16:58                     ` Alexei Starovoitov
  2016-03-09 17:19                     ` David Miller
  2 siblings, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 16:51 UTC (permalink / raw)
  To: Eric Dumazet, David Miller
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Wed, Mar 09, 2016 at 07:39:19PM +0300, Cyrill Gorcunov wrote:
> On Sun, Mar 06, 2016 at 08:06:41PM +0300, Cyrill Gorcunov wrote:
> > > 
> > > Well, this looks like a LOCKDEP kernel. Are you really running LOCKDEP on
> > > production kernels?
> > 
> 
> Hi Eric, David. Sorry for the delay. I've finally measured the
> latency on the hardware. It's an i7-2600 CPU with 16G of memory. Here
> are the collected data.
...
> 
> 
> Also here is a graph of the collected data (blue line: unpatched
> version, red: patched). Of course with the patched version things
> become way better, but the node still hangs.
> 
> https://docs.google.com/spreadsheets/d/1eyQDxjuZY2DHKYksGACpHDDcV1Bd92e-ZiY8ywPKshA/edit?usp=sharing
> 
> The perf output earlier shows "perf top" while the addresses
> are created and while they are being released.

In text form
-------------
Num of addresses	Unpatched (sec)		Patched (sec)
4			1			1
2704			5			5
10404			20			20
23104			45			42
40804			83			74
63504			139			113


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 16:39                   ` Cyrill Gorcunov
  2016-03-09 16:51                     ` Cyrill Gorcunov
@ 2016-03-09 16:58                     ` Alexei Starovoitov
  2016-03-09 17:09                       ` Cyrill Gorcunov
  2016-03-09 17:19                     ` David Miller
  2 siblings, 1 reply; 50+ messages in thread
From: Alexei Starovoitov @ 2016-03-09 16:58 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Eric Dumazet, David Miller, netdev, solar, vvs, avagin, xemul,
	vdavydov, khorenko

On Wed, Mar 09, 2016 at 07:39:19PM +0300, Cyrill Gorcunov wrote:
> On Sun, Mar 06, 2016 at 08:06:41PM +0300, Cyrill Gorcunov wrote:
> > > 
> > > Well, this looks like a LOCKDEP kernel. Are you really running LOCKDEP on
> > > production kernels?
> > 
> 
> Hi Eric, David. Sorry for the delay. I've finally measured the
> latency on the hardware. It's an i7-2600 CPU with 16G of memory. Here
> are the collected data.
> 
> ---
> Unpatched vanilla
> =================
> 
> commit 7f02bf6b5f5de90b7a331759b5364e41c0f39bf9
> Author: Linus Torvalds <torvalds@linux-foundation.org>
> Date:   Tue Mar 8 09:41:20 2016 -0800
> 
>  Creating new addresses
>  ----------------------
>   19.26%  [kernel]                      [k] check_lifetime
>   13.88%  [kernel]                      [k] __inet_insert_ifa
>   13.01%  [kernel]                      [k] inet_rtm_newaddr
> 
>  Release
>  -------
>   20.96%  [kernel]                    [k] _raw_spin_lock
>   17.79%  [kernel]                    [k] preempt_count_add

The above line is an indication that you have:
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
Turning it off will speed things up significantly.

>   14.79%  [kernel]                    [k] __local_bh_enable_ip
>   13.08%  [kernel]                    [k] preempt_count_sub
>    9.21%  [kernel]                    [k] nf_ct_iterate_cleanup
>    3.15%  [kernel]                    [k] _raw_spin_unlock
>    2.80%  [kernel]                    [k] nf_conntrack_lock
>    2.67%  [kernel]                    [k] in_lock_functions
>    2.63%  [kernel]                    [k] get_parent_ip
>    2.26%  [kernel]                    [k] __inet_del_ifa
>    2.17%  [kernel]                    [k] fib_del_ifaddr
>    1.77%  [kernel]                    [k] _cond_resched


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 16:58                     ` Alexei Starovoitov
@ 2016-03-09 17:09                       ` Cyrill Gorcunov
  2016-03-09 17:24                         ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 17:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric Dumazet, David Miller, netdev, solar, vvs, avagin, xemul,
	vdavydov, khorenko

On Wed, Mar 09, 2016 at 08:58:52AM -0800, Alexei Starovoitov wrote:
...
> 
> The above line is an indication that you have:
> #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
> Turning it off will speed things up significantly.

Look, this won't change the overall picture. For sure it will
speed up the kernel, but it won't prevent users from
allocating addresses. So timings will drop a bit, but the main
issue will remain -- there is no explicit way to limit this
resource. We can create, say, 1K net namespaces and allocate
100K addresses in each, then start destroying the namespaces,
and the node is going to be unreachable until everything is freed.
The kernel works as it should; it's simply that under high load it
stops reacting due to the big number of rtnl locks taken.

	Cyrill


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 16:39                   ` Cyrill Gorcunov
  2016-03-09 16:51                     ` Cyrill Gorcunov
  2016-03-09 16:58                     ` Alexei Starovoitov
@ 2016-03-09 17:19                     ` David Miller
  2 siblings, 0 replies; 50+ messages in thread
From: David Miller @ 2016-03-09 17:19 UTC (permalink / raw)
  To: gorcunov
  Cc: eric.dumazet, netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 9 Mar 2016 19:39:19 +0300

>    9.21%  [kernel]                    [k] nf_ct_iterate_cleanup
 ...
>  Release
>  -------
>   24.26%  [kernel]                    [k] _raw_spin_lock
>   17.55%  [kernel]                    [k] preempt_count_add
>   14.81%  [kernel]                    [k] __local_bh_enable_ip
>   14.17%  [kernel]                    [k] preempt_count_sub
>   10.10%  [kernel]                    [k] nf_ct_iterate_cleanup
 ...
> The main problem, I still think, is that we allow requesting
> as many inet addresses as free memory permits, and of course
> the kernel can't handle them all in O(1) time; all resources
> must be released, so there will always be some lagging moment. Thus
> maybe introducing limits would be a good idea for sysadmins.

Primary problem seems to be netfilter conntrack.

It's at least 10 times more expensive than any of the other
operations and probably is where all of the lock banging is
coming from.

I'm not adding a limit when there is so much low hanging fruit
remaining, no way.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 17:09                       ` Cyrill Gorcunov
@ 2016-03-09 17:24                         ` David Miller
  2016-03-09 17:53                           ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-09 17:24 UTC (permalink / raw)
  To: gorcunov
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 9 Mar 2016 20:09:21 +0300

> On Wed, Mar 09, 2016 at 08:58:52AM -0800, Alexei Starovoitov wrote:
> ...
>> 
>> The above line is an indication that you have:
>> #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
>> Turning it off will speed things up significantly.
> 
> Look, this won't change the overall picture. For sure it will
> speed up the kernel, but it won't prevent users from
> allocating addresses. So timings will drop a bit, but the main
> issue will remain -- there is no explicit way to limit this
> resource. We can create, say, 1K net namespaces and allocate
> 100K addresses in each, then start destroying the namespaces,
> and the node is going to be unreachable until everything is freed.
> The kernel works as it should; it's simply that under high load it
> stops reacting due to the big number of rtnl locks taken.

We asked you for numbers without a lot of features enabled; it'll
help us diagnose which subsystem still causes a lot of overhead
much more clearly.

So please do so.

Although it's already pretty clear that netfilter conntrack
cleanup is insanely expensive.

You're also jumping to a lot of conclusions; work with us to fix the
fundamental performance problems rather than continually insisting on
a limit.

We should be able to remove millions of IP addresses in less than
half a second, no problem.  Limits make no sense at all.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 17:24                         ` David Miller
@ 2016-03-09 17:53                           ` Cyrill Gorcunov
  2016-03-09 19:55                             ` Cyrill Gorcunov
  2016-03-09 20:27                             ` David Miller
  0 siblings, 2 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 17:53 UTC (permalink / raw)
  To: David Miller
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko

On Wed, Mar 09, 2016 at 12:24:00PM -0500, David Miller wrote:
...
> We asked you for numbers without a lot of features enabled; it'll
> help us diagnose which subsystem still causes a lot of overhead
> much more clearly.
> 
> So please do so.

Sure. Gimme some time and I'll be back with numbers.

> Although it's already pretty clear that netfilter conntrack
> cleanup is insanely expensive.

Yes. I can turn it off for a while and run tests without it,
then turn it back on and try again. Would you like to see such
numbers?

> You're also jumping to a lot of conclusions, work with us to fix the
> fundamental performance problems rather than continually insisting on
> a limit.
> 
> We should be able to remove millions of IP addresses in less than
> half a second, no problem.  Limits make no sense at all.

Sure, I'll continue experimenting (and turn off preempt as
a first step). Sorry if I sounded rough.

	Cyrill


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 17:53                           ` Cyrill Gorcunov
@ 2016-03-09 19:55                             ` Cyrill Gorcunov
  2016-03-09 20:27                             ` David Miller
  1 sibling, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 19:55 UTC (permalink / raw)
  To: David Miller, alexei.starovoitov, eric.dumazet
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko

On Wed, Mar 09, 2016 at 08:53:07PM +0300, Cyrill Gorcunov wrote:
> On Wed, Mar 09, 2016 at 12:24:00PM -0500, David Miller wrote:
> ...
> > We asked you for numbers without a lot of features enabled; it'll
> > help us diagnose which subsystem still causes a lot of overhead
> > much more clearly.
> > 
> > So please do so.
> 
> Sure. Gimme some time and I'll be back with numbers.

OK, here are the results (with preempt-debug/trace disabled,
on the kernel with David's two patches).
---

 No conntrack
 ------------
 [root@s125 ~]# ./exploit.sh 
 START 4	addresses STOP 1457549979 1457549980	-> 1
 START 2704	addresses STOP 1457549983 1457549984	-> 1
 START 10404	addresses STOP 1457549996 1457549997	-> 1
 START 23104	addresses STOP 1457550029 1457550030	-> 1
 START 40804	addresses STOP 1457550103 1457550104	-> 1
 START 63504	addresses STOP 1457550267 1457550268	-> 1

All works quite fast; each run takes 1 second.

 With conntrack
 --------------

 1) In the middle of a "release -> create new addresses" transition

  27.53%  [kernel]                    [k] __local_bh_enable_ip
  26.29%  [kernel]                    [k] _raw_spin_lock
   6.00%  [kernel]                    [k] nf_ct_iterate_cleanup
   3.95%  [kernel]                    [k] nf_conntrack_lock
   2.94%  [kernel]                    [k] _raw_spin_unlock
   1.91%  [kernel]                    [k] _cond_resched
   1.78%  [kernel]                    [k] check_lifetime
   1.25%  [kernel]                    [k] __inet_insert_ifa
   1.19%  [kernel]                    [k] inet_rtm_newaddr

 2) The last one, with 63K addresses being released

  36.36%  [kernel]                 [k] __local_bh_enable_ip
  34.75%  [kernel]                 [k] _raw_spin_lock
   7.93%  [kernel]                 [k] nf_ct_iterate_cleanup
   5.11%  [kernel]                 [k] nf_conntrack_lock
   3.71%  [kernel]                 [k] _raw_spin_unlock
   2.51%  [kernel]                 [k] _cond_resched
   0.89%  [kernel]                 [k] task_tick_fair
   0.77%  [kernel]                 [k] native_write_msr_safe
   0.58%  [kernel]                 [k] hrtimer_active
   0.52%  [kernel]                 [k] rcu_check_callbacks

[root@s125 ~]# ./exploit.sh 
START 4		addresses STOP 1457552395 1457552397	-> 2
START 2704	addresses STOP 1457552399 1457552403	-> 4
START 10404	addresses STOP 1457552415 1457552429	-> 14
START 23104	addresses STOP 1457552461 1457552492	-> 31
START 40804	addresses STOP 1457552566 1457552620	-> 54
START 63504	addresses STOP 1457552785 1457552870	-> 85

at the final stage it took 85 seconds to become alive again. All
the time is eaten inside nf_ct_iterate_cleanup (actually inside
get_next_corpse). IIRC there was some feature of perf
which could annotate the instructions, no? I have to refresh my
memory on how to use perf record and such...


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 17:53                           ` Cyrill Gorcunov
  2016-03-09 19:55                             ` Cyrill Gorcunov
@ 2016-03-09 20:27                             ` David Miller
  2016-03-09 20:41                               ` Cyrill Gorcunov
  1 sibling, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-09 20:27 UTC (permalink / raw)
  To: gorcunov
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 9 Mar 2016 20:53:07 +0300

> On Wed, Mar 09, 2016 at 12:24:00PM -0500, David Miller wrote:
> ...
>> We asked you for numbers without a lot of features enabled; it'll
>> help us diagnose which subsystem still causes a lot of overhead
>> much more clearly.
>> 
>> So please do so.
> 
> Sure. Gimme some time and I'll be back with numbers.
> 
>> Although it's already pretty clear that netfilter conntrack
>> cleanup is insanely expensive.
> 
> Yes. I can turn it off for a while and run tests without it,
> then turn it back on and try again. Would you like to see such
> numbers?

That would be very helpful, yes.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 20:27                             ` David Miller
@ 2016-03-09 20:41                               ` Cyrill Gorcunov
  2016-03-09 20:47                                 ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 20:41 UTC (permalink / raw)
  To: David Miller
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko

On Wed, Mar 09, 2016 at 03:27:30PM -0500, David Miller wrote:
> > 
> > Yes. I can turn it off for a while and run tests without it,
> > then turn it back on and try again. Would you like to see such
> > numbers?
> 
> That would be very helpful, yes.

Just sent it out. Please take a look. Indeed it sits inside get_next_corpse
a lot. And now I think I have to figure out where we can optimize it.
I'll continue tomorrow.


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 20:41                               ` Cyrill Gorcunov
@ 2016-03-09 20:47                                 ` David Miller
  2016-03-09 20:57                                   ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-09 20:47 UTC (permalink / raw)
  To: gorcunov
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 9 Mar 2016 23:41:58 +0300

> On Wed, Mar 09, 2016 at 03:27:30PM -0500, David Miller wrote:
>> > 
>> > Yes. I can turn it off for a while and run tests without it,
>> > then turn it back on and try again. Would you like to see such
>> > numbers?
>> 
>> That would be very helpful, yes.
> 
> Just sent it out. Please take a look. Indeed it sits inside get_next_corpse
> a lot. And now I think I have to figure out where we can optimize it.
> I'll continue tomorrow.

The problem is that the masquerading code flushes the entire conntrack
table once for _every_ address removed.

The code path is:

masq_device_event()
	if (event == NETDEV_DOWN) {
		/* Device was downed.  Search entire table for
		 * conntracks which were associated with that device,
		 * and forget them.
		 */
		NF_CT_ASSERT(dev->ifindex != 0);

		nf_ct_iterate_cleanup(net, device_cmp,
				      (void *)(long)dev->ifindex, 0, 0);

So if you have a million IP addresses, this flush happens a million times
on inetdev destroy.

Part of the problem is that we emit NETDEV_DOWN inetdev notifiers per
address removed, instead of once per inetdev destroy.

Maybe if we put some boolean state into the inetdev, we could make sure
we did this flush only once while inetdev->dead = 1.
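
A rough, untested sketch of that idea (signatures simplified from the
real notifier code, which passes a netdev_notifier_info around):

	/* Skip the per-address conntrack flush while the whole inetdev
	 * is going away; the device-level NETDEV_DOWN already triggers
	 * one full table walk keyed by ifindex.
	 */
	static int masq_inet_event(struct notifier_block *this,
				   unsigned long event, void *ptr)
	{
		struct in_ifaddr *ifa = ptr;
		struct in_device *idev = ifa->ifa_dev;

		/* inetdev_destroy() sets idev->dead before it starts
		 * deleting the addresses, so one flush is enough.
		 */
		if (idev->dead)
			return NOTIFY_DONE;

		return masq_device_event(this, event, idev->dev);
	}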


* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 20:47                                 ` David Miller
@ 2016-03-09 20:57                                   ` Cyrill Gorcunov
  2016-03-09 21:10                                     ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 20:57 UTC (permalink / raw)
  To: David Miller
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

On Wed, Mar 09, 2016 at 03:47:25PM -0500, David Miller wrote:
> From: Cyrill Gorcunov <gorcunov@gmail.com>
> Date: Wed, 9 Mar 2016 23:41:58 +0300
> 
> > On Wed, Mar 09, 2016 at 03:27:30PM -0500, David Miller wrote:
> >> > 
> >> > Yes. I can drop it off for a while and run tests without it,
> >> > then turn it back and try again. Would you like to see such
> >> > numbers?
> >> 
> >> That would be very helpful, yes.
> > 
> > Just sent out. Take a look, please. Indeed, it sits inside
> > get_next_corpse a lot. And now I think I have to figure out where
> > we can optimize it. I'll continue tomorrow.
> 
> The problem is that the masquerading code flushes the entire conntrack
> table once for _every_ address removed.
> 
> The code path is:
> 
> masq_device_event()
> 	if (event == NETDEV_DOWN) {
> 		/* Device was downed.  Search entire table for
> 		 * conntracks which were associated with that device,
> 		 * and forget them.
> 		 */
> 		NF_CT_ASSERT(dev->ifindex != 0);
> 
> 		nf_ct_iterate_cleanup(net, device_cmp,
> 				      (void *)(long)dev->ifindex, 0, 0);
> 
> So if you have a million IP addresses, this flush happens a million times
> on inetdev destroy.
> 
> Part of the problem is that we emit NETDEV_DOWN inetdev notifiers per
> address removed, instead of once per inetdev destroy.
> 
> Maybe if we put some boolean state into the inetdev, we could make sure
> we did this flush only once while inetdev->dead = 1.

Aha! So in your patch __inet_del_ifa bypasses the first blocking_notifier_call_chain

__inet_del_ifa
	...
	if (in_dev->dead)
		goto no_promotions;

	// First call to NETDEV_DOWN
...
no_promotions:
	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);

and here we call for NETDEV_DOWN, which then hits masq_device_event
and goes further into the conntrack code.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 20:57                                   ` Cyrill Gorcunov
@ 2016-03-09 21:10                                     ` David Miller
  2016-03-09 21:16                                       ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-09 21:10 UTC (permalink / raw)
  To: gorcunov
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 9 Mar 2016 23:57:47 +0300

> Aha! So in your patch __inet_del_ifa bypasses the first blocking_notifier_call_chain
> 
> __inet_del_ifa
> 	...
> 	if (in_dev->dead)
> 		goto no_promotions;
> 
> 	// First call to NETDEV_DOWN
> ...
> no_promotions:
> 	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
> 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
> 
> and here we call for NETDEV_DOWN, which then hits masq_device_event
> and goes further into the conntrack code.

Yes that's where the notifier comes from, which happens with or without
my patch.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 21:10                                     ` David Miller
@ 2016-03-09 21:16                                       ` Cyrill Gorcunov
  2016-03-10 10:20                                         ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-09 21:16 UTC (permalink / raw)
  To: David Miller
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

On Wed, Mar 09, 2016 at 04:10:38PM -0500, David Miller wrote:
> > 
> > and here we call for NETDEV_DOWN, which then hits masq_device_event
> > and goes further into the conntrack code.
> 
> Yes that's where the notifier comes from, which happens with or without
> my patch.

Thanks for the explanation, Dave! I'll continue on this task tomorrow,
trying to implement the optimization you proposed.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-09 21:16                                       ` Cyrill Gorcunov
@ 2016-03-10 10:20                                         ` Cyrill Gorcunov
  2016-03-10 11:03                                           ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 10:20 UTC (permalink / raw)
  To: David Miller, alexei.starovoitov, eric.dumazet
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko, pablo,
	netfilter-devel

On Thu, Mar 10, 2016 at 12:16:29AM +0300, Cyrill Gorcunov wrote:
> 
> Thanks for the explanation, Dave! I'll continue on this task tomorrow,
> trying to implement the optimization you proposed.

OK, here are the results for the preliminary patch with conntrack running
---
[root@s125 ~]# ./exploit.sh 
START 4		addresses STOP 1457604516 1457604518
START 2704 	addresses STOP 1457604520 1457604521
START 10404 	addresses STOP 1457604533 1457604534
START 23104 	addresses STOP 1457604566 1457604567
START 40804 	addresses STOP 1457604642 1457604643
START 63504 	addresses STOP 1457604809 1457604810

It takes ~1 second for each case, which is great.
---
 net/ipv4/devinet.c |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

Index: linux-ml.git/net/ipv4/devinet.c
===================================================================
--- linux-ml.git.orig/net/ipv4/devinet.c
+++ linux-ml.git/net/ipv4/devinet.c
@@ -403,7 +403,18 @@ no_promotions:
 	   So that, this order is correct.
 	 */
 	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
-	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+
+	if (!in_dev->dead || (in_dev->dead && !ifa1->ifa_next)) {
+		/*
+		 * We might be destroying a device with millions
+		 * of addresses assigned, so we need to call
+		 * the notifier on the last pass only; otherwise
+		 * the conntrack code (if the kernel supports it)
+		 * would scan the whole table on each cleanup
+		 * iteration, wasting a huge amount of time.
+		 */
+		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+	}
 
 	if (promote) {
 		struct in_ifaddr *next_sec = promote->ifa_next;

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 10:20                                         ` Cyrill Gorcunov
@ 2016-03-10 11:03                                           ` Cyrill Gorcunov
  2016-03-10 15:09                                             ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 11:03 UTC (permalink / raw)
  To: David Miller, alexei.starovoitov, eric.dumazet
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko, pablo,
	netfilter-devel

On Thu, Mar 10, 2016 at 01:20:18PM +0300, Cyrill Gorcunov wrote:
> On Thu, Mar 10, 2016 at 12:16:29AM +0300, Cyrill Gorcunov wrote:
> > 
> > Thanks for the explanation, Dave! I'll continue on this task tomorrow,
> > trying to implement the optimization you proposed.
> 
> OK, here are the results for the preliminary patch with conntrack running
...
>  net/ipv4/devinet.c |   13 ++++++++++++-
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 
> Index: linux-ml.git/net/ipv4/devinet.c
> ===================================================================
> --- linux-ml.git.orig/net/ipv4/devinet.c
> +++ linux-ml.git/net/ipv4/devinet.c
> @@ -403,7 +403,18 @@ no_promotions:
>  	   So that, this order is correct.
>  	 */

This patch is wrong, so drop it please. I'll do another.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 11:03                                           ` Cyrill Gorcunov
@ 2016-03-10 15:09                                             ` Cyrill Gorcunov
  2016-03-10 18:01                                               ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 15:09 UTC (permalink / raw)
  To: David Miller, alexei.starovoitov, eric.dumazet
  Cc: netdev, solar, vvs, avagin, xemul, vdavydov, khorenko, pablo,
	netfilter-devel

On Thu, Mar 10, 2016 at 02:03:24PM +0300, Cyrill Gorcunov wrote:
> On Thu, Mar 10, 2016 at 01:20:18PM +0300, Cyrill Gorcunov wrote:
> > On Thu, Mar 10, 2016 at 12:16:29AM +0300, Cyrill Gorcunov wrote:
> > > 
> > > Thanks for the explanation, Dave! I'll continue on this task tomorrow,
> > > trying to implement the optimization you proposed.
> > 
> > OK, here are the results for the preliminary patch with conntrack running
> ...
> >  net/ipv4/devinet.c |   13 ++++++++++++-
> >  1 file changed, 12 insertions(+), 1 deletion(-)
> > 
> > Index: linux-ml.git/net/ipv4/devinet.c
> > ===================================================================
> > --- linux-ml.git.orig/net/ipv4/devinet.c
> > +++ linux-ml.git/net/ipv4/devinet.c
> > @@ -403,7 +403,18 @@ no_promotions:
> >  	   So that, this order is correct.
> >  	 */
> 
> This patch is wrong, so drop it please. I'll do another.

Here, I think, is a better variant. The results are good
enough -- 1 sec for cleanup. Does the patch look sane?
---
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c |   15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

Index: linux-ml.git/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
===================================================================
--- linux-ml.git.orig/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ linux-ml.git/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -108,9 +108,22 @@ static int masq_inet_event(struct notifi
 			   unsigned long event,
 			   void *ptr)
 {
-	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
 	struct netdev_notifier_info info;
 
+	if (event == NETDEV_DOWN) {
+		/*
+		 * When we meet a dead device which is
+		 * being released with dozens of addresses
+		 * assigned -- we can optimize the calls
+		 * to the conntrack cleanup and do it
+		 * only once.
+		 */
+		if (ifa->ifa_dev->dead && ifa->ifa_next)
+			return NOTIFY_DONE;
+	}
+
 	netdev_notifier_info_init(&info, dev);
 	return masq_device_event(this, event, &info);
 }

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 15:09                                             ` Cyrill Gorcunov
@ 2016-03-10 18:01                                               ` David Miller
  2016-03-10 18:48                                                 ` Cyrill Gorcunov
  2016-03-10 19:02                                                 ` Cong Wang
  0 siblings, 2 replies; 50+ messages in thread
From: David Miller @ 2016-03-10 18:01 UTC (permalink / raw)
  To: gorcunov
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 10 Mar 2016 18:09:20 +0300

> On Thu, Mar 10, 2016 at 02:03:24PM +0300, Cyrill Gorcunov wrote:
>> On Thu, Mar 10, 2016 at 01:20:18PM +0300, Cyrill Gorcunov wrote:
>> > On Thu, Mar 10, 2016 at 12:16:29AM +0300, Cyrill Gorcunov wrote:
>> > > 
>> > > Thanks for the explanation, Dave! I'll continue on this task tomorrow,
>> > > trying to implement the optimization you proposed.
>> > 
>> > OK, here are the results for the preliminary patch with conntrack running
>> ...
>> >  net/ipv4/devinet.c |   13 ++++++++++++-
>> >  1 file changed, 12 insertions(+), 1 deletion(-)
>> > 
>> > Index: linux-ml.git/net/ipv4/devinet.c
>> > ===================================================================
>> > --- linux-ml.git.orig/net/ipv4/devinet.c
>> > +++ linux-ml.git/net/ipv4/devinet.c
>> > @@ -403,7 +403,18 @@ no_promotions:
>> >  	   So that, this order is correct.
>> >  	 */
>> 
>> This patch is wrong, so drop it please. I'll do another.
> 
> Here, I think, is a better variant. The results are good
> enough -- 1 sec for cleanup. Does the patch look sane?

I'm tempted to say that we should provide these notifier handlers with
the information they need, explicitly, to handle this case.

Most inetdev notifiers actually want to know the individual addresses
that get removed, one by one.  That's handled by the existing
NETDEV_DOWN event and the ifa we pass to that.

But some, like this netfilter masq case, would be satisfied with a
single event that tells them the whole inetdev instance is being torn
down.  Which is the case we care about here.

We currently don't use NETDEV_UNREGISTER for inetdev notifiers, so
maybe we could use that.

And that is consistent with the core netdev notifier that triggers
this call chain in the first place.

Roughly, something like this:

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8c3df2c..6eee5cb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -292,6 +292,11 @@ static void inetdev_destroy(struct in_device *in_dev)
 
 	in_dev->dead = 1;
 
+	if (in_dev->ifa_list)
+		blocking_notifier_call_chain(&inetaddr_chain,
+					     NETDEV_UNREGISTER,
+					     in_dev->ifa_list);
+
 	ip_mc_destroy_dev(in_dev);
 
 	while ((ifa = in_dev->ifa_list) != NULL) {
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index c6eb421..1bb8026 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -111,6 +111,10 @@ static int masq_inet_event(struct notifier_block *this,
 	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
 	struct netdev_notifier_info info;
 
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+	event = NETDEV_DOWN;
+
 	netdev_notifier_info_init(&info, dev);
 	return masq_device_event(this, event, &info);
 }

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 18:01                                               ` David Miller
@ 2016-03-10 18:48                                                 ` Cyrill Gorcunov
  2016-03-10 19:02                                                 ` Cong Wang
  1 sibling, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 18:48 UTC (permalink / raw)
  To: David Miller
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

On Thu, Mar 10, 2016 at 01:01:38PM -0500, David Miller wrote:
> From: Cyrill Gorcunov <gorcunov@gmail.com>
> Date: Thu, 10 Mar 2016 18:09:20 +0300
> 
> > On Thu, Mar 10, 2016 at 02:03:24PM +0300, Cyrill Gorcunov wrote:
> >> On Thu, Mar 10, 2016 at 01:20:18PM +0300, Cyrill Gorcunov wrote:
> >> > On Thu, Mar 10, 2016 at 12:16:29AM +0300, Cyrill Gorcunov wrote:
> >> > > 
> >> > > Thanks for the explanation, Dave! I'll continue on this task tomorrow,
> >> > > trying to implement the optimization you proposed.
> >> > 
> >> > OK, here are the results for the preliminary patch with conntrack running
> >> ...
> >> >  net/ipv4/devinet.c |   13 ++++++++++++-
> >> >  1 file changed, 12 insertions(+), 1 deletion(-)
> >> > 
> >> > Index: linux-ml.git/net/ipv4/devinet.c
> >> > ===================================================================
> >> > --- linux-ml.git.orig/net/ipv4/devinet.c
> >> > +++ linux-ml.git/net/ipv4/devinet.c
> >> > @@ -403,7 +403,18 @@ no_promotions:
> >> >  	   So that, this order is correct.
> >> >  	 */
> >> 
> >> This patch is wrong, so drop it please. I'll do another.
> > 
> > Here, I think, is a better variant. The results are good
> > enough -- 1 sec for cleanup. Does the patch look sane?
> 
> I'm tempted to say that we should provide these notifier handlers with
> the information they need, explicitly, to handle this case.
> 
> Most inetdev notifiers actually want to know the individual addresses
> that get removed, one by one.  That's handled by the existing
> NETDEV_DOWN event and the ifa we pass to that.
> 
> But some, like this netfilter masq case, would be satisfied with a
> single event that tells them the whole inetdev instance is being torn
> down.  Which is the case we care about here.
> 
> We currently don't use NETDEV_UNREGISTER for inetdev notifiers, so
> maybe we could use that.
> 
> And that is consistent with the core netdev notifier that triggers
> this call chain in the first place.
> 
> Roughly, something like this:

I see. Dave, give me some time to test, but I'm sure it'll work.
I don't have a strong opinion here, and your patch looks
pretty fine to me. But maybe people from the netdev camp have
some other ideas.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 18:01                                               ` David Miller
  2016-03-10 18:48                                                 ` Cyrill Gorcunov
@ 2016-03-10 19:02                                                 ` Cong Wang
  2016-03-10 19:55                                                   ` David Miller
  1 sibling, 1 reply; 50+ messages in thread
From: Cong Wang @ 2016-03-10 19:02 UTC (permalink / raw)
  To: David Miller
  Cc: Cyrill Gorcunov, Alexei Starovoitov, Eric Dumazet,
	Linux Kernel Network Developers, solar, Vasily Averin, avagin,
	xemul, vdavydov, khorenko, Pablo Neira Ayuso, netfilter-devel

On Thu, Mar 10, 2016 at 10:01 AM, David Miller <davem@davemloft.net> wrote:
> I'm tempted to say that we should provide these notifier handlers with
> the information they need, explicitly, to handle this case.
>
> Most inetdev notifiers actually want to know the individual addresses
> that get removed, one by one.  That's handled by the existing
> NETDEV_DOWN event and the ifa we pass to that.
>
> But some, like this netfilter masq case, would be satisfied with a
> single event that tells them the whole inetdev instance is being torn
> down.  Which is the case we care about here.
>
> We currently don't use NETDEV_UNREGISTER for inetdev notifiers, so
> maybe we could use that.
>
> And that is consistent with the core netdev notifier that triggers
> this call chain in the first place.
>
> Roughly, something like this:
>
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index 8c3df2c..6eee5cb 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -292,6 +292,11 @@ static void inetdev_destroy(struct in_device *in_dev)
>
>         in_dev->dead = 1;
>
> +       if (in_dev->ifa_list)
> +               blocking_notifier_call_chain(&inetaddr_chain,
> +                                            NETDEV_UNREGISTER,
> +                                            in_dev->ifa_list);
> +
>         ip_mc_destroy_dev(in_dev);


Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
is happening and masq already registers a netdev notifier...



>
>         while ((ifa = in_dev->ifa_list) != NULL) {
> diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> index c6eb421..1bb8026 100644
> --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> @@ -111,6 +111,10 @@ static int masq_inet_event(struct notifier_block *this,
>         struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
>         struct netdev_notifier_info info;
>
> +       if (event != NETDEV_UNREGISTER)
> +               return NOTIFY_DONE;
> +       event = NETDEV_DOWN;
> +
>         netdev_notifier_info_init(&info, dev);
>         return masq_device_event(this, event, &info);
>  }

If masq really doesn't care about inetdev destroy or inetaddr removal,
we should just remove its inetaddr notifier.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 19:02                                                 ` Cong Wang
@ 2016-03-10 19:55                                                   ` David Miller
  2016-03-10 20:01                                                     ` Cyrill Gorcunov
  2016-03-10 21:09                                                     ` Cong Wang
  0 siblings, 2 replies; 50+ messages in thread
From: David Miller @ 2016-03-10 19:55 UTC (permalink / raw)
  To: xiyou.wangcong
  Cc: gorcunov, alexei.starovoitov, eric.dumazet, netdev, solar, vvs,
	avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 10 Mar 2016 11:02:28 -0800

> On Thu, Mar 10, 2016 at 10:01 AM, David Miller <davem@davemloft.net> wrote:
>> I'm tempted to say that we should provide these notifier handlers with
>> the information they need, explicitly, to handle this case.
>>
>> Most inetdev notifiers actually want to know the individual addresses
>> that get removed, one by one.  That's handled by the existing
>> NETDEV_DOWN event and the ifa we pass to that.
>>
>> But some, like this netfilter masq case, would be satisfied with a
>> single event that tells them the whole inetdev instance is being torn
>> down.  Which is the case we care about here.
>>
>> We currently don't use NETDEV_UNREGISTER for inetdev notifiers, so
>> maybe we could use that.
>>
>> And that is consistent with the core netdev notifier that triggers
>> this call chain in the first place.
>>
>> Roughly, something like this:
>>
>> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
>> index 8c3df2c..6eee5cb 100644
>> --- a/net/ipv4/devinet.c
>> +++ b/net/ipv4/devinet.c
>> @@ -292,6 +292,11 @@ static void inetdev_destroy(struct in_device *in_dev)
>>
>>         in_dev->dead = 1;
>>
>> +       if (in_dev->ifa_list)
>> +               blocking_notifier_call_chain(&inetaddr_chain,
>> +                                            NETDEV_UNREGISTER,
>> +                                            in_dev->ifa_list);
>> +
>>         ip_mc_destroy_dev(in_dev);
> 
> 
> Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
> is happening and masq already registers a netdev notifier...

Indeed, good catch.  Therefore:

1) Keep the masq netdev notifier.  That will flush the conntrack table
   for the inetdev_destroy event.

2) Make the inetdev notifier only do something if inetdev->dead is
   false.  (ie. we are flushing an individual address)

And then we don't need the NETDEV_UNREGISTER thing at all:

diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index c6eb421..f71841a 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -108,10 +108,20 @@ static int masq_inet_event(struct notifier_block *this,
 			   unsigned long event,
 			   void *ptr)
 {
-	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
 	struct netdev_notifier_info info;
+	struct in_ifaddr *ifa = ptr;
+	struct in_device *idev;
 
-	netdev_notifier_info_init(&info, dev);
+	/* The masq_dev_notifier will catch the case of the device going
+	 * down.  So if the inetdev is dead and being destroyed we have
+	 * no work to do.  Otherwise this is an individual address removal
+	 * and we have to perform the flush.
+	 */
+	idev = ifa->ifa_dev;
+	if (idev->dead)
+		return NOTIFY_DONE;
+
+	netdev_notifier_info_init(&info, idev->dev);
 	return masq_device_event(this, event, &info);
 }
 

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 19:55                                                   ` David Miller
@ 2016-03-10 20:01                                                     ` Cyrill Gorcunov
  2016-03-10 20:03                                                       ` David Miller
  2016-03-10 21:09                                                     ` Cong Wang
  1 sibling, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 20:01 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Thu, Mar 10, 2016 at 02:55:43PM -0500, David Miller wrote:
> > 
> > Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
> > is happening and masq already registers a netdev notifier...
> 
> Indeed, good catch.  Therefore:
> 
> 1) Keep the masq netdev notifier.  That will flush the conntrack table
>    for the inetdev_destroy event.
> 
> 2) Make the inetdev notifier only do something if inetdev->dead is
>    false.  (ie. we are flushing an individual address)
> 
> And then we don't need the NETDEV_UNREGISTER thing at all:
> 
> diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> index c6eb421..f71841a 100644
> --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> @@ -108,10 +108,20 @@ static int masq_inet_event(struct notifier_block *this,
>  			   unsigned long event,
>  			   void *ptr)
>  {
> -	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
>  	struct netdev_notifier_info info;
> +	struct in_ifaddr *ifa = ptr;
> +	struct in_device *idev;
>  
> -	netdev_notifier_info_init(&info, dev);
> +	/* The masq_dev_notifier will catch the case of the device going
> +	 * down.  So if the inetdev is dead and being destroyed we have
> +	 * no work to do.  Otherwise this is an individual address removal
> +	 * and we have to perform the flush.
> +	 */
> +	idev = ifa->ifa_dev;
> +	if (idev->dead)
> +		return NOTIFY_DONE;
> +
> +	netdev_notifier_info_init(&info, idev->dev);
>  	return masq_device_event(this, event, &info);
>  }

Guys, I'm lost. Currently masq_device_event calls for conntrack
cleanup with the device index, so that once a device is going down,
the appropriate conntracks get dropped. Now, if the device is dead,
nobody will clean up the conntracks?

	Cyrill

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 20:01                                                     ` Cyrill Gorcunov
@ 2016-03-10 20:03                                                       ` David Miller
  2016-03-10 20:13                                                         ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-10 20:03 UTC (permalink / raw)
  To: gorcunov
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 10 Mar 2016 23:01:34 +0300

> On Thu, Mar 10, 2016 at 02:55:43PM -0500, David Miller wrote:
>> > 
>> > Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
>> > is happening and masq already registers a netdev notifier...
>> 
>> Indeed, good catch.  Therefore:
>> 
>> 1) Keep the masq netdev notifier.  That will flush the conntrack table
>>    for the inetdev_destroy event.
>> 
>> 2) Make the inetdev notifier only do something if inetdev->dead is
>>    false.  (ie. we are flushing an individual address)
>> 
>> And then we don't need the NETDEV_UNREGISTER thing at all:
>> 
>> diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
>> index c6eb421..f71841a 100644
>> --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
>> +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
>> @@ -108,10 +108,20 @@ static int masq_inet_event(struct notifier_block *this,
>>  			   unsigned long event,
>>  			   void *ptr)
>>  {
>> -	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
>>  	struct netdev_notifier_info info;
>> +	struct in_ifaddr *ifa = ptr;
>> +	struct in_device *idev;
>>  
>> -	netdev_notifier_info_init(&info, dev);
>> +	/* The masq_dev_notifier will catch the case of the device going
>> +	 * down.  So if the inetdev is dead and being destroyed we have
>> +	 * no work to do.  Otherwise this is an individual address removal
>> +	 * and we have to perform the flush.
>> +	 */
>> +	idev = ifa->ifa_dev;
>> +	if (idev->dead)
>> +		return NOTIFY_DONE;
>> +
>> +	netdev_notifier_info_init(&info, idev->dev);
>>  	return masq_device_event(this, event, &info);
>>  }
> 
> Guys, I'm lost. Currently masq_device_event calls for conntrack
> cleanup with the device index, so that once a device is going down,
> the appropriate conntracks get dropped. Now, if the device is dead,
> nobody will clean up the conntracks?

Both notifiers are run in the inetdev_destroy() case.

Maybe that's what you are missing.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 20:03                                                       ` David Miller
@ 2016-03-10 20:13                                                         ` Cyrill Gorcunov
  2016-03-10 20:19                                                           ` Cyrill Gorcunov
  2016-03-10 21:05                                                           ` David Miller
  0 siblings, 2 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 20:13 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Thu, Mar 10, 2016 at 03:03:11PM -0500, David Miller wrote:
> From: Cyrill Gorcunov <gorcunov@gmail.com>
> Date: Thu, 10 Mar 2016 23:01:34 +0300
> 
> > On Thu, Mar 10, 2016 at 02:55:43PM -0500, David Miller wrote:
> >> > 
> >> > Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
> >> > is happening and masq already registers a netdev notifier...
> >> 
> >> Indeed, good catch.  Therefore:
> >> 
> >> 1) Keep the masq netdev notifier.  That will flush the conntrack table
> >>    for the inetdev_destroy event.
> >> 
> >> 2) Make the inetdev notifier only do something if inetdev->dead is
> >>    false.  (ie. we are flushing an individual address)
> >> 
> >> And then we don't need the NETDEV_UNREGISTER thing at all:
> >> 
> >> diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> >> index c6eb421..f71841a 100644
> >> --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> >> +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
> >> @@ -108,10 +108,20 @@ static int masq_inet_event(struct notifier_block *this,
> >>  			   unsigned long event,
> >>  			   void *ptr)
> >>  {
> >> -	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
> >>  	struct netdev_notifier_info info;
> >> +	struct in_ifaddr *ifa = ptr;
> >> +	struct in_device *idev;
> >>  
> >> -	netdev_notifier_info_init(&info, dev);
> >> +	/* The masq_dev_notifier will catch the case of the device going
> >> +	 * down.  So if the inetdev is dead and being destroyed we have
> >> +	 * no work to do.  Otherwise this is an individual address removal
> >> +	 * and we have to perform the flush.
> >> +	 */
> >> +	idev = ifa->ifa_dev;
> >> +	if (idev->dead)
> >> +		return NOTIFY_DONE;
> >> +
> >> +	netdev_notifier_info_init(&info, idev->dev);
> >>  	return masq_device_event(this, event, &info);
> >>  }
> > 
> > Guys, I'm lost. Currently masq_device_event calls for conntrack
> > cleanup with the device index, so that once a device is going down,
> > the appropriate conntracks get dropped. Now, if the device is dead,
> > nobody will clean up the conntracks?
> 
> Both notifiers are run in the inetdev_destroy() case.
> 
> Maybe that's what you are missing.

No :) Look, here is what I mean. Previously, with your two patches,
we've been calling the nf cleanup for every address, so we had to
make the code call the cleanup only once. Now with the patch above
the code flow is the following:

inetdev_destroy
	in_dev->dead = 1;
	...
	inet_del_ifa
		...
		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
		...
		masq_inet_event
		 ...
		  masq_device_event
			if (idev->dead)
				return NOTIFY_DONE;

and nobody calls for nf_ct_iterate_cleanup, no?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 20:13                                                         ` Cyrill Gorcunov
@ 2016-03-10 20:19                                                           ` Cyrill Gorcunov
  2016-03-10 21:05                                                           ` David Miller
  1 sibling, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 20:19 UTC (permalink / raw)
  To: David Miller, xiyou.wangcong
  Cc: alexei.starovoitov, eric.dumazet, netdev, solar, vvs, avagin,
	xemul, vdavydov, khorenko, pablo, netfilter-devel

On Thu, Mar 10, 2016 at 11:13:51PM +0300, Cyrill Gorcunov wrote:
> > 
> > Both notifiers are run in the inetdev_destroy() case.
> > 
> > Maybe that's what you are missing.
> 
> No :) Look, here is what I mean. Previously, with your two patches,
> we've been calling the nf cleanup for every address, so we had to
> make the code call the cleanup only once. Now with the patch above
> the code flow is the following:

Ah, I'm an idiot, drop the question.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 20:13                                                         ` Cyrill Gorcunov
  2016-03-10 20:19                                                           ` Cyrill Gorcunov
@ 2016-03-10 21:05                                                           ` David Miller
  2016-03-10 21:19                                                             ` Cyrill Gorcunov
  1 sibling, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-10 21:05 UTC (permalink / raw)
  To: gorcunov
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 10 Mar 2016 23:13:51 +0300

> On Thu, Mar 10, 2016 at 03:03:11PM -0500, David Miller wrote:
>> From: Cyrill Gorcunov <gorcunov@gmail.com>
>> Date: Thu, 10 Mar 2016 23:01:34 +0300
>> 
>> > On Thu, Mar 10, 2016 at 02:55:43PM -0500, David Miller wrote:
>> >> > 
>> >> > Hmm, but inetdev_destroy() is only called when NETDEV_UNREGISTER
>> >> > is happening and masq already registers a netdev notifier...
>> >> 
>> >> Indeed, good catch.  Therefore:
>> >> 
>> >> 1) Keep the masq netdev notifier.  That will flush the conntrack table
>> >>    for the inetdev_destroy event.
>> >> 
>> >> 2) Make the inetdev notifier only do something if inetdev->dead is
>> >>    false.  (ie. we are flushing an individual address)
>> >> 
>> >> And then we don't need the NETDEV_UNREGISTER thing at all:
>> >> 
>> >> diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
>> >> index c6eb421..f71841a 100644
>> >> --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
>> >> +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
>> >> @@ -108,10 +108,20 @@ static int masq_inet_event(struct notifier_block *this,
>> >>  			   unsigned long event,
>> >>  			   void *ptr)
>> >>  {
>> >> -	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
>> >>  	struct netdev_notifier_info info;
>> >> +	struct in_ifaddr *ifa = ptr;
>> >> +	struct in_device *idev;
>> >>  
>> >> -	netdev_notifier_info_init(&info, dev);
>> >> +	/* The masq_dev_notifier will catch the case of the device going
>> >> +	 * down.  So if the inetdev is dead and being destroyed we have
>> >> +	 * no work to do.  Otherwise this is an individual address removal
>> >> +	 * and we have to perform the flush.
>> >> +	 */
>> >> +	idev = ifa->ifa_dev;
>> >> +	if (idev->dead)
>> >> +		return NOTIFY_DONE;
>> >> +
>> >> +	netdev_notifier_info_init(&info, idev->dev);
>> >>  	return masq_device_event(this, event, &info);
>> >>  }
>> > 
>> > Guys, I'm lost. Currently masq_device_event calls for conntrack
>> > cleanup with the device index, so that once a device is going down,
>> > the appropriate conntracks get dropped. Now, if the device is dead,
>> > nobody will clean up the conntracks?
>> 
>> Both notifiers are run in the inetdev_destroy() case.
>> 
>> Maybe that's what you are missing.
> 
> No :) Look, here is what I mean. Previously, with your two patches,
> we've been calling the nf cleanup for every address, so we had to
> make the code call the cleanup only once. Now with the patch above
> the code flow is the following:
> 
> inetdev_destroy
> 	in_dev->dead = 1;
> 	...
> 	inet_del_ifa
> 		...
> 		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
> 		...
> 		masq_inet_event
> 		 ...
> 		  masq_device_event
> 			if (idev->dead)
> 				return NOTIFY_DONE;
> 
> and nobody calls for nf_ct_iterate_cleanup, no?

Oh yes they do, from masq's non-inet notifier.  masq registers two
notifiers, one for generic netdev and one for inetdev.
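
For reference, the registration side in nf_nat_masquerade_ipv4.c looks
roughly like this (a condensed sketch; refcounting and module teardown
elided):

	static struct notifier_block masq_dev_notifier = {
		.notifier_call	= masq_device_event,	/* netdev events */
	};

	static struct notifier_block masq_inet_notifier = {
		.notifier_call	= masq_inet_event,	/* inetdev (ifa) events */
	};

	void nf_nat_masquerade_ipv4_register_notifier(void)
	{
		/* Both notifiers fire for the inetdev_destroy case, so
		 * masq_inet_event() can bail out when idev->dead is set
		 * and leave the single flush to masq_dev_notifier.
		 */
		register_netdevice_notifier(&masq_dev_notifier);
		register_inetaddr_notifier(&masq_inet_notifier);
	}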

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 19:55                                                   ` David Miller
  2016-03-10 20:01                                                     ` Cyrill Gorcunov
@ 2016-03-10 21:09                                                     ` Cong Wang
  1 sibling, 0 replies; 50+ messages in thread
From: Cong Wang @ 2016-03-10 21:09 UTC (permalink / raw)
  To: David Miller
  Cc: Cyrill Gorcunov, Alexei Starovoitov, Eric Dumazet,
	Linux Kernel Network Developers, solar, Vasily Averin, avagin,
	xemul, vdavydov, khorenko, Pablo Neira Ayuso, netfilter-devel

On Thu, Mar 10, 2016 at 11:55 AM, David Miller <davem@davemloft.net> wrote:
> Indeed, good catch.  Therefore:
>
> 1) Keep the masq netdev notifier.  That will flush the conntrack table
>    for the inetdev_destroy event.
>
> 2) Make the inetdev notifier only do something if inetdev->dead is
>    false.  (ie. we are flushing an individual address)
>
> And then we don't need the NETDEV_UNREGISTER thing at all:


This makes sense to me. I guess a similar thing needs to be done for IPv6 masq too.

Thanks.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 21:05                                                           ` David Miller
@ 2016-03-10 21:19                                                             ` Cyrill Gorcunov
  2016-03-10 21:59                                                               ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 21:19 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Thu, Mar 10, 2016 at 04:05:21PM -0500, David Miller wrote:
> > 
> > and nobody calls for nf_ct_iterate_cleanup, no?
> 
> Oh yes they do, from masq's non-inet notifier.  masq registers two
> notifiers, one for generic netdev and one for inetdev.

Thanks a lot, David! I'll test it just to be sure.

	Cyrill

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 21:19                                                             ` Cyrill Gorcunov
@ 2016-03-10 21:59                                                               ` Cyrill Gorcunov
  2016-03-10 22:36                                                                 ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 21:59 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Fri, Mar 11, 2016 at 12:19:45AM +0300, Cyrill Gorcunov wrote:
> > 
> > Oh yes they do, from masq's non-inet notifier.  masq registers two
> > notifiers, one for generic netdev and one for inetdev.
> 
> Thanks a lot, David! I'll test it just to be sure.

Works like a charm! So, David, what are the next steps then?
Would you mind gathering all your patches into one (maybe)?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 21:59                                                               ` Cyrill Gorcunov
@ 2016-03-10 22:36                                                                 ` David Miller
  2016-03-10 22:40                                                                   ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: David Miller @ 2016-03-10 22:36 UTC (permalink / raw)
  To: gorcunov
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 11 Mar 2016 00:59:59 +0300

> On Fri, Mar 11, 2016 at 12:19:45AM +0300, Cyrill Gorcunov wrote:
>> > 
>> > Oh yes they do, from masq's non-inet notifier.  masq registers two
>> > notifiers, one for generic netdev and one for inetdev.
>> 
>> Thanks a lot, David! I'll test it just to be sure.
> 
> Works like a charm! So, David, what are the next steps then?
> Would you mind gathering all your patches into one (maybe)?

I'll re-review all of the changes tomorrow and also look into ipv6
masq, to see if it needs the same treatment, as well.

Thanks for all of your help and testing so far.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 22:36                                                                 ` David Miller
@ 2016-03-10 22:40                                                                   ` Cyrill Gorcunov
  2016-03-11 20:40                                                                     ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-10 22:40 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Thu, Mar 10, 2016 at 05:36:30PM -0500, David Miller wrote:
> > 
> > Works like a charm! So, David, what are the next steps then?
> > Would you mind gathering all your patches into one (maybe)?
> 
> I'll re-review all of the changes tomorrow and also look into ipv6
> masq, to see if it needs the same treatment, as well.
> 
> Thanks for all of your help and testing so far.

Thanks a lot, David!

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-10 22:40                                                                   ` Cyrill Gorcunov
@ 2016-03-11 20:40                                                                     ` David Miller
  2016-03-11 20:58                                                                       ` Florian Westphal
                                                                                         ` (2 more replies)
  0 siblings, 3 replies; 50+ messages in thread
From: David Miller @ 2016-03-11 20:40 UTC (permalink / raw)
  To: gorcunov
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 11 Mar 2016 01:40:56 +0300

> On Thu, Mar 10, 2016 at 05:36:30PM -0500, David Miller wrote:
>> > 
>> > Works like a charm! So, David, what are the next steps then?
>> > Would you mind gathering all your patches into one (maybe)?
>> 
>> I'll re-review all of the changes tomorrow and also look into ipv6
>> masq, to see if it needs the same treatment, as well.
>> 
>> Thanks for all of your help and testing so far.
> 
> Thanks a lot, David!

Cyrill please retest this final patch and let me know if it still works
properly.

I looked at ipv6, and it's more complicated.  The problem is that ipv6
doesn't mark the inet6dev object as dead in the NETDEV_DOWN case, in
fact it keeps the object around.  It only releases it and marks it
dead in the NETDEV_UNREGISTER case.

We pay a very large price for having allowed the behavior of ipv6 and
ipv4 to diverge so greatly in these areas :-(

Nevertheless we should try to fix it somehow, maybe we can detect the
situation in another way for the ipv6 side.

====================
ipv4: Don't do expensive useless work during inetdev destroy.

When an inetdev is destroyed, every address assigned to the interface
is removed.  And in this scenario we do two pointless things which can
be very expensive if the number of assigned addresses is large:

1) Address promotion.  We are deleting all addresses, so there is no
   point in doing this.

2) A full nf conntrack table purge for every address.  We only need to
   do this once; that case is already caught by the existing
   masq_dev_notifier, so masq_inet_event() can skip it.

Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index f6303b1..0212591 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -334,6 +334,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 
 	ASSERT_RTNL();
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* 1. Deleting primary ifaddr forces deletion all secondaries
 	 * unless alias promotion is set
 	 **/
@@ -380,6 +383,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			fib_del_ifaddr(ifa, ifa1);
 	}
 
+no_promotions:
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index c6eb421..ea91058 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -108,10 +108,18 @@ static int masq_inet_event(struct notifier_block *this,
 			   unsigned long event,
 			   void *ptr)
 {
-	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
 	struct netdev_notifier_info info;
 
-	netdev_notifier_info_init(&info, dev);
+	/* The masq_dev_notifier will catch the case of the device going
+	 * down.  So if the inetdev is dead and being destroyed we have
+	 * no work to do.  Otherwise this is an individual address removal
+	 * and we have to perform the flush.
+	 */
+	if (idev->dead)
+		return NOTIFY_DONE;
+
+	netdev_notifier_info_init(&info, idev->dev);
 	return masq_device_event(this, event, &info);
 }
 

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-11 20:40                                                                     ` David Miller
@ 2016-03-11 20:58                                                                       ` Florian Westphal
  2016-03-11 21:00                                                                       ` Cyrill Gorcunov
  2016-03-11 21:22                                                                       ` Cyrill Gorcunov
  2 siblings, 0 replies; 50+ messages in thread
From: Florian Westphal @ 2016-03-11 20:58 UTC (permalink / raw)
  To: David Miller
  Cc: gorcunov, xiyou.wangcong, alexei.starovoitov, eric.dumazet,
	netdev, solar, vvs, avagin, xemul, vdavydov, khorenko, pablo,
	netfilter-devel

David Miller <davem@davemloft.net> wrote:
> From: Cyrill Gorcunov <gorcunov@gmail.com>
> Date: Fri, 11 Mar 2016 01:40:56 +0300
> 
> > On Thu, Mar 10, 2016 at 05:36:30PM -0500, David Miller wrote:
> >> > 
> >> > Works like a charm! So David, what are the next steps then?
> >> > Mind to gather all your patches into one (maybe)?
> >> 
> >> I'll re-review all of the changes tomorrow and also look into ipv6
> >> masq, to see if it needs the same treatment, as well.
> >> 
> >> Thanks for all of your help and testing so far.
> > 
> > Thanks a lot, David!
> 
> Cyrill please retest this final patch and let me know if it still works
> properly.
> 
> I looked at ipv6, and it's more complicated.  The problem is that ipv6
> doesn't mark the inet6dev object as dead in the NETDEV_DOWN case, in
> fact it keeps the object around.  It only releases it and marks it
> dead in the NETDEV_UNREGISTER case.
> 
> We pay a very large price for having allowed the behavior of ipv6 and
> ipv4 to diverge so greatly in these areas :-(
> 
> Nevertheless we should try to fix it somehow, maybe we can detect the
> situation in another way for the ipv6 side.

Note that the ipv6 inet notifier is atomic; now that
nf_ct_iterate_cleanup can schedule, the ipv6 masq version defers the
cleanup to a work queue, with a backlog cap of 16.  So in case
of a gazillion events most will be ignored, and teardown should not be
delayed (at least not even close to what ipv4 masq did).
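
Condensed, the ipv6 side works roughly like this (a sketch of the
scheme just described; error paths and module refcounting elided):

	static atomic_t v6_worker_count;	/* backlog cap, max 16 */

	struct masq_dev_work {
		struct work_struct	work;
		struct net		*net;
		int			ifindex;
	};

	static void iterate_cleanup_work(struct work_struct *work)
	{
		struct masq_dev_work *w;

		w = container_of(work, struct masq_dev_work, work);
		/* The expensive table walk runs in process context. */
		nf_ct_iterate_cleanup(w->net, device_cmp,
				      (void *)(long)w->ifindex, 0, 0);
		put_net(w->net);
		kfree(w);
		atomic_dec(&v6_worker_count);
	}

	/* In the atomic inet6addr notifier: once 16 cleanups are
	 * already queued, further events are simply dropped.
	 */
	if (atomic_read(&v6_worker_count) >= 16)
		return NOTIFY_DONE;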

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-11 20:40                                                                     ` David Miller
  2016-03-11 20:58                                                                       ` Florian Westphal
@ 2016-03-11 21:00                                                                       ` Cyrill Gorcunov
  2016-03-11 21:22                                                                       ` Cyrill Gorcunov
  2 siblings, 0 replies; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-11 21:00 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Fri, Mar 11, 2016 at 03:40:46PM -0500, David Miller wrote:
> > 
> > Thanks a lot, David!
> 
> Cyrill please retest this final patch and let me know if it still works
> properly.
...

Thanks David! Give me some time, gonna build and run test.

	Cyrill

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-11 20:40                                                                     ` David Miller
  2016-03-11 20:58                                                                       ` Florian Westphal
  2016-03-11 21:00                                                                       ` Cyrill Gorcunov
@ 2016-03-11 21:22                                                                       ` Cyrill Gorcunov
  2016-03-11 21:59                                                                         ` Cyrill Gorcunov
  2 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-11 21:22 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Fri, Mar 11, 2016 at 03:40:46PM -0500, David Miller wrote:
> > Thanks a lot, David!
> 
> Cyrill please retest this final patch and let me know if it still works
> properly.
> 
> I looked at ipv6, and it's more complicated.  The problem is that ipv6
> doesn't mark the inet6dev object as dead in the NETDEV_DOWN case, in
> fact it keeps the object around.  It only releases it and marks it
> dead in the NETDEV_UNREGISTER case.
> 
> We pay a very large price for having allowed the behavior of ipv6 and
> ipv4 to diverge so greatly in these areas :-(
> 
> Nevertheless we should try to fix it somehow, maybe we can detect the
> situation in another way for the ipv6 side.

David, thanks a lot! But you forgot to merge your patch #2
(once I add it manually on top, it works quite well :)
---
 net/ipv4/fib_frontend.c |    4 ++++
 1 file changed, 4 insertions(+)

Index: linux-ml.git/net/ipv4/fib_frontend.c
===================================================================
--- linux-ml.git.orig/net/ipv4/fib_frontend.c
+++ linux-ml.git/net/ipv4/fib_frontend.c
@@ -922,6 +922,9 @@ void fib_del_ifaddr(struct in_ifaddr *if
 		subnet = 1;
 	}
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* Deletion is more complicated than add.
 	 * We should take care of not to delete too much :-)
 	 *
@@ -997,6 +1000,7 @@ void fib_del_ifaddr(struct in_ifaddr *if
 		}
 	}
 
+no_promotions:
 	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 	if (subnet && ifa->ifa_prefixlen < 31) {

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-11 21:22                                                                       ` Cyrill Gorcunov
@ 2016-03-11 21:59                                                                         ` Cyrill Gorcunov
  2016-03-14  3:29                                                                           ` David Miller
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2016-03-11 21:59 UTC (permalink / raw)
  To: David Miller
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

On Sat, Mar 12, 2016 at 12:22:47AM +0300, Cyrill Gorcunov wrote:
> On Fri, Mar 11, 2016 at 03:40:46PM -0500, David Miller wrote:
> > > Thanks a lot, David!
> > 
> > Cyrill please retest this final patch and let me know if it still works
> > properly.
> > 
> > I looked at ipv6, and it's more complicated.  The problem is that ipv6
> > doesn't mark the inet6dev object as dead in the NETDEV_DOWN case, in
> > fact it keeps the object around.  It only releases it and marks it
> > dead in the NETDEV_UNREGISTER case.
> > 
> > We pay a very large price for having allowed the behavior of ipv6 and
> > ipv4 to diverge so greatly in these areas :-(
> > 
> > Nevertheless we should try to fix it somehow, maybe we can detect the
> > situation in another way for the ipv6 side.
> 
> David, thanks a lot! But you forgot to merge your patch #2
> (once I add it manually on top, it works quite well :)

Here is a cumulative one, which works just brilliantly! Thanks a lot, David!
(I changed the Reported-by tag, since it was Solar Designer who told us
 about the issue; I forgot to mention it in the first report, very
 sorry).
---
From: David Miller <davem@davemloft.net>
Subject: [PATCH] ipv4: Don't do expensive useless work during inetdev destroy.

When an inetdev is destroyed, every address assigned to the interface
is removed.  And in this scenario we do two pointless things which can
be very expensive if the number of assigned addresses is large:

1) Address promotion.  We are deleting all addresses, so there is no
   point in doing this.

2) A full nf conntrack table purge for every address.  We only need to
   do this once; that case is already caught by the existing
   masq_dev_notifier, so masq_inet_event() can skip it.

Reported-by: Solar Designer <solar@openwall.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Tested-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
 net/ipv4/devinet.c                          |    4 ++++
 net/ipv4/fib_frontend.c                     |    4 ++++
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c |   12 ++++++++++--
 3 files changed, 18 insertions(+), 2 deletions(-)

Index: linux-ml.git/net/ipv4/devinet.c
===================================================================
--- linux-ml.git.orig/net/ipv4/devinet.c
+++ linux-ml.git/net/ipv4/devinet.c
@@ -334,6 +334,9 @@ static void __inet_del_ifa(struct in_dev
 
 	ASSERT_RTNL();
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* 1. Deleting primary ifaddr forces deletion all secondaries
 	 * unless alias promotion is set
 	 **/
@@ -380,6 +383,7 @@ static void __inet_del_ifa(struct in_dev
 			fib_del_ifaddr(ifa, ifa1);
 	}
 
+no_promotions:
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
Index: linux-ml.git/net/ipv4/fib_frontend.c
===================================================================
--- linux-ml.git.orig/net/ipv4/fib_frontend.c
+++ linux-ml.git/net/ipv4/fib_frontend.c
@@ -922,6 +922,9 @@ void fib_del_ifaddr(struct in_ifaddr *if
 		subnet = 1;
 	}
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* Deletion is more complicated than add.
 	 * We should take care of not to delete too much :-)
 	 *
@@ -997,6 +1000,7 @@ void fib_del_ifaddr(struct in_ifaddr *if
 		}
 	}
 
+no_promotions:
 	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 	if (subnet && ifa->ifa_prefixlen < 31) {
Index: linux-ml.git/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
===================================================================
--- linux-ml.git.orig/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ linux-ml.git/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -108,10 +108,18 @@ static int masq_inet_event(struct notifi
 			   unsigned long event,
 			   void *ptr)
 {
-	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
 	struct netdev_notifier_info info;
 
-	netdev_notifier_info_init(&info, dev);
+	/* The masq_dev_notifier will catch the case of the device going
+	 * down.  So if the inetdev is dead and being destroyed we have
+	 * no work to do.  Otherwise this is an individual address removal
+	 * and we have to perform the flush.
+	 */
+	if (idev->dead)
+		return NOTIFY_DONE;
+
+	netdev_notifier_info_init(&info, idev->dev);
 	return masq_device_event(this, event, &info);
 }
 

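The whole fix boils down to one early-out on the inetdev's dead flag:
once the device itself is being torn down, the per-address promotion
walk and the per-address conntrack flush are skipped.  Below is a
minimal userspace sketch of that pattern; the types and helpers
(toy_indev, toy_ifaddr, promote_secondaries, del_ifa) are purely
illustrative, not the kernel's structures or functions:

#include <stdbool.h>
#include <stdio.h>

struct toy_ifaddr {
	int id;
	struct toy_ifaddr *next;
};

struct toy_indev {
	bool dead;			/* mirrors in_dev->dead */
	struct toy_ifaddr *ifa_list;
};

/* Stand-in for the O(n) promotion walk done in __inet_del_ifa(). */
static void promote_secondaries(struct toy_ifaddr *ifa)
{
	printf("promoting around address %d (expensive)\n", ifa->id);
}

static void del_ifa(struct toy_indev *idev, struct toy_ifaddr *ifa)
{
	if (idev->dead)
		goto no_promotions;	/* the early-out the patch adds */
	promote_secondaries(ifa);
no_promotions:
	;				/* unlink and free would happen here */
}

int main(void)
{
	struct toy_ifaddr a = { .id = 1, .next = NULL };
	struct toy_ifaddr b = { .id = 2, .next = &a };
	struct toy_indev idev = { .dead = true, .ifa_list = &b };

	/* Tearing down a dead device: nothing is printed, because the
	 * promotion work is skipped for every address on the list. */
	for (struct toy_ifaddr *ifa = idev.ifa_list; ifa; ifa = ifa->next)
		del_ifa(&idev, ifa);
	return 0;
}

The masq_inet_event() hunk is the same idea in notifier form: return
NOTIFY_DONE when idev->dead, leaving the single conntrack flush to
masq_dev_notifier instead of repeating it for every address.
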
^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] net: ipv4 -- Introduce ifa limit per net
  2016-03-11 21:59                                                                         ` Cyrill Gorcunov
@ 2016-03-14  3:29                                                                           ` David Miller
  0 siblings, 0 replies; 50+ messages in thread
From: David Miller @ 2016-03-14  3:29 UTC (permalink / raw)
  To: gorcunov
  Cc: xiyou.wangcong, alexei.starovoitov, eric.dumazet, netdev, solar,
	vvs, avagin, xemul, vdavydov, khorenko, pablo, netfilter-devel

From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 12 Mar 2016 00:59:12 +0300

> Here is a cumulative one, which works just brilliantly! Thanks a lot, David!
> (I changed the Reported-by tag, since it's Solar Designer who told us
>  about the issue; I forgot to mention it in the first report, very
>  sorry.)

Thanks for fixing this up and retesting, applied and queued up for -stable.

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2016-03-14  3:29 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-04 21:39 [RFC] net: ipv4 -- Introduce ifa limit per net Cyrill Gorcunov
2016-03-04 22:50 ` David Miller
2016-03-05  0:08   ` Eric Dumazet
2016-03-05  4:11     ` David Miller
2016-03-05  7:18       ` Cyrill Gorcunov
2016-03-05 15:57       ` Cyrill Gorcunov
2016-03-05 16:33         ` David Miller
2016-03-05 17:00           ` Cyrill Gorcunov
2016-03-05 18:44           ` Cyrill Gorcunov
2016-03-06 10:09             ` Cyrill Gorcunov
2016-03-06 16:23               ` Eric Dumazet
2016-03-06 17:06                 ` Cyrill Gorcunov
2016-03-09 16:39                   ` Cyrill Gorcunov
2016-03-09 16:51                     ` Cyrill Gorcunov
2016-03-09 16:58                     ` Alexei Starovoitov
2016-03-09 17:09                       ` Cyrill Gorcunov
2016-03-09 17:24                         ` David Miller
2016-03-09 17:53                           ` Cyrill Gorcunov
2016-03-09 19:55                             ` Cyrill Gorcunov
2016-03-09 20:27                             ` David Miller
2016-03-09 20:41                               ` Cyrill Gorcunov
2016-03-09 20:47                                 ` David Miller
2016-03-09 20:57                                   ` Cyrill Gorcunov
2016-03-09 21:10                                     ` David Miller
2016-03-09 21:16                                       ` Cyrill Gorcunov
2016-03-10 10:20                                         ` Cyrill Gorcunov
2016-03-10 11:03                                           ` Cyrill Gorcunov
2016-03-10 15:09                                             ` Cyrill Gorcunov
2016-03-10 18:01                                               ` David Miller
2016-03-10 18:48                                                 ` Cyrill Gorcunov
2016-03-10 19:02                                                 ` Cong Wang
2016-03-10 19:55                                                   ` David Miller
2016-03-10 20:01                                                     ` Cyrill Gorcunov
2016-03-10 20:03                                                       ` David Miller
2016-03-10 20:13                                                         ` Cyrill Gorcunov
2016-03-10 20:19                                                           ` Cyrill Gorcunov
2016-03-10 21:05                                                           ` David Miller
2016-03-10 21:19                                                             ` Cyrill Gorcunov
2016-03-10 21:59                                                               ` Cyrill Gorcunov
2016-03-10 22:36                                                                 ` David Miller
2016-03-10 22:40                                                                   ` Cyrill Gorcunov
2016-03-11 20:40                                                                     ` David Miller
2016-03-11 20:58                                                                       ` Florian Westphal
2016-03-11 21:00                                                                       ` Cyrill Gorcunov
2016-03-11 21:22                                                                       ` Cyrill Gorcunov
2016-03-11 21:59                                                                         ` Cyrill Gorcunov
2016-03-14  3:29                                                                           ` David Miller
2016-03-10 21:09                                                     ` Cong Wang
2016-03-09 17:19                     ` David Miller
2016-03-05  6:58   ` Cyrill Gorcunov
