All of lore.kernel.org
 help / color / mirror / Atom feed
* [net-next-2.6 PATCH] net: fast consecutive name allocation
@ 2009-11-13  5:01 Octavian Purdila
  2009-11-13  5:20 ` Octavian Purdila
  0 siblings, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13  5:01 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 621 bytes --]


This patch speeds up the network device name allocation for the case
where a significant number of devices of the same type are created
consecutively.

Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
and sysfs entries disabled:

Without the patch           With the patch

real    0m 43.43s	    real    0m 0.49s
user    0m 0.00s	    user    0m 0.00s
sys     0m 43.43s	    sys     0m 0.48s

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
---
 include/net/net_namespace.h |    3 +++
 net/core/dev.c              |   23 ++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletions(-)

[-- Attachment #2: 3ad8e007a0c929204c3ee7e7afa309e2e53b5b8a.diff --]
[-- Type: text/x-patch, Size: 2073 bytes --]

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0addd45..39c65a2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,9 @@ struct net {
 	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
+	/* fast consecutive name allocation (e.g. eth0, eth1, ...) */
+	char                    fcna_name[IFNAMSIZ];
+	int                     fcna_no;
 
 	/* core fib_rules */
 	struct list_head	rules_ops;
diff --git a/net/core/dev.c b/net/core/dev.c
index ad8e320..008e3c7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -226,8 +226,12 @@ static int list_netdevice(struct net_device *dev)
  */
 static void unlist_netdevice(struct net_device *dev)
 {
+	struct net *net = dev_net(dev);
+
 	ASSERT_RTNL();
 
+	net->fcna_no = -1;
+
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
 	list_del_rcu(&dev->dev_list);
@@ -872,6 +876,16 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 		if (p[1] != 'd' || strchr(p + 2, '%'))
 			return -EINVAL;
 
+		/* avoid fast allocation for strange templates like "fan%dcy" */
+		if (net->fcna_no >= 0 && p[2] == 0 &&
+		    net->fcna_name[p - name] == 0 &&
+		    memcmp(name, net->fcna_name, p - name) == 0) {
+			snprintf(buf, IFNAMSIZ, name, ++net->fcna_no);
+			if (!__dev_get_by_name(net, buf))
+				return net->fcna_no;
+			net->fcna_no = -1;
+		}
+
 		/* Use one page as a bit array of possible slots */
 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 		if (!inuse)
@@ -894,8 +908,15 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 	}
 
 	snprintf(buf, IFNAMSIZ, name, i);
-	if (!__dev_get_by_name(net, buf))
+	if (!__dev_get_by_name(net, buf)) {
+		if (p[2] == 0) {
+			memcpy(net->fcna_name, name, p - name);
+			net->fcna_name[p - name] = 0;
+			net->fcna_no = i;
+		}  else
+			net->fcna_no = -1;
 		return i;
+	}
 
 	/* It is possible to run out of possible slots
 	 * when the name is long and there isn't enough space left

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  5:01 [net-next-2.6 PATCH] net: fast consecutive name allocation Octavian Purdila
@ 2009-11-13  5:20 ` Octavian Purdila
  2009-11-13  6:12   ` Eric Dumazet
  2009-11-14  0:04   ` Stephen Hemminger
  0 siblings, 2 replies; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13  5:20 UTC (permalink / raw)
  To: netdev

On Friday 13 November 2009 07:01:14 you wrote:
> This patch speeds up the network device name allocation for the case
> where a significant number of devices of the same type are created
> consecutively.
> 
> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> and sysfs entries disabled:
> 
> Without the patch           With the patch
> 
> real    0m 43.43s	    real    0m 0.49s
> user    0m 0.00s	    user    0m 0.00s
> sys     0m 43.43s	    sys     0m 0.48s
> 

Oops, pasting root prompts (e.g. # modprobe ....) directly into the git commit message is not a good idea :) Here it is again, with the full commit message.

[net-next-2.6 PATCH] net: fast consecutive name allocation

This patch speeds up the network device name allocation for the case
where a significant number of devices of the same type are created
consecutively.

Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
and sysfs entries disabled:

$ time insmod /lib/modules/dummy.ko numdummies=8000

Without the patch           With the patch

real    0m 43.43s	    real    0m 0.49s
user    0m 0.00s	    user    0m 0.00s
sys     0m 43.43s	    sys     0m 0.48s

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
---
 include/net/net_namespace.h |    3 +++
 net/core/dev.c              |   23 ++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0addd45..39c65a2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,9 @@ struct net {
 	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
+	/* fast consecutive name allocation (e.g. eth0, eth1, ...) */
+	char                    fcna_name[IFNAMSIZ];
+	int                     fcna_no;
 
 	/* core fib_rules */
 	struct list_head	rules_ops;
diff --git a/net/core/dev.c b/net/core/dev.c
index ad8e320..008e3c7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -226,8 +226,12 @@ static int list_netdevice(struct net_device *dev)
  */
 static void unlist_netdevice(struct net_device *dev)
 {
+	struct net *net = dev_net(dev);
+
 	ASSERT_RTNL();
 
+	net->fcna_no = -1;
+
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
 	list_del_rcu(&dev->dev_list);
@@ -872,6 +876,16 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 		if (p[1] != 'd' || strchr(p + 2, '%'))
 			return -EINVAL;
 
+		/* avoid fast allocation for strange templates like "fan%dcy" */
+		if (net->fcna_no >= 0 && p[2] == 0 &&
+		    net->fcna_name[p - name] == 0 &&
+		    memcmp(name, net->fcna_name, p - name) == 0) {
+			snprintf(buf, IFNAMSIZ, name, ++net->fcna_no);
+			if (!__dev_get_by_name(net, buf))
+				return net->fcna_no;
+			net->fcna_no = -1;
+		}
+
 		/* Use one page as a bit array of possible slots */
 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 		if (!inuse)
@@ -894,8 +908,15 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 	}
 
 	snprintf(buf, IFNAMSIZ, name, i);
-	if (!__dev_get_by_name(net, buf))
+	if (!__dev_get_by_name(net, buf)) {
+		if (p[2] == 0) {
+			memcpy(net->fcna_name, name, p - name);
+			net->fcna_name[p - name] = 0;
+			net->fcna_no = i;
+		}  else
+			net->fcna_no = -1;
 		return i;
+	}
 
 	/* It is possible to run out of possible slots
 	 * when the name is long and there isn't enough space left
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  5:20 ` Octavian Purdila
@ 2009-11-13  6:12   ` Eric Dumazet
  2009-11-13  6:26     ` Stephen Hemminger
  2009-11-13  9:55     ` Octavian Purdila
  2009-11-14  0:04   ` Stephen Hemminger
  1 sibling, 2 replies; 28+ messages in thread
From: Eric Dumazet @ 2009-11-13  6:12 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: netdev

Octavian Purdila a écrit :
> On Friday 13 November 2009 07:01:14 you wrote:
>> This patch speeds up the network device name allocation for the case
>> where a significant number of devices of the same type are created
>> consecutively.
>>
>> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
>> and sysfs entries disabled:
>>
>> Without the patch           With the patch
>>
>> real    0m 43.43s	    real    0m 0.49s
>> user    0m 0.00s	    user    0m 0.00s
>> sys     0m 43.43s	    sys     0m 0.48s
>>
> 
> Oops, pasting root prompts (e.g. # modprobe ....) directly into the git commit message is not a good idea :) Here it is again, with the full commit message.
> 
> [net-next-2.6 PATCH] net: fast consecutive name allocation
> 
> This patch speeds up the network device name allocation for the case
> where a significant number of devices of the same type are created
> consecutively.
> 
> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> and sysfs entries disabled:
> 
> $ time insmod /lib/modules/dummy.ko numdummies=8000
> 
> Without the patch           With the patch
> 
> real    0m 43.43s	    real    0m 0.49s
> user    0m 0.00s	    user    0m 0.00s
> sys     0m 43.43s	    sys     0m 0.48s
> 
> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> ---

Honestly I dont like this bloat.

Changing dummy.c is trivial, and you can allocate 100.000.000 dummies if you want now :)

I have not tested this patch yet, but here it is:

[PATCH] dummy: Allow more than 32768 dummies

And speedup name allocation : O(N) instead of O(N^2)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/dummy.c |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 37dcfdc..f600c4c 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -107,12 +107,14 @@ static struct rtnl_link_ops dummy_link_ops __read_mostly = {
 module_param(numdummies, int, 0);
 MODULE_PARM_DESC(numdummies, "Number of dummy pseudo devices");
 
-static int __init dummy_init_one(void)
+static int __init dummy_init_one(int i)
 {
 	struct net_device *dev_dummy;
 	int err;
+	char name[IFNAMSIZ];
 
-	dev_dummy = alloc_netdev(0, "dummy%d", dummy_setup);
+	snprintf(name, IFNAMSIZ, "dummy%d", i);
+	dev_dummy = alloc_netdev(0, name, dummy_setup);
 	if (!dev_dummy)
 		return -ENOMEM;
 
@@ -139,7 +141,7 @@ static int __init dummy_init_module(void)
 	err = __rtnl_link_register(&dummy_link_ops);
 
 	for (i = 0; i < numdummies && !err; i++)
-		err = dummy_init_one();
+		err = dummy_init_one(i);
 	if (err < 0)
 		__rtnl_link_unregister(&dummy_link_ops);
 	rtnl_unlock();


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  6:12   ` Eric Dumazet
@ 2009-11-13  6:26     ` Stephen Hemminger
  2009-11-13  7:09       ` Eric Dumazet
  2009-11-13  9:51       ` Octavian Purdila
  2009-11-13  9:55     ` Octavian Purdila
  1 sibling, 2 replies; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13  6:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Octavian Purdila, netdev

On Fri, 13 Nov 2009 07:12:35 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Octavian Purdila a écrit :
> > On Friday 13 November 2009 07:01:14 you wrote:
> >> This patch speeds up the network device name allocation for the case
> >> where a significant number of devices of the same type are created
> >> consecutively.
> >>
> >> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> >> and sysfs entries disabled:
> >>
> >> Without the patch           With the patch
> >>
> >> real    0m 43.43s	    real    0m 0.49s
> >> user    0m 0.00s	    user    0m 0.00s
> >> sys     0m 43.43s	    sys     0m 0.48s
> >>
> > 

No one has given a reasonable use case for this network device name
explosion. What is the benchmark doing this nonsense, and how do I
get paid to do it...

But I have to say no for another reason. You cause the kernel to choose
a different name for the case where a device is deleted or renamed.
The old code would find and fill the hole when a new device was added.

Since this is a semantic ABI change, the kind that drives users nuts.

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  6:26     ` Stephen Hemminger
@ 2009-11-13  7:09       ` Eric Dumazet
  2009-11-13  9:51       ` Octavian Purdila
  1 sibling, 0 replies; 28+ messages in thread
From: Eric Dumazet @ 2009-11-13  7:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Octavian Purdila, netdev

Stephen Hemminger a écrit :
> No one has give a reasonable use case for this network device name
> explosion, what is the benchmark doing this nosense, and how do I
> get paid to do it...
> 
> But I have to say no for another reason. You cause the kernel to choose
> a different name for the case where a device is deleted or renamed.
> The old code would find and fill the hole when a new device was added.
> 
> Since this is a semantic ABI change, the kind that drives users nuts.
> 

Yes, I see your point Stephen, but it would be trivial to skip already
used devices as well with little change.



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  6:26     ` Stephen Hemminger
  2009-11-13  7:09       ` Eric Dumazet
@ 2009-11-13  9:51       ` Octavian Purdila
  2009-11-13 22:29         ` Stephen Hemminger
  1 sibling, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13  9:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, netdev

On Friday 13 November 2009 08:26:08 you wrote:
> On Fri, 13 Nov 2009 07:12:35 +0100
> 
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Octavian Purdila a écrit :
> > > On Friday 13 November 2009 07:01:14 you wrote:
> > >> This patch speeds up the network device name allocation for the case
> > >> where a significant number of devices of the same type are created
> > >> consecutively.
> > >>
> > >> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > >> and sysfs entries disabled:
> > >>
> > >> Without the patch           With the patch
> > >>
> > >> real    0m 43.43s	    real    0m 0.49s
> > >> user    0m 0.00s	    user    0m 0.00s
> > >> sys     0m 43.43s	    sys     0m 0.48s
> 
> No one has give a reasonable use case for this network device name
> explosion, what is the benchmark doing this nosense, and how do I
> get paid to do it...
> 

For us the usecase is creating interfaces that get used by applications that 
generate all sorts of traffic. This allows us to simulate realistic end user 
traffic (e.g. coming from a full blown stack). That sounds reasonable to us :)

Also, I've seen other people here report using more than 8000 interfaces.

> But I have to say no for another reason. You cause the kernel to choose
> a different name for the case where a device is deleted or renamed.
> The old code would find and fill the hole when a new device was added.
> 
> Since this is a semantic ABI change, the kind that drives users nuts.
> 

The intent was to keep the old behavior. When the device is deleted we stop 
fast allocation and we resume it only after we go through the old code once 
again. Did I miss something?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  6:12   ` Eric Dumazet
  2009-11-13  6:26     ` Stephen Hemminger
@ 2009-11-13  9:55     ` Octavian Purdila
  2009-11-13 16:40       ` Ben Greear
  1 sibling, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13  9:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Friday 13 November 2009 08:12:35 you wrote:
> Octavian Purdila a écrit :
> > On Friday 13 November 2009 07:01:14 you wrote:
> >> This patch speeds up the network device name allocation for the case
> >> where a significant number of devices of the same type are created
> >> consecutively.
> >>
> >> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> >> and sysfs entries disabled:
> >>
> >> Without the patch           With the patch
> >>
> >> real    0m 43.43s	    real    0m 0.49s
> >> user    0m 0.00s	    user    0m 0.00s
> >> sys     0m 43.43s	    sys     0m 0.48s
> >
> > Oops, pasting root prompts (e.g. # modprobe ....) directly into the git
> > commit message is not a good idea :) Here it is again, with the full
> > commit message.
> >
> > [net-next-2.6 PATCH] net: fast consecutive name allocation
> >
> > This patch speeds up the network device name allocation for the case
> > where a significant number of devices of the same type are created
> > consecutively.
> >
> > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > and sysfs entries disabled:
> >
> > $ time insmod /lib/modules/dummy.ko numdummies=8000
> >
> > Without the patch           With the patch
> >
> > real    0m 43.43s	    real    0m 0.49s
> > user    0m 0.00s	    user    0m 0.00s
> > sys     0m 43.43s	    sys     0m 0.48s
> >
> > Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> > ---
> 
> Honestly I dont like this bloat.
> 
> Changing dummy.c is trivial, and you can allocate 100.000.000 dummies if
>  you want now :)
> 

Yep we can do that - actually we are doing exactly this in our drivers. But in 
that way, you get to "bloat" every driver which needs this. 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  9:55     ` Octavian Purdila
@ 2009-11-13 16:40       ` Ben Greear
  0 siblings, 0 replies; 28+ messages in thread
From: Ben Greear @ 2009-11-13 16:40 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: Eric Dumazet, netdev

Octavian Purdila wrote:
> On Friday 13 November 2009 08:12:35 you wrote:
>   
>> Octavian Purdila a écrit :
>>     
>>> On Friday 13 November 2009 07:01:14 you wrote:
>>>       
>>>> This patch speeds up the network device name allocation for the case
>>>> where a significant number of devices of the same type are created
>>>> consecutively.
>>>>
>>>> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
>>>> and sysfs entries disabled:
>>>>
>>>> Without the patch           With the patch
>>>>
>>>> real    0m 43.43s	    real    0m 0.49s
>>>> user    0m 0.00s	    user    0m 0.00s
>>>> sys     0m 43.43s	    sys     0m 0.48s
>>>>         
>>> Oops, pasting root prompts (e.g. # modprobe ....) directly into the git
>>> commit message is not a good idea :) Here it is again, with the full
>>> commit message.
>>>
>>> [net-next-2.6 PATCH] net: fast consecutive name allocation
>>>
>>> This patch speeds up the network device name allocation for the case
>>> where a significant number of devices of the same type are created
>>> consecutively.
>>>
>>> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
>>> and sysfs entries disabled:
>>>
>>> $ time insmod /lib/modules/dummy.ko numdummies=8000
>>>
>>> Without the patch           With the patch
>>>
>>> real    0m 43.43s	    real    0m 0.49s
>>> user    0m 0.00s	    user    0m 0.00s
>>> sys     0m 43.43s	    sys     0m 0.48s
>>>
>>> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
>>> ---
>>>       
>> Honestly I dont like this bloat.
>>
>> Changing dummy.c is trivial, and you can allocate 100.000.000 dummies if
>>  you want now :)
>>
>>     
>
> Yep we can do that - actually we are doing exactly this in our drivers. But in 
> that way, you get to "bloat" every driver which needs this. 
>   
For mac-vlans, .1q vlans, VETH, at least, you can choose the name when 
you create them
via 'ip'.  So, you can do the logic in user-space.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  9:51       ` Octavian Purdila
@ 2009-11-13 22:29         ` Stephen Hemminger
  2009-11-13 22:40           ` Benjamin LaHaise
  0 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 22:29 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: Eric Dumazet, netdev

On Fri, 13 Nov 2009 11:51:31 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:

> For us the usecase is creating interfaces that get used by applications that 
> generate all sorts of traffic. This allows us to simulate realistic end user 
> traffic (e.g. coming from a full blown stack). That sounds reasonable to us :)

So it is lots of pseudo-devices for a special purpose test machine.
That's great use of Linux, but not a case worth optimizing for in the mainline kernel.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 22:29         ` Stephen Hemminger
@ 2009-11-13 22:40           ` Benjamin LaHaise
  2009-11-13 22:49             ` Stephen Hemminger
  0 siblings, 1 reply; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-13 22:40 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Octavian Purdila, Eric Dumazet, netdev

On Fri, Nov 13, 2009 at 02:29:39PM -0800, Stephen Hemminger wrote:
> So it is lots of pseudo-devices for a special purpose test machine.
> That's great use of Linux, but not a case worth optimizing for in the mainline kernel.

He's not the only one who needs that, I certainly do.  BRAS application 
where traffic is being aggregated (ie PPPoE and L2TP servers) from lots of 
customers requires it.  There are also people hitting the same scaling 
issues in embedded PPPoE on wireless networks.

		-ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 22:40           ` Benjamin LaHaise
@ 2009-11-13 22:49             ` Stephen Hemminger
  2009-11-13 23:35               ` Benjamin LaHaise
  2009-11-14  7:08               ` Benny Amorsen
  0 siblings, 2 replies; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 22:49 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Octavian Purdila, Eric Dumazet, netdev

On Fri, 13 Nov 2009 17:40:43 -0500
Benjamin LaHaise <bcrl@lhnet.ca> wrote:

> On Fri, Nov 13, 2009 at 02:29:39PM -0800, Stephen Hemminger wrote:
> > So it is lots of pseudo-devices for a special purpose test machine.
> > That's great use of Linux, but not a case worth optimizing for in the mainline kernel.
> 
> He's not the only one who needs that, I certainly do.  BRAS application 
> where traffic is being aggregated (ie PPPoE and L2TP servers) from lots of 
> customers requires it.  There are also people hitting the same scaling 
> issues in embedded PPPoE on wireless networks.
> 
> 		-ben

Then maybe network devices aren't the right layering model. At some
point the paradigm has to be re-examined.

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 22:49             ` Stephen Hemminger
@ 2009-11-13 23:35               ` Benjamin LaHaise
  2009-11-13 23:39                 ` Stephen Hemminger
  2009-11-14  7:08               ` Benny Amorsen
  1 sibling, 1 reply; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-13 23:35 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Octavian Purdila, Eric Dumazet, netdev

On Fri, Nov 13, 2009 at 02:49:37PM -0800, Stephen Hemminger wrote:
> Then maybe network devices aren't the right layering model. At some
> point the paradigm has to be re-examined.

What is the right model for dealing with lots of connections to users and 
routes?  This problem isn't going to go away given the increases in 
connectivity and processing power that happen each year.  Today, software 
routing of 10Gbps links is a reality -- part of what comes with that ability 
of hardware is the need to deal with the fact that 10Gbps aggregates a lot 
of users.

		-ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 23:35               ` Benjamin LaHaise
@ 2009-11-13 23:39                 ` Stephen Hemminger
  2009-11-13 23:52                   ` Benjamin LaHaise
  0 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 23:39 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Octavian Purdila, Eric Dumazet, netdev

On Fri, 13 Nov 2009 18:35:04 -0500
Benjamin LaHaise <bcrl@lhnet.ca> wrote:

> On Fri, Nov 13, 2009 at 02:49:37PM -0800, Stephen Hemminger wrote:
> > Then maybe network devices aren't the right layering model. At some
> > point the paradigm has to be re-examined.
> 
> What is the right model for dealing with lots of connections to users and 
> routes?  This problem isn't going to go away given the increases in 
> connectivity and processing power that happen each year.  Today, software 
> routing of 10Gbps links is a reality -- part of what comes with that ability 
> of hardware is the need to deal with the fact that 10Gbps aggregates a lot 
> of users.
> 
> 		-ben

Well TCP handles lots of connections, but a socket has different overhead
than a network device. Why should 10Gbps need 10K PPPoE sessions?
Even Vlan's are less overhead than PPP

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 23:39                 ` Stephen Hemminger
@ 2009-11-13 23:52                   ` Benjamin LaHaise
  2009-11-14  2:59                     ` David Miller
  0 siblings, 1 reply; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-13 23:52 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Octavian Purdila, Eric Dumazet, netdev

On Fri, Nov 13, 2009 at 03:39:24PM -0800, Stephen Hemminger wrote:
> Well TCP handles lots of connections, but a socket has different overhead
> than a network device. Why should 10Gbps need 10K PPPoE sessions?
> Even Vlan's are less overhead than PPP

PPP's overhead is acceptable.  It makes managing networks a lot easier, since 
the authentication done by PPP is able to look up any end user specific 
information required (ie static ips and routes), while the access part of 
the network is a fairly generic config that uses switches and things like the 
GVRP.  Without that, the configuration of any aggregation switch becomes a 
huge management nightmare.

If you don't want the overhead from this kind of scaling, stick it under a 
config option, but please don't stop other people from pushing Linux into 
new uses which have these scaling requirements.

		-ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13  5:20 ` Octavian Purdila
  2009-11-13  6:12   ` Eric Dumazet
@ 2009-11-14  0:04   ` Stephen Hemminger
  2009-11-14  0:14     ` Octavian Purdila
  1 sibling, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-14  0:04 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: netdev

On Fri, 13 Nov 2009 07:20:19 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:

> On Friday 13 November 2009 07:01:14 you wrote:
> > This patch speeds up the network device name allocation for the case
> > where a significant number of devices of the same type are created
> > consecutively.
> > 
> > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > and sysfs entries disabled:
> > 
> > Without the patch           With the patch
> > 
> > real    0m 43.43s	    real    0m 0.49s
> > user    0m 0.00s	    user    0m 0.00s
> > sys     0m 43.43s	    sys     0m 0.48s

Since the main overhead here is building the bitmap table used in the
name scan, why not maintain the bitmap table between calls by
implementing an rbtree with prefix -> bitmap?
The tree would have to be limited and per namespace but then you
could handle the general case of adding a device, then its vlans,
then another device, ...


-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14  0:04   ` Stephen Hemminger
@ 2009-11-14  0:14     ` Octavian Purdila
  2009-11-14  0:20       ` Stephen Hemminger
  0 siblings, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-14  0:14 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

On Saturday 14 November 2009 02:04:45 you wrote:
> On Fri, 13 Nov 2009 07:20:19 +0200
> 
> Octavian Purdila <opurdila@ixiacom.com> wrote:
> > On Friday 13 November 2009 07:01:14 you wrote:
> > > This patch speeds up the network device name allocation for the case
> > > where a significant number of devices of the same type are created
> > > consecutively.
> > >
> > > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > > and sysfs entries disabled:
> > >
> > > Without the patch           With the patch
> > >
> > > real    0m 43.43s	    real    0m 0.49s
> > > user    0m 0.00s	    user    0m 0.00s
> > > sys     0m 43.43s	    sys     0m 0.48s
> 
> Since the main overhead here is building the bitmap table used in the
> name scan. Why not mantain the bitmap table between calls by
> implementing a rbtree with prefix -> bitmap.
> The tree would have to be limited and per namespace but then you
> could handle the general case of adding a device, then its vlans,
> then another device, ...
> 

I'll do that !

That was my original intent but I thought it would be too much bloat :) But I 
see your point: even if it is more complex, it's more useful.

Thanks,
tavi

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14  0:14     ` Octavian Purdila
@ 2009-11-14  0:20       ` Stephen Hemminger
  0 siblings, 0 replies; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-14  0:20 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: netdev

On Sat, 14 Nov 2009 02:14:21 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:

> On Saturday 14 November 2009 02:04:45 you wrote:
> > On Fri, 13 Nov 2009 07:20:19 +0200
> > 
> > Octavian Purdila <opurdila@ixiacom.com> wrote:
> > > On Friday 13 November 2009 07:01:14 you wrote:
> > > > This patch speeds up the network device name allocation for the case
> > > > where a significant number of devices of the same type are created
> > > > consecutively.
> > > >
> > > > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > > > and sysfs entries disabled:
> > > >
> > > > Without the patch           With the patch
> > > >
> > > > real    0m 43.43s	    real    0m 0.49s
> > > > user    0m 0.00s	    user    0m 0.00s
> > > > sys     0m 43.43s	    sys     0m 0.48s
> > 
> > Since the main overhead here is building the bitmap table used in the
> > name scan. Why not mantain the bitmap table between calls by
> > implementing a rbtree with prefix -> bitmap.
> > The tree would have to be limited and per namespace but then you
> > could handle the general case of adding a device, then its vlans,
> > then another device, ...
> > 
> 
> I'll do that !
> 
> That was my original intent but I thought it would be too much bloat :) But I 
> see your point, even if it is more complex, its more useful.

There might even be a VM notifier hook that could be used to drop the whole
tree if any memory pressure was felt.

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 23:52                   ` Benjamin LaHaise
@ 2009-11-14  2:59                     ` David Miller
  2009-11-14  6:24                       ` Benjamin LaHaise
  2009-11-14 22:36                       ` Mark Smith
  0 siblings, 2 replies; 28+ messages in thread
From: David Miller @ 2009-11-14  2:59 UTC (permalink / raw)
  To: bcrl; +Cc: shemminger, opurdila, eric.dumazet, netdev

From: Benjamin LaHaise <bcrl@lhnet.ca>
Date: Fri, 13 Nov 2009 18:52:10 -0500

> If you don't want the overhead from this kind of scaling, stick it under a 
> config option, but please don't stop other people from pushing Linux into 
> new uses which have these scaling requirements.

This 'scaling requirement' only exists in environments where people
undersubscribe their networks, right?

I'm not saying we won't put scaling into these areas, I'm just trying
to make a point to show that this "need" only exists because people
have purposefully created these situations where they feel the need to
massively control their users usage in order to generate revenue.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14  2:59                     ` David Miller
@ 2009-11-14  6:24                       ` Benjamin LaHaise
  2009-11-14 22:36                       ` Mark Smith
  1 sibling, 0 replies; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-14  6:24 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, opurdila, eric.dumazet, netdev

On Fri, Nov 13, 2009 at 06:59:37PM -0800, David Miller wrote:
> From: Benjamin LaHaise <bcrl@lhnet.ca>
> Date: Fri, 13 Nov 2009 18:52:10 -0500
> 
> > If you don't want the overhead from this kind of scaling, stick it under a 
> > config option, but please don't stop other people from pushing Linux into 
> > new uses which have these scaling requirements.
> 
> This 'scaling requirement' only exists in environments where people
> undersubscribe their networks, right?

Depends on how you look at things.  The case of lots of interfaces going 
up/down can occur during normal operations.  The incumbent telco in this 
area has occasional flaps that reset thousands of sessions.  The problem 
relates to how things flop over to a different path within their network, 
as they don't provide hot standby circuits for all the aggregated traffic 
coming in -- a link down results in a flap of all the L2TP sessions.  As 
for it being underprovisioned, that doesn't really apply.  The core LNS 
boxes are kept from having saturated links, as that results in poor user 
performance.  Plus they have substantially more CPU than embedded routers.

> I'm not saying we won't put scaling into these areas, I'm just trying
> to make a point to show that this "need" only exists because people
> have purposefully created these situations where they feel the need to
> massively control their users usage in order to generate revenue.

I've finally got some of the userspace bits necessary for parallel network 
device creation wired up.  Will reducing the granularity of rtnl_lock() for 
devices which can handle it be okay?  That will get a factor of 4 to 8 
improvement from current single socket hardware.

The other way I'm working around the scaling issues is to use network 
namespaces.  Babylon (the L2TP/PPPoE stack I'm working on) can now split 
interface creation across some number of network namespaces.  This keeps 
the number of interfaces in a given net instance down to 5-10,000.  That 
really helps avoid some of the scaling issues, as we're pretty good in 
that range.

The worst part of all the overhead during setup and teardown is that very 
little traffic can pass while this is occurring, effectively making it an 
outage, hence the desire to minimise outage situations.

		-ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-13 22:49             ` Stephen Hemminger
  2009-11-13 23:35               ` Benjamin LaHaise
@ 2009-11-14  7:08               ` Benny Amorsen
  2009-11-14  7:21                 ` Eric Dumazet
  1 sibling, 1 reply; 28+ messages in thread
From: Benny Amorsen @ 2009-11-14  7:08 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Benjamin LaHaise, Octavian Purdila, Eric Dumazet, netdev

Stephen Hemminger <shemminger@vyatta.com> writes:

> Then maybe network devices aren't the right layering model. At some
> point the paradigm has to be re-examined.

I'm not quite sure where this becomes a problem. We have 1185 network
interfaces (VLAN's) on one box. Boot time is a problem, but other than
that it works ok. If something like this would help speed up booting,
that would be very nice.


/Benny


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14  7:08               ` Benny Amorsen
@ 2009-11-14  7:21                 ` Eric Dumazet
  2009-11-14 16:16                   ` Ben Greear
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2009-11-14  7:21 UTC (permalink / raw)
  To: Benny Amorsen
  Cc: Stephen Hemminger, Benjamin LaHaise, Octavian Purdila, netdev

Benny Amorsen a écrit :
> Stephen Hemminger <shemminger@vyatta.com> writes:
> 
>> Then maybe network devices aren't the right layering model. At some
>> point the paradigm has to be re-examined.
> 
> I'm not quite sure where this becomes a problem. We have 1185 network
> interfaces (VLAN's) on one box. Boot time is a problem, but other than
> that it works ok. If something like this would help speed up booting,
> that would be very nice.
> 

It would be very nice if you tell us why booting is very long,
ie what is done and how much time it takes.

It's clear that ~1000 vlans is quite reasonable :)



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14  7:21                 ` Eric Dumazet
@ 2009-11-14 16:16                   ` Ben Greear
  0 siblings, 0 replies; 28+ messages in thread
From: Ben Greear @ 2009-11-14 16:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Benny Amorsen, Stephen Hemminger, Benjamin LaHaise,
	Octavian Purdila, netdev

Eric Dumazet wrote:
> Benny Amorsen a écrit :
>   
>> Stephen Hemminger <shemminger@vyatta.com> writes:
>>
>>     
>>> Then maybe network devices aren't the right layering model. At some
>>> point the paradigm has to be re-examined.
>>>       
>> I'm not quite sure where this becomes a problem. We have 1185 network
>> interfaces (VLAN's) on one box. Boot time is a problem, but other than
>> that it works ok. If something like this would help speed up booting,
>> that would be very nice.
>>
>>     
>
> It would be very nice if you tell us why booting is very long,
> ie what is done and how much time it takes.
>
> It's clear that ~1000 vlans is quite reasonable :)
>   
At least sometimes, hotplug can degrade into a nasty case where it runs 
'ifconfig -a' for each
interface created.   (This was F11 if I recall correctly).

If you configure hotplug to ignore vlans, it might help your boot time.

Thanks,
Ben

>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   


-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14  2:59                     ` David Miller
  2009-11-14  6:24                       ` Benjamin LaHaise
@ 2009-11-14 22:36                       ` Mark Smith
  2009-11-15  1:22                         ` Stephen Hemminger
  2009-11-15  1:55                         ` Denys Fedoryschenko
  1 sibling, 2 replies; 28+ messages in thread
From: Mark Smith @ 2009-11-14 22:36 UTC (permalink / raw)
  To: David Miller; +Cc: bcrl, shemminger, opurdila, eric.dumazet, netdev

On Fri, 13 Nov 2009 18:59:37 -0800 (PST)
David Miller <davem@davemloft.net> wrote:

> From: Benjamin LaHaise <bcrl@lhnet.ca>
> Date: Fri, 13 Nov 2009 18:52:10 -0500
> 
> > If you don't want the overhead from this kind of scaling, stick it under a 
> > config option, but please don't stop other people from pushing Linux into 
> > new uses which have these scaling requirements.
> 
> This 'scaling requirement' only exists in environments where people
> undersubscribe their networks, right?
> 
> I'm not saying we won't put scaling into these areas, I'm just trying
> to make a point to show that this "need" only exists because people
> have purposefully created these situations where they feel the need to
> massively control their users usage in order to generate revenue.

I don't understand that comment, and I work for (and
designed most of the infrastructure for) an ISP that usually has
well over 40 000 concurrent PPPoE sessions at any one time.

The fundamental purpose of PPPoE is nothing to do with any scaling or
architecture, it is purely to make a more modern shared networking
technology like Ethernet look like high speed dial up. This has occurred
mainly because when broadband came along it allowed ISPs to introduce
it quickly, without having to also upgrade their dial up oriented
backend systems i.e. customer authentication/accounting and customer
support systems. It wasn't ideal then and it isn't ideal now. PPPoE adds
an overhead of 8 bytes per packet, yet the only thing it is doing is
changing ethernet from multipoint to point-to-point so PPP can run
over it and providing ISPs with an ability to identify the subscriber.
There are other methods to solve customer identity problem without the
PPPoE overheads. Moving to them however can be a long drawn out process
because it also means changes to customer's CPE settings, or running
the old and new methods in parallel for the foreseeable future.

On the occasions I've looked at whether a Linux box would be an
alternative to the Cisco BRAS platform we use, the last time I looked
the number of sessions people were saying they were running was
500. I don't consider Linux to be feasible in that role until you're
able to run at least 5000 sessions on a single box. I'm a bit unusual
in that regard, as I prefer the "lots of smaller boxes, increased chances
of failure, but smaller consequences of failure" model - you manage the
larger number of them via configuration templating / scripted
change deployment. You need to choose your subscriber per device level,
and if 500 is the current limit for Linux, then in my opinion it is
currently too low for my application. Others in the industry might
consider 5000 too low, as they are running devices that can handle 32
000 or 64 000 PPPoE sessions.



> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14 22:36                       ` Mark Smith
@ 2009-11-15  1:22                         ` Stephen Hemminger
  2009-11-15  1:49                           ` Mark Smith
  2009-11-15  1:55                         ` Denys Fedoryschenko
  1 sibling, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-15  1:22 UTC (permalink / raw)
  To: Mark Smith; +Cc: David Miller, bcrl, opurdila, eric.dumazet, netdev

On Sun, 15 Nov 2009 09:06:04 +1030
Mark Smith <lk-netdev@lk-netdev.nosense.org> wrote:

> The fundamental purpose of PPPoE is nothing to do with any scaling or
> architecture, it is purely to make a more modern shared networking
> technology like Ethernet look like high speed dial up. This has occurred
> mainly because when broadband came along it allowed ISPs to introduce
> it quickly, without having to also upgrade their dial up oriented
> backend systems i.e. customer authentication/accounting and customer
> support systems. It wasn't ideal then and it isn't ideal now. PPPoE adds
> an overhead of 8 bytes per packet, yet the only thing it is doing is
> changing ethernet from multipoint to point-to-point so PPP can run
> over it and providing ISPs with an ability to identify the subscriber.
> There are other methods to solve customer identity problem without the
> PPPoE overheads. Moving to them however can be a long drawn out process
> because it also means changes to customer's CPE settings, or running
> the old and new methods in parallel for the foreseeable future.

Carriers still haven't figured out that circuit switched networks don't
scale. They just can't learn the lesson of the Internet.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-15  1:22                         ` Stephen Hemminger
@ 2009-11-15  1:49                           ` Mark Smith
  0 siblings, 0 replies; 28+ messages in thread
From: Mark Smith @ 2009-11-15  1:49 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, bcrl, opurdila, eric.dumazet, netdev

On Sat, 14 Nov 2009 17:22:24 -0800
Stephen Hemminger <shemminger@vyatta.com> wrote:

> On Sun, 15 Nov 2009 09:06:04 +1030
> Mark Smith <lk-netdev@lk-netdev.nosense.org> wrote:
> 
> > The fundamental purpose of PPPoE is nothing to do with any scaling or
> > architecture, it is purely to make a more modern shared networking
> > technology like Ethernet look like high speed dial up. This has occurred
> > mainly because when broadband came along it allowed ISPs to introduce
> > it quickly, without having to also upgrade their dial up oriented
> > backend systems i.e. customer authentication/accounting and customer
> > support systems. It wasn't ideal then and it isn't ideal now. PPPoE adds
> > an overhead of 8 bytes per packet, yet the only thing it is doing is
> > changing ethernet from multipoint to point-to-point so PPP can run
> > over it and providing ISPs with an ability to identify the subscriber.
> > There are other methods to solve customer identity problem without the
> > PPPoE overheads. Moving to them however can be a long drawn out process
> > because it also means changes to customer's CPE settings, or running
> > the old and new methods in parallel for the foreseeable future.
> 
> Carriers still haven't figured out that circuit switched networks don't
> scale. They just can't learn the lesson of the Internet.

I don't really think that is the case. The authors of the PPPoE
spec were all from "Internet" companies, including UUNET, the first
Internet company, and the largest at the time, so I'm sure they all knew
about Internet scaling.

Here's what they had to say in the RFC2516 intro:

"  Modern access technologies are faced with several conflicting goals.
   It is desirable to connect multiple hosts at a remote site through
   the same customer premise access device.  It is also a goal to
   provide access control and billing functionality in a manner similar
   to dial-up services using PPP.  In many access technologies, the most
   cost effective method to attach multiple hosts to the customer
   premise access device, is via Ethernet.  In addition, it is desirable
   to keep the cost of this device as low as possible while requiring
   little or no configuration."



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-14 22:36                       ` Mark Smith
  2009-11-15  1:22                         ` Stephen Hemminger
@ 2009-11-15  1:55                         ` Denys Fedoryschenko
  2009-11-15  7:48                           ` Eric Dumazet
  2009-11-15 16:50                           ` Benjamin LaHaise
  1 sibling, 2 replies; 28+ messages in thread
From: Denys Fedoryschenko @ 2009-11-15  1:55 UTC (permalink / raw)
  To: Mark Smith; +Cc: David Miller, bcrl, shemminger, opurdila, eric.dumazet, netdev

On Sunday 15 November 2009 00:36:04 Mark Smith wrote:
> On the occasions I've looked at whether a Linux box would be an
> alternative to the Cisco BRAS platform we use, the last time I looked
> the number of sessions people were saying they were running was
> 500. I don't consider Linux to be feasible in that role until you're
> able to run at least 5000 sessions on a single box. I'm a bit unusual
I am running up to 3500 on single NAS, but there is only 3 biggest one like 
this, and i am limited only by subscribers on this location (network is 
distributed over the country, and i have around 200 NAS servers running in 
summary). And it is just PC bought from nearest supermarket with cheap PCI 
RTL8169, and similar quality LOM adapter e1000e. Everything running on 
cheapest USB flash from same supermarket.

For my case running Linux NAS on cheap PC's is only choice. It is 3rd world 
country, and many reasons (i can explain each, but it is not technical 
subject) doesn't let me to think, that "professional" equipment is feasible 
for me.

Here people build networks on cheapest unmanageable switches, same 
cost/quality 802.11b/g wireless networks, and only a way to terminate them 
reliably is PPPoE. I know, it is also weak and easy to break, but it is 
single choice i have.
I know also ISP's in Russia, who have somehow partially "managed" networks, 
but PPPoE letting them to drop running costs.

And interface creation speed is important for me, when electricity goes down 
here, many customers disconnects (up to 500 on single NAS), and then join 
again to NAS. Load average was jumping to sky on such situations, just option 
to not create sysfs entries helped me a lot (was posted recently).
Electricity outage is usual here, happens 2-3 times daily.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-15  1:55                         ` Denys Fedoryschenko
@ 2009-11-15  7:48                           ` Eric Dumazet
  2009-11-15 16:50                           ` Benjamin LaHaise
  1 sibling, 0 replies; 28+ messages in thread
From: Eric Dumazet @ 2009-11-15  7:48 UTC (permalink / raw)
  To: Denys Fedoryschenko
  Cc: Mark Smith, David Miller, bcrl, shemminger, opurdila, netdev

Denys Fedoryschenko a écrit :
> On Sunday 15 November 2009 00:36:04 Mark Smith wrote:
>> On the occasions I've looked at whether a Linux box would be an
>> alternative to the Cisco BRAS platform we use, the last time I looked
>> the number of sessions people were saying they were running was
>> 500. I don't consider Linux to be feasible in that role until you're
>> able to run at least 5000 sessions on a single box. I'm a bit unusual
> I am running up to 3500 on single NAS, but there is only 3 biggest one like 
> this, and i am limited only by subscribers on this location (network is 
> distributed over the country, and i have around 200 NAS servers running in 
> summary). And it is just PC bought from nearest supermarket with cheap PCI 
> RTL8169, and similar quality LOM adapter e1000e. Everything running on 
> cheapest USB flash from same supermarket.
> 
> For my case running Linux NAS on cheap PC's is only choice. It is 3rd world 
> country, and many reasons (i can explain each, but it is not technical 
> subject) doesn't let me to think, that "professional" equipment is feasible 
> for me.
> 
> Here people build networks on cheapest unmanageable switches, same 
> cost/quality 802.11b/g wireless networks, and only a way to terminate them 
> reliably is PPPoE. I know, it is also weak and easy to break, but it is 
> single choice i have.
> I know also ISP's in Russia, who have somehow partially "managed" networks, 
> but PPPoE letting them to drop running costs.
> 
> And interface creation speed is important for me, when electricity goes down 
> here, many customers disconnects (up to 500 on single NAS), and then join 
> again to NAS. Load average was jumping to sky on such situations, just option 
> to not create sysfs entries helped me a lot (was posted recently).
> Electricity outage is usual here, happens 2-3 times daily.

I found in my cases (not pppoe) that load was very high because of udev,
doing crazy loops of :

if (!rtnl_trylock())
     return restart_syscall();

About pppoe, we have a 16 slots hash table, protected by a single rwlock.

This won't scale to 50000 sessions, unless we use larger hashtable and
maybe RCU as well.

About the dismantling phase, it is currently a synchronous thing
(as the requester process has to wait for many rcu grace periods
for each netdevice to dismantle). That's typically ~20 ms per device !

For 'anonymous' netdevices, we probably could queue them and use a
 worker thread to handle this queue using the new batch mode,
added in net-next-2.6.



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
  2009-11-15  1:55                         ` Denys Fedoryschenko
  2009-11-15  7:48                           ` Eric Dumazet
@ 2009-11-15 16:50                           ` Benjamin LaHaise
  1 sibling, 0 replies; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-15 16:50 UTC (permalink / raw)
  To: Denys Fedoryschenko
  Cc: Mark Smith, David Miller, shemminger, opurdila, eric.dumazet, netdev

Hi Denys,

On Sun, Nov 15, 2009 at 03:55:14AM +0200, Denys Fedoryschenko wrote:
> And interface creation speed is important for me, when electricity goes down 
> here, many customers disconnects (up to 500 on single NAS), and then join 
> again to NAS. Load average was jumping to sky on such situations, just option 
> to not create sysfs entries helped me a lot (was posted recently).
> Electricity outage is usual here, happens 2-3 times daily.

This is exactly the type of scenario I'm looking at.  The design of the 
Babylon PPP stack is meant to scale somewhat better than pppd.  It uses a 
single process (although I'm starting to add threads to improve scaling on 
SMP systems) for all PPP/L2TP sessions, and has rather lower connection 
setup overhead (no fork()/exec() being the biggest one).  With udev tuned, 
irqbalance disabled and a few other tweaks, it gets >500 connections per 
second in startup on a modern 2.6GHz processor for L2TP traffic.  There 
is PPPoE support, but it needs a bit more work done to scale automatically 
(there are a few hardcoded limits in the PPPoE implementation).

		-ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2009-11-15 16:50 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-13  5:01 [net-next-2.6 PATCH] net: fast consecutive name allocation Octavian Purdila
2009-11-13  5:20 ` Octavian Purdila
2009-11-13  6:12   ` Eric Dumazet
2009-11-13  6:26     ` Stephen Hemminger
2009-11-13  7:09       ` Eric Dumazet
2009-11-13  9:51       ` Octavian Purdila
2009-11-13 22:29         ` Stephen Hemminger
2009-11-13 22:40           ` Benjamin LaHaise
2009-11-13 22:49             ` Stephen Hemminger
2009-11-13 23:35               ` Benjamin LaHaise
2009-11-13 23:39                 ` Stephen Hemminger
2009-11-13 23:52                   ` Benjamin LaHaise
2009-11-14  2:59                     ` David Miller
2009-11-14  6:24                       ` Benjamin LaHaise
2009-11-14 22:36                       ` Mark Smith
2009-11-15  1:22                         ` Stephen Hemminger
2009-11-15  1:49                           ` Mark Smith
2009-11-15  1:55                         ` Denys Fedoryschenko
2009-11-15  7:48                           ` Eric Dumazet
2009-11-15 16:50                           ` Benjamin LaHaise
2009-11-14  7:08               ` Benny Amorsen
2009-11-14  7:21                 ` Eric Dumazet
2009-11-14 16:16                   ` Ben Greear
2009-11-13  9:55     ` Octavian Purdila
2009-11-13 16:40       ` Ben Greear
2009-11-14  0:04   ` Stephen Hemminger
2009-11-14  0:14     ` Octavian Purdila
2009-11-14  0:20       ` Stephen Hemminger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.