* [net-next-2.6 PATCH] net: fast consecutive name allocation
@ 2009-11-13 5:01 Octavian Purdila
2009-11-13 5:20 ` Octavian Purdila
0 siblings, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13 5:01 UTC (permalink / raw)
To: netdev
[-- Attachment #1: Type: text/plain, Size: 621 bytes --]
This patch speeds up the network device name allocation for the case
where a significant number of devices of the same type are created
consecutively.
Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
and sysfs entries disabled:
Without the patch With the patch
real 0m 43.43s real 0m 0.49s
user 0m 0.00s user 0m 0.00s
sys 0m 43.43s sys 0m 0.48s
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
---
include/net/net_namespace.h | 3 +++
net/core/dev.c | 23 ++++++++++++++++++++++-
2 files changed, 25 insertions(+), 1 deletions(-)
[-- Attachment #2: 3ad8e007a0c929204c3ee7e7afa309e2e53b5b8a.diff --]
[-- Type: text/x-patch, Size: 2073 bytes --]
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0addd45..39c65a2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,9 @@ struct net {
struct list_head dev_base_head;
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
+ /* fast consecutive name allocation (e.g. eth0, eth1, ...) */
+ char fcna_name[IFNAMSIZ];
+ int fcna_no;
/* core fib_rules */
struct list_head rules_ops;
diff --git a/net/core/dev.c b/net/core/dev.c
index ad8e320..008e3c7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -226,8 +226,12 @@ static int list_netdevice(struct net_device *dev)
*/
static void unlist_netdevice(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
+
ASSERT_RTNL();
+ net->fcna_no = -1;
+
/* Unlink dev from the device chain */
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
@@ -872,6 +876,16 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
+ /* avoid fast allocation for strange templates like "fan%dcy" */
+ if (net->fcna_no >= 0 && p[2] == 0 &&
+ net->fcna_name[p - name] == 0 &&
+ memcmp(name, net->fcna_name, p - name) == 0) {
+ snprintf(buf, IFNAMSIZ, name, ++net->fcna_no);
+ if (!__dev_get_by_name(net, buf))
+ return net->fcna_no;
+ net->fcna_no = -1;
+ }
+
/* Use one page as a bit array of possible slots */
inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
if (!inuse)
@@ -894,8 +908,15 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
}
snprintf(buf, IFNAMSIZ, name, i);
- if (!__dev_get_by_name(net, buf))
+ if (!__dev_get_by_name(net, buf)) {
+ if (p[2] == 0) {
+ memcpy(net->fcna_name, name, p - name);
+ net->fcna_name[p - name] = 0;
+ net->fcna_no = i;
+ } else
+ net->fcna_no = -1;
return i;
+ }
/* It is possible to run out of possible slots
* when the name is long and there isn't enough space left
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 5:01 [net-next-2.6 PATCH] net: fast consecutive name allocation Octavian Purdila
@ 2009-11-13 5:20 ` Octavian Purdila
2009-11-13 6:12 ` Eric Dumazet
2009-11-14 0:04 ` Stephen Hemminger
0 siblings, 2 replies; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13 5:20 UTC (permalink / raw)
To: netdev
On Friday 13 November 2009 07:01:14 you wrote:
> This patch speeds up the network device name allocation for the case
> where a significant number of devices of the same type are created
> consecutively.
>
> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> and sysfs entries disabled:
>
> Without the patch With the patch
>
> real 0m 43.43s real 0m 0.49s
> user 0m 0.00s user 0m 0.00s
> sys 0m 43.43s sys 0m 0.48s
>
Oops, pasting root prompts (e.g. # modprobe ....) directly into the git commit message is not a good idea :) Here it is again, with the full commit message.
[net-next-2.6 PATCH] net: fast consecutive name allocation
This patch speeds up the network device name allocation for the case
where a significant number of devices of the same type are created
consecutively.
Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
and sysfs entries disabled:
$ time insmod /lib/modules/dummy.ko numdummies=8000
Without the patch With the patch
real 0m 43.43s real 0m 0.49s
user 0m 0.00s user 0m 0.00s
sys 0m 43.43s sys 0m 0.48s
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
---
include/net/net_namespace.h | 3 +++
net/core/dev.c | 23 ++++++++++++++++++++++-
2 files changed, 25 insertions(+), 1 deletions(-)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0addd45..39c65a2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,9 @@ struct net {
struct list_head dev_base_head;
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
+ /* fast consecutive name allocation (e.g. eth0, eth1, ...) */
+ char fcna_name[IFNAMSIZ];
+ int fcna_no;
/* core fib_rules */
struct list_head rules_ops;
diff --git a/net/core/dev.c b/net/core/dev.c
index ad8e320..008e3c7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -226,8 +226,12 @@ static int list_netdevice(struct net_device *dev)
*/
static void unlist_netdevice(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
+
ASSERT_RTNL();
+ net->fcna_no = -1;
+
/* Unlink dev from the device chain */
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
@@ -872,6 +876,16 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
+ /* avoid fast allocation for strange templates like "fan%dcy" */
+ if (net->fcna_no >= 0 && p[2] == 0 &&
+ net->fcna_name[p - name] == 0 &&
+ memcmp(name, net->fcna_name, p - name) == 0) {
+ snprintf(buf, IFNAMSIZ, name, ++net->fcna_no);
+ if (!__dev_get_by_name(net, buf))
+ return net->fcna_no;
+ net->fcna_no = -1;
+ }
+
/* Use one page as a bit array of possible slots */
inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
if (!inuse)
@@ -894,8 +908,15 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
}
snprintf(buf, IFNAMSIZ, name, i);
- if (!__dev_get_by_name(net, buf))
+ if (!__dev_get_by_name(net, buf)) {
+ if (p[2] == 0) {
+ memcpy(net->fcna_name, name, p - name);
+ net->fcna_name[p - name] = 0;
+ net->fcna_no = i;
+ } else
+ net->fcna_no = -1;
return i;
+ }
/* It is possible to run out of possible slots
* when the name is long and there isn't enough space left
--
1.5.6.5
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 5:20 ` Octavian Purdila
@ 2009-11-13 6:12 ` Eric Dumazet
2009-11-13 6:26 ` Stephen Hemminger
2009-11-13 9:55 ` Octavian Purdila
2009-11-14 0:04 ` Stephen Hemminger
1 sibling, 2 replies; 28+ messages in thread
From: Eric Dumazet @ 2009-11-13 6:12 UTC (permalink / raw)
To: Octavian Purdila; +Cc: netdev
Octavian Purdila a écrit :
> On Friday 13 November 2009 07:01:14 you wrote:
>> This patch speeds up the network device name allocation for the case
>> where a significant number of devices of the same type are created
>> consecutively.
>>
>> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
>> and sysfs entries disabled:
>>
>> Without the patch With the patch
>>
>> real 0m 43.43s real 0m 0.49s
>> user 0m 0.00s user 0m 0.00s
>> sys 0m 43.43s sys 0m 0.48s
>>
>
> Oops, pasting root prompts (e.g. # modprobe ....) directly into the git commit message is not a good idea :) Here it is again, with the full commit message.
>
> [net-next-2.6 PATCH] net: fast consecutive name allocation
>
> This patch speeds up the network device name allocation for the case
> where a significant number of devices of the same type are created
> consecutively.
>
> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> and sysfs entries disabled:
>
> $ time insmod /lib/modules/dummy.ko numdummies=8000
>
> Without the patch With the patch
>
> real 0m 43.43s real 0m 0.49s
> user 0m 0.00s user 0m 0.00s
> sys 0m 43.43s sys 0m 0.48s
>
> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> ---
Honestly, I don't like this bloat.
Changing dummy.c is trivial, and you can allocate 100.000.000 dummies if you want now :)
I have not tested this patch yet, but here it is:
[PATCH] dummy: Allow more than 32768 dummies
And speedup name allocation : O(N) instead of O(N^2)
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
drivers/net/dummy.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 37dcfdc..f600c4c 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -107,12 +107,14 @@ static struct rtnl_link_ops dummy_link_ops __read_mostly = {
module_param(numdummies, int, 0);
MODULE_PARM_DESC(numdummies, "Number of dummy pseudo devices");
-static int __init dummy_init_one(void)
+static int __init dummy_init_one(int i)
{
struct net_device *dev_dummy;
int err;
+ char name[IFNAMSIZ];
- dev_dummy = alloc_netdev(0, "dummy%d", dummy_setup);
+ snprintf(name, IFNAMSIZ, "dummy%d", i);
+ dev_dummy = alloc_netdev(0, name, dummy_setup);
if (!dev_dummy)
return -ENOMEM;
@@ -139,7 +141,7 @@ static int __init dummy_init_module(void)
err = __rtnl_link_register(&dummy_link_ops);
for (i = 0; i < numdummies && !err; i++)
- err = dummy_init_one();
+ err = dummy_init_one(i);
if (err < 0)
__rtnl_link_unregister(&dummy_link_ops);
rtnl_unlock();
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 6:12 ` Eric Dumazet
@ 2009-11-13 6:26 ` Stephen Hemminger
2009-11-13 7:09 ` Eric Dumazet
2009-11-13 9:51 ` Octavian Purdila
2009-11-13 9:55 ` Octavian Purdila
1 sibling, 2 replies; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 6:26 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Octavian Purdila, netdev
On Fri, 13 Nov 2009 07:12:35 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Octavian Purdila a écrit :
> > On Friday 13 November 2009 07:01:14 you wrote:
> >> This patch speeds up the network device name allocation for the case
> >> where a significant number of devices of the same type are created
> >> consecutively.
> >>
> >> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> >> and sysfs entries disabled:
> >>
> >> Without the patch With the patch
> >>
> >> real 0m 43.43s real 0m 0.49s
> >> user 0m 0.00s user 0m 0.00s
> >> sys 0m 43.43s sys 0m 0.48s
> >>
> >
No one has given a reasonable use case for this network device name
explosion, what is the benchmark doing this nonsense, and how do I
get paid to do it...
But I have to say no for another reason. You cause the kernel to choose
a different name for the case where a device is deleted or renamed.
The old code would find and fill the hole when a new device was added.
This is a semantic ABI change, the kind that drives users nuts.
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 6:26 ` Stephen Hemminger
@ 2009-11-13 7:09 ` Eric Dumazet
2009-11-13 9:51 ` Octavian Purdila
1 sibling, 0 replies; 28+ messages in thread
From: Eric Dumazet @ 2009-11-13 7:09 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Octavian Purdila, netdev
Stephen Hemminger a écrit :
> No one has give a reasonable use case for this network device name
> explosion, what is the benchmark doing this nosense, and how do I
> get paid to do it...
>
> But I have to say no for another reason. You cause the kernel to choose
> a different name for the case where a device is deleted or renamed.
> The old code would find and fill the hole when a new device was added.
>
> Since this is a semantic ABI change, the kind that drives users nuts.
>
Yes, I see your point Stephen, but it would be trivial to skip already
used devices as well with little change.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 6:26 ` Stephen Hemminger
2009-11-13 7:09 ` Eric Dumazet
@ 2009-11-13 9:51 ` Octavian Purdila
2009-11-13 22:29 ` Stephen Hemminger
1 sibling, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13 9:51 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Eric Dumazet, netdev
On Friday 13 November 2009 08:26:08 you wrote:
> On Fri, 13 Nov 2009 07:12:35 +0100
>
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Octavian Purdila a écrit :
> > > On Friday 13 November 2009 07:01:14 you wrote:
> > >> This patch speeds up the network device name allocation for the case
> > >> where a significant number of devices of the same type are created
> > >> consecutively.
> > >>
> > >> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > >> and sysfs entries disabled:
> > >>
> > >> Without the patch With the patch
> > >>
> > >> real 0m 43.43s real 0m 0.49s
> > >> user 0m 0.00s user 0m 0.00s
> > >> sys 0m 43.43s sys 0m 0.48s
>
> No one has give a reasonable use case for this network device name
> explosion, what is the benchmark doing this nosense, and how do I
> get paid to do it...
>
For us the usecase is creating interfaces that get used by applications that
generate all sorts of traffic. This allows us to simulate realistic end user
traffic (e.g. coming from a full blown stack). That sounds reasonable to us :)
Also, I've seen other people here report using more than 8000 interfaces.
> But I have to say no for another reason. You cause the kernel to choose
> a different name for the case where a device is deleted or renamed.
> The old code would find and fill the hole when a new device was added.
>
> Since this is a semantic ABI change, the kind that drives users nuts.
>
The intent was to keep the old behavior. When the device is deleted we stop
fast allocation and we resume it only after we go through the old code once
again. Did I miss something?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 6:12 ` Eric Dumazet
2009-11-13 6:26 ` Stephen Hemminger
@ 2009-11-13 9:55 ` Octavian Purdila
2009-11-13 16:40 ` Ben Greear
1 sibling, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-13 9:55 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
On Friday 13 November 2009 08:12:35 you wrote:
> Octavian Purdila a écrit :
> > On Friday 13 November 2009 07:01:14 you wrote:
> >> This patch speeds up the network device name allocation for the case
> >> where a significant number of devices of the same type are created
> >> consecutively.
> >>
> >> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> >> and sysfs entries disabled:
> >>
> >> Without the patch With the patch
> >>
> >> real 0m 43.43s real 0m 0.49s
> >> user 0m 0.00s user 0m 0.00s
> >> sys 0m 43.43s sys 0m 0.48s
> >
> > Oops, pasting root prompts (e.g. # modprobe ....) directly into the git
> > commit message is not a good idea :) Here it is again, with the full
> > commit message.
> >
> > [net-next-2.6 PATCH] net: fast consecutive name allocation
> >
> > This patch speeds up the network device name allocation for the case
> > where a significant number of devices of the same type are created
> > consecutively.
> >
> > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > and sysfs entries disabled:
> >
> > $ time insmod /lib/modules/dummy.ko numdummies=8000
> >
> > Without the patch With the patch
> >
> > real 0m 43.43s real 0m 0.49s
> > user 0m 0.00s user 0m 0.00s
> > sys 0m 43.43s sys 0m 0.48s
> >
> > Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> > ---
>
> Honestly I dont like this bloat.
>
> Changing dummy.c is trivial, and you can allocate 100.000.000 dummies if
> you want now :)
>
Yep we can do that - actually we are doing exactly this in our drivers. But in
that way, you get to "bloat" every driver which needs this.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 9:55 ` Octavian Purdila
@ 2009-11-13 16:40 ` Ben Greear
0 siblings, 0 replies; 28+ messages in thread
From: Ben Greear @ 2009-11-13 16:40 UTC (permalink / raw)
To: Octavian Purdila; +Cc: Eric Dumazet, netdev
Octavian Purdila wrote:
> On Friday 13 November 2009 08:12:35 you wrote:
>
>> Octavian Purdila a écrit :
>>
>>> On Friday 13 November 2009 07:01:14 you wrote:
>>>
>>>> This patch speeds up the network device name allocation for the case
>>>> where a significant number of devices of the same type are created
>>>> consecutively.
>>>>
>>>> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
>>>> and sysfs entries disabled:
>>>>
>>>> Without the patch With the patch
>>>>
>>>> real 0m 43.43s real 0m 0.49s
>>>> user 0m 0.00s user 0m 0.00s
>>>> sys 0m 43.43s sys 0m 0.48s
>>>>
>>> Oops, pasting root prompts (e.g. # modprobe ....) directly into the git
>>> commit message is not a good idea :) Here it is again, with the full
>>> commit message.
>>>
>>> [net-next-2.6 PATCH] net: fast consecutive name allocation
>>>
>>> This patch speeds up the network device name allocation for the case
>>> where a significant number of devices of the same type are created
>>> consecutively.
>>>
>>> Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
>>> and sysfs entries disabled:
>>>
>>> $ time insmod /lib/modules/dummy.ko numdummies=8000
>>>
>>> Without the patch With the patch
>>>
>>> real 0m 43.43s real 0m 0.49s
>>> user 0m 0.00s user 0m 0.00s
>>> sys 0m 43.43s sys 0m 0.48s
>>>
>>> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
>>> ---
>>>
>> Honestly I dont like this bloat.
>>
>> Changing dummy.c is trivial, and you can allocate 100.000.000 dummies if
>> you want now :)
>>
>>
>
> Yep we can do that - actually we are doing exactly this in our drivers. But in
> that way, you get to "bloat" every driver which needs this.
>
For mac-vlans, .1q vlans, VETH, at least, you can choose the name when
you create them
via 'ip'. So, you can do the logic in user-space.
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 9:51 ` Octavian Purdila
@ 2009-11-13 22:29 ` Stephen Hemminger
2009-11-13 22:40 ` Benjamin LaHaise
0 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 22:29 UTC (permalink / raw)
To: Octavian Purdila; +Cc: Eric Dumazet, netdev
On Fri, 13 Nov 2009 11:51:31 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:
> For us the usecase is creating interfaces that get used by applications that
> generate all sorts of traffic. This allows us to simulate realistic end user
> traffic (e.g. coming from a full blown stack). That sounds reasonable to us :)
So it is lots of pseudo-devices for a special purpose test machine.
That's great use of Linux, but not a case worth optimizing for in the mainline kernel.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 22:29 ` Stephen Hemminger
@ 2009-11-13 22:40 ` Benjamin LaHaise
2009-11-13 22:49 ` Stephen Hemminger
0 siblings, 1 reply; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-13 22:40 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Octavian Purdila, Eric Dumazet, netdev
On Fri, Nov 13, 2009 at 02:29:39PM -0800, Stephen Hemminger wrote:
> So it is lots of pseudo-devices for a special purpose test machine.
> That's great use of Linux, but not a case worth optimizing for in the mainline kernel.
He's not the only one who needs that, I certainly do. BRAS application
where traffic is being aggregated (ie PPPoE and L2TP servers) from lots of
customers requires it. There are also people hitting the same scaling
issues in embedded PPPoE on wireless networks.
-ben
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 22:40 ` Benjamin LaHaise
@ 2009-11-13 22:49 ` Stephen Hemminger
2009-11-13 23:35 ` Benjamin LaHaise
2009-11-14 7:08 ` Benny Amorsen
0 siblings, 2 replies; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 22:49 UTC (permalink / raw)
To: Benjamin LaHaise; +Cc: Octavian Purdila, Eric Dumazet, netdev
On Fri, 13 Nov 2009 17:40:43 -0500
Benjamin LaHaise <bcrl@lhnet.ca> wrote:
> On Fri, Nov 13, 2009 at 02:29:39PM -0800, Stephen Hemminger wrote:
> > So it is lots of pseudo-devices for a special purpose test machine.
> > That's great use of Linux, but not a case worth optimizing for in the mainline kernel.
>
> He's not the only one who needs that, I certainly do. BRAS application
> where traffic is being aggregated (ie PPPoE and L2TP servers) from lots of
> customers requires it. There are also people hitting the same scaling
> issues in embedded PPPoE on wireless networks.
>
> -ben
Then maybe network devices aren't the right layering model. At some
point the paradigm has to be re-examined.
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 22:49 ` Stephen Hemminger
@ 2009-11-13 23:35 ` Benjamin LaHaise
2009-11-13 23:39 ` Stephen Hemminger
2009-11-14 7:08 ` Benny Amorsen
1 sibling, 1 reply; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-13 23:35 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Octavian Purdila, Eric Dumazet, netdev
On Fri, Nov 13, 2009 at 02:49:37PM -0800, Stephen Hemminger wrote:
> Then maybe network devices aren't the right layering model. At some
> point the paradigm has to be re-examined.
What is the right model for dealing with lots of connections to users and
routes? This problem isn't going to go away given the increases in
connectivity and processing power that happen each year. Today, software
routing of 10Gbps links is a reality -- part of what comes with that ability
of hardware is the need to deal with the fact that 10Gbps aggregates a lot
of users.
-ben
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 23:35 ` Benjamin LaHaise
@ 2009-11-13 23:39 ` Stephen Hemminger
2009-11-13 23:52 ` Benjamin LaHaise
0 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-13 23:39 UTC (permalink / raw)
To: Benjamin LaHaise; +Cc: Octavian Purdila, Eric Dumazet, netdev
On Fri, 13 Nov 2009 18:35:04 -0500
Benjamin LaHaise <bcrl@lhnet.ca> wrote:
> On Fri, Nov 13, 2009 at 02:49:37PM -0800, Stephen Hemminger wrote:
> > Then maybe network devices aren't the right layering model. At some
> > point the paradigm has to be re-examined.
>
> What is the right model for dealing with lots of connections to users and
> routes? This problem isn't going to go away given the increases in
> connectivity and processing power that happen each year. Today, software
> routing of 10Gbps links is a reality -- part of what comes with that ability
> of hardware is the need to deal with the fact that 10Gbps aggregates a lot
> of users.
>
> -ben
Well TCP handles lots of connections, but a socket has different overhead
than a network device. Why should 10Gbps need 10K PPPoE sessions?
Even VLANs are less overhead than PPP.
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 23:39 ` Stephen Hemminger
@ 2009-11-13 23:52 ` Benjamin LaHaise
2009-11-14 2:59 ` David Miller
0 siblings, 1 reply; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-13 23:52 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Octavian Purdila, Eric Dumazet, netdev
On Fri, Nov 13, 2009 at 03:39:24PM -0800, Stephen Hemminger wrote:
> Well TCP handles lots of connections, but a socket has different overhead
> than a network device. Why should 10Gbps need 10K PPPoE sessions?
> Even Vlan's are less overhead than PPP
PPP's overhead is acceptable. It makes managing networks a lot easier, since
the authentication done by PPP is able to look up any end user specific
information required (ie static ips and routes), while the access part of
the network is a fairly generic config that uses switches and things like the
GVRP. Without that, the configuration of any aggregation switch becomes a
huge management nightmare.
If you don't want the overhead from this kind of scaling, stick it under a
config option, but please don't stop other people from pushing Linux into
new uses which have these scaling requirements.
-ben
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 5:20 ` Octavian Purdila
2009-11-13 6:12 ` Eric Dumazet
@ 2009-11-14 0:04 ` Stephen Hemminger
2009-11-14 0:14 ` Octavian Purdila
1 sibling, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-14 0:04 UTC (permalink / raw)
To: Octavian Purdila; +Cc: netdev
On Fri, 13 Nov 2009 07:20:19 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:
> On Friday 13 November 2009 07:01:14 you wrote:
> > This patch speeds up the network device name allocation for the case
> > where a significant number of devices of the same type are created
> > consecutively.
> >
> > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > and sysfs entries disabled:
> >
> > Without the patch With the patch
> >
> > real 0m 43.43s real 0m 0.49s
> > user 0m 0.00s user 0m 0.00s
> > sys 0m 43.43s sys 0m 0.48s
Since the main overhead here is building the bitmap table used in the
name scan, why not maintain the bitmap table between calls by
implementing an rbtree with prefix -> bitmap?
The tree would have to be limited and per namespace but then you
could handle the general case of adding a device, then its vlans,
then another device, ...
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 0:04 ` Stephen Hemminger
@ 2009-11-14 0:14 ` Octavian Purdila
2009-11-14 0:20 ` Stephen Hemminger
0 siblings, 1 reply; 28+ messages in thread
From: Octavian Purdila @ 2009-11-14 0:14 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
On Saturday 14 November 2009 02:04:45 you wrote:
> On Fri, 13 Nov 2009 07:20:19 +0200
>
> Octavian Purdila <opurdila@ixiacom.com> wrote:
> > On Friday 13 November 2009 07:01:14 you wrote:
> > > This patch speeds up the network device name allocation for the case
> > > where a significant number of devices of the same type are created
> > > consecutively.
> > >
> > > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > > and sysfs entries disabled:
> > >
> > > Without the patch With the patch
> > >
> > > real 0m 43.43s real 0m 0.49s
> > > user 0m 0.00s user 0m 0.00s
> > > sys 0m 43.43s sys 0m 0.48s
>
> Since the main overhead here is building the bitmap table used in the
> name scan. Why not mantain the bitmap table between calls by
> implementing a rbtree with prefix -> bitmap.
> The tree would have to be limited and per namespace but then you
> could handle the general case of adding a device, then its vlans,
> then another device, ...
>
I'll do that !
That was my original intent but I thought it would be too much bloat :) But I
see your point, even if it is more complex, it's more useful.
Thanks,
tavi
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 0:14 ` Octavian Purdila
@ 2009-11-14 0:20 ` Stephen Hemminger
0 siblings, 0 replies; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-14 0:20 UTC (permalink / raw)
To: Octavian Purdila; +Cc: netdev
On Sat, 14 Nov 2009 02:14:21 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:
> On Saturday 14 November 2009 02:04:45 you wrote:
> > On Fri, 13 Nov 2009 07:20:19 +0200
> >
> > Octavian Purdila <opurdila@ixiacom.com> wrote:
> > > On Friday 13 November 2009 07:01:14 you wrote:
> > > > This patch speeds up the network device name allocation for the case
> > > > where a significant number of devices of the same type are created
> > > > consecutively.
> > > >
> > > > Tests performed on a PPC750 @ 800Mhz machine with per device sysctl
> > > > and sysfs entries disabled:
> > > >
> > > > Without the patch With the patch
> > > >
> > > > real 0m 43.43s real 0m 0.49s
> > > > user 0m 0.00s user 0m 0.00s
> > > > sys 0m 43.43s sys 0m 0.48s
> >
> > Since the main overhead here is building the bitmap table used in the
> > name scan. Why not mantain the bitmap table between calls by
> > implementing a rbtree with prefix -> bitmap.
> > The tree would have to be limited and per namespace but then you
> > could handle the general case of adding a device, then its vlans,
> > then another device, ...
> >
>
> I'll do that !
>
> That was my original intent but I thought it would be too much bloat :) But I
> see your point, even if it is more complex, its more useful.
There might even be a VM notifier hook that could be used to drop the whole
tree if any memory pressure was felt.
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 23:52 ` Benjamin LaHaise
@ 2009-11-14 2:59 ` David Miller
2009-11-14 6:24 ` Benjamin LaHaise
2009-11-14 22:36 ` Mark Smith
0 siblings, 2 replies; 28+ messages in thread
From: David Miller @ 2009-11-14 2:59 UTC (permalink / raw)
To: bcrl; +Cc: shemminger, opurdila, eric.dumazet, netdev
From: Benjamin LaHaise <bcrl@lhnet.ca>
Date: Fri, 13 Nov 2009 18:52:10 -0500
> If you don't want the overhead from this kind of scaling, stick it under a
> config option, but please don't stop other people from pushing Linux into
> new uses which have these scaling requirements.
This 'scaling requirement' only exists in environments where people
undersubscribe their networks, right?
I'm not saying we won't put scaling into these areas, I'm just trying
to make a point to show that this "need" only exists because people
have purposefully created these situations where they feel the need to
massively control their users' usage in order to generate revenue.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 2:59 ` David Miller
@ 2009-11-14 6:24 ` Benjamin LaHaise
2009-11-14 22:36 ` Mark Smith
1 sibling, 0 replies; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-14 6:24 UTC (permalink / raw)
To: David Miller; +Cc: shemminger, opurdila, eric.dumazet, netdev
On Fri, Nov 13, 2009 at 06:59:37PM -0800, David Miller wrote:
> From: Benjamin LaHaise <bcrl@lhnet.ca>
> Date: Fri, 13 Nov 2009 18:52:10 -0500
>
> > If you don't want the overhead from this kind of scaling, stick it under a
> > config option, but please don't stop other people from pushing Linux into
> > new uses which have these scaling requirements.
>
> This 'scaling requirement' only exists in environments where people
> undersubsribe their networks, right?
Depends on how you look at things. The case of lots of interfaces going
up/down can occur during normal operations. The incumbent telco in this
area has occasional flaps that reset thousands of sessions. The problem
relates to how things flop over to a different path within their network,
as they don't provide hot standby circuits for all the aggregated traffic
coming in -- a link down results in a flap of all the L2TP sessions. As
for it being underprovisioned, that doesn't really apply. The core LNS
boxes are kept from having saturated links, as that results in poor user
performance. Plus they have substantially more CPU than embedded routers.
> I'm not saying we won't put scaling into these areas, I'm just trying
> to make a point to show that this "need" only exists because people
> have purposefully created these situations where they feel the need to
> massively control their users usage in order to generate revenue.
I've finally got some of the userspace bits necessary for parallel network
device creation wired up. Will reducing the granularity of rtnl_lock() for
devices which can handle it be okay? That will get a factor of 4 to 8
improvement from current single socket hardware.
The other way I'm working around the scaling issues is to use network
namespaces. Babylon (the L2TP/PPPoE stack I'm working on) can now split
interface creation across some number of network namespaces. This keeps
the number of interfaces in a given net instance down to 5-10,000. That
really helps avoid some of the scaling issues, as we're pretty good in
that range.
The worst part of all the overhead during setup and teardown is that very
little traffic can pass while this is occurring, effectively making it an
outage, hence the desire to minimise outage situations.
-ben
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-13 22:49 ` Stephen Hemminger
2009-11-13 23:35 ` Benjamin LaHaise
@ 2009-11-14 7:08 ` Benny Amorsen
2009-11-14 7:21 ` Eric Dumazet
1 sibling, 1 reply; 28+ messages in thread
From: Benny Amorsen @ 2009-11-14 7:08 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Benjamin LaHaise, Octavian Purdila, Eric Dumazet, netdev
Stephen Hemminger <shemminger@vyatta.com> writes:
> Then maybe network devices aren't the right layering model. At some
> point the paradigm has to be re-examined.
I'm not quite sure where this becomes a problem. We have 1185 network
interfaces (VLAN's) on one box. Boot time is a problem, but other than
that it works ok. If something like this would help speed up booting,
that would be very nice.
/Benny
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 7:08 ` Benny Amorsen
@ 2009-11-14 7:21 ` Eric Dumazet
2009-11-14 16:16 ` Ben Greear
0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2009-11-14 7:21 UTC (permalink / raw)
To: Benny Amorsen
Cc: Stephen Hemminger, Benjamin LaHaise, Octavian Purdila, netdev
Benny Amorsen a écrit :
> Stephen Hemminger <shemminger@vyatta.com> writes:
>
>> Then maybe network devices aren't the right layering model. At some
>> point the paradigm has to be re-examined.
>
> I'm not quite sure where this becomes a problem. We have 1185 network
> interfaces (VLAN's) on one box. Boot time is a problem, but other than
> that it works ok. If something like this would help speed up booting,
> that would be very nice.
>
It would be very nice if you could tell us why booting takes so long,
ie what is done and how much time it takes.
It's clear that ~1000 vlans is quite reasonable :)
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 7:21 ` Eric Dumazet
@ 2009-11-14 16:16 ` Ben Greear
0 siblings, 0 replies; 28+ messages in thread
From: Ben Greear @ 2009-11-14 16:16 UTC (permalink / raw)
To: Eric Dumazet
Cc: Benny Amorsen, Stephen Hemminger, Benjamin LaHaise,
Octavian Purdila, netdev
Eric Dumazet wrote:
> Benny Amorsen a écrit :
>
>> Stephen Hemminger <shemminger@vyatta.com> writes:
>>
>>
>>> Then maybe network devices aren't the right layering model. At some
>>> point the paradigm has to be re-examined.
>>>
>> I'm not quite sure where this becomes a problem. We have 1185 network
>> interfaces (VLAN's) on one box. Boot time is a problem, but other than
>> that it works ok. If something like this would help speed up booting,
>> that would be very nice.
>>
>>
>
> It would be very nice if you tell us why booting is very long,
> ie what is done and how much time it takes.
>
> It's clear that ~1000 vlans is quite reasonable :)
>
At least sometimes, hotplug can degrade into a nasty case where it runs
'ifconfig -a' for each
interface created. (This was F11 if I recall correctly).
If you configure hotplug to ignore vlans, it might help your boot time.
Thanks,
Ben
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 2:59 ` David Miller
2009-11-14 6:24 ` Benjamin LaHaise
@ 2009-11-14 22:36 ` Mark Smith
2009-11-15 1:22 ` Stephen Hemminger
2009-11-15 1:55 ` Denys Fedoryschenko
1 sibling, 2 replies; 28+ messages in thread
From: Mark Smith @ 2009-11-14 22:36 UTC (permalink / raw)
To: David Miller; +Cc: bcrl, shemminger, opurdila, eric.dumazet, netdev
On Fri, 13 Nov 2009 18:59:37 -0800 (PST)
David Miller <davem@davemloft.net> wrote:
> From: Benjamin LaHaise <bcrl@lhnet.ca>
> Date: Fri, 13 Nov 2009 18:52:10 -0500
>
> > If you don't want the overhead from this kind of scaling, stick it under a
> > config option, but please don't stop other people from pushing Linux into
> > new uses which have these scaling requirements.
>
> This 'scaling requirement' only exists in environments where people
> undersubscribe their networks, right?
>
> I'm not saying we won't put scaling into these areas, I'm just trying
> to make a point to show that this "need" only exists because people
> have purposefully created these situations where they feel the need to
> massively control their users usage in order to generate revenue.
I don't understand that comment, and I work for (and
designed most of the infrastructure for) an ISP that usually has
well over 40 000 concurrent PPPoE sessions at any one time.
The fundamental purpose of PPPoE has nothing to do with any scaling or
architecture, it is purely to make a more modern shared networking
technology like Ethernet look like high speed dial up. This has occurred
mainly because when broadband came along it allowed ISPs to introduce
it quickly, without having to also upgrade their dial up oriented
backend systems i.e. customer authentication/accounting and customer
support systems. It wasn't ideal then and it isn't ideal now. PPPoE adds
an overhead of 8 bytes per packet, yet the only thing it is doing is
changing ethernet from multipoint to point-to-point so PPP can run
over it and providing ISPs with an ability to identify the subscriber.
There are other methods to solve the customer identity problem without the
PPPoE overheads. Moving to them however can be a long drawn out process
because it also means changes to customer's CPE settings, or running
the old and new methods in parallel for the foreseeable future.
On the occasions I've looked at whether a Linux box would be an
alternative to the Cisco BRAS platform we use, the last time I looked
the number of sessions people were saying they were running was
500. I don't consider Linux to be feasible in that role until you're
able to run at least 5000 sessions on a single box. I'm a bit unusual
in that regard, as I prefer the "lots of smaller boxes, increased chances
of failure, but smaller consequences of failure" model - you manage the
larger number of them via configuration templating / scripted
change deployment. You need to choose your subscriber per device level,
and if 500 is the current limit for Linux, then in my opinion it is
currently too low for my application. Others in the industry might
consider 5000 too low, as they are running devices that can handle 32
000 or 64 000 PPPoE sessions.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 22:36 ` Mark Smith
@ 2009-11-15 1:22 ` Stephen Hemminger
2009-11-15 1:49 ` Mark Smith
2009-11-15 1:55 ` Denys Fedoryschenko
1 sibling, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2009-11-15 1:22 UTC (permalink / raw)
To: Mark Smith; +Cc: David Miller, bcrl, opurdila, eric.dumazet, netdev
On Sun, 15 Nov 2009 09:06:04 +1030
Mark Smith <lk-netdev@lk-netdev.nosense.org> wrote:
> The fundamental purpose of PPPoE is nothing to do with any scaling or
> architecture, it is purely to make a more modern shared networking
> technology like Ethernet look like high speed dial up. This has occurred
> mainly because when broadband came along it allowed ISPs to introduce
> it quickly, without having to also upgrade their dial up oriented
> backend systems i.e. customer authentication/accounting and customer
> support systems. It wasn't ideal then and it isn't ideal now. PPPoE adds
> an overhead of 8 bytes per packet, yet the only thing it is doing is
> changing ethernet from multipoint to point-to-point so PPP can run
> over it and providing ISPs with an ability to identify the subscriber.
> There are other methods to solve customer identity problem without the
> PPPoE overheads. Moving to them however can be a long drawn out process
> because it also means changes to customer's CPE settings, or running
> the old and new methods in parallel for the foreseeable future.
Carriers still haven't figured out that circuit switched networks don't
scale. They just can't learn the lesson of the Internet.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-15 1:22 ` Stephen Hemminger
@ 2009-11-15 1:49 ` Mark Smith
0 siblings, 0 replies; 28+ messages in thread
From: Mark Smith @ 2009-11-15 1:49 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, bcrl, opurdila, eric.dumazet, netdev
On Sat, 14 Nov 2009 17:22:24 -0800
Stephen Hemminger <shemminger@vyatta.com> wrote:
> On Sun, 15 Nov 2009 09:06:04 +1030
> Mark Smith <lk-netdev@lk-netdev.nosense.org> wrote:
>
> > The fundamental purpose of PPPoE is nothing to do with any scaling or
> > architecture, it is purely to make a more modern shared networking
> > technology like Ethernet look like high speed dial up. This has occurred
> > mainly because when broadband came along it allowed ISPs to introduce
> > it quickly, without having to also upgrade their dial up oriented
> > backend systems i.e. customer authentication/accounting and customer
> > support systems. It wasn't ideal then and it isn't ideal now. PPPoE adds
> > an overhead of 8 bytes per packet, yet the only thing it is doing is
> > changing ethernet from multipoint to point-to-point so PPP can run
> > over it and providing ISPs with an ability to identify the subscriber.
> > There are other methods to solve customer identity problem without the
> > PPPoE overheads. Moving to them however can be a long drawn out process
> > because it also means changes to customer's CPE settings, or running
> > the old and new methods in parallel for the foreseeable future.
>
> Carriers still haven't figured out that circuit switched networks don't
> scale. They just can't learn the lesson of the Internet.
I don't really think that is the case. The authors of the PPPoE
spec were all from "Internet" companies, including UUNET, the first
Internet company, and the largest at the time, so I'm sure they all knew
about Internet scaling.
Here's what they had to say in the RFC2516 intro:
" Modern access technologies are faced with several conflicting goals.
It is desirable to connect multiple hosts at a remote site through
the same customer premise access device. It is also a goal to
provide access control and billing functionality in a manner similar
to dial-up services using PPP. In many access technologies, the most
cost effective method to attach multiple hosts to the customer
premise access device, is via Ethernet. In addition, it is desirable
to keep the cost of this device as low as possible while requiring
little or no configuration."
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-14 22:36 ` Mark Smith
2009-11-15 1:22 ` Stephen Hemminger
@ 2009-11-15 1:55 ` Denys Fedoryschenko
2009-11-15 7:48 ` Eric Dumazet
2009-11-15 16:50 ` Benjamin LaHaise
1 sibling, 2 replies; 28+ messages in thread
From: Denys Fedoryschenko @ 2009-11-15 1:55 UTC (permalink / raw)
To: Mark Smith; +Cc: David Miller, bcrl, shemminger, opurdila, eric.dumazet, netdev
On Sunday 15 November 2009 00:36:04 Mark Smith wrote:
> On the occasions I've looked at whether a Linux box would be an
> alternative to the Cisco BRAS platform we use, the last time I looked
> the number of sessions people were saying they were running was
> 500. I don't consider Linux to be feasible in that role until you're
> able to run at least 5000 sessions on a single box. I'm a bit unusual
I am running up to 3500 on single NAS, but there is only 3 biggest one like
this, and i am limited only by subscribers on this location (network is
distributed over the country, and i have around 200 NAS servers running in
summary). And it is just PC bought from nearest supermarket with cheap PCI
RTL8169, and similar quality LOM adapter e1000e. Everything running on
cheapest USB flash from same supermarket.
For my case running Linux NAS on cheap PC's is only choice. It is 3rd world
country, and many reasons (i can explain each, but it is not technical
subject) doesn't let me to think, that "professional" equipment is feasible
for me.
Here people build networks on cheapest unmanageable switches, same
cost/quality 802.11b/g wireless networks, and only a way to terminate them
reliably is PPPoE. I know, it is also weak and easy to break, but it is
single choice i have.
I know also ISP's in Russia, who have somehow partially "managed" networks,
but PPPoE letting them to drop running costs.
And interface creation speed is important for me, when electricity goes down
here, many customers disconnects (up to 500 on single NAS), and then join
again to NAS. Load average was jumping to sky on such situations, just option
to not create sysfs entries helped me a lot (was posted recently).
Electricity outage is usual here, happens 2-3 times daily.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-15 1:55 ` Denys Fedoryschenko
@ 2009-11-15 7:48 ` Eric Dumazet
2009-11-15 16:50 ` Benjamin LaHaise
1 sibling, 0 replies; 28+ messages in thread
From: Eric Dumazet @ 2009-11-15 7:48 UTC (permalink / raw)
To: Denys Fedoryschenko
Cc: Mark Smith, David Miller, bcrl, shemminger, opurdila, netdev
Denys Fedoryschenko a écrit :
> On Sunday 15 November 2009 00:36:04 Mark Smith wrote:
>> On the occasions I've looked at whether a Linux box would be an
>> alternative to the Cisco BRAS platform we use, the last time I looked
>> the number of sessions people were saying they were running was
>> 500. I don't consider Linux to be feasible in that role until you're
>> able to run at least 5000 sessions on a single box. I'm a bit unusual
> I am running up to 3500 on single NAS, but there is only 3 biggest one like
> this, and i am limited only by subscribers on this location (network is
> distributed over the country, and i have around 200 NAS servers running in
> summary). And it is just PC bought from nearest supermarket with cheap PCI
> RTL8169, and similar quality LOM adapter e1000e. Everything running on
> cheapest USB flash from same supermarket.
>
> For my case running Linux NAS on cheap PC's is only choice. It is 3rd world
> country, and many reasons (i can explain each, but it is not technical
> subject) doesn't let me to think, that "professional" equipment is feasible
> for me.
>
> Here people build networks on cheapest unmanageable switches, same
> cost/quality 802.11b/g wireless networks, and only a way to terminate them
> reliably is PPPoE. I know, it is also weak and easy to break, but it is
> single choice i have.
> I know also ISP's in Russia, who have somehow partially "managed" networks,
> but PPPoE letting them to drop running costs.
>
> And interface creation speed is important for me, when electricity goes down
> here, many customers disconnects (up to 500 on single NAS), and then join
> again to NAS. Load average was jumping to sky on such situations, just option
> to not create sysfs entries helped me a lot (was posted recently).
> Electricity outage is usual here, happens 2-3 times daily.
I found in my cases (not pppoe) that load was very high because of udev,
doing crazy loops of :
if (!rtnl_trylock())
return restart_syscall();
About pppoe, we have a 16-slot hash table, protected by a single rwlock.
This won't scale to 50000 sessions, unless we use a larger hashtable and
maybe RCU as well.
About the dismantling phase, it is currently a synchronous thing
(as the requester process has to wait for many rcu grace periods
for each netdevice to dismantle). That's typically ~20 ms per device!
For 'anonymous' netdevices, we probably could queue them and use a
worker thread to handle this queue using the new batch mode,
added in net-next-2.6.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [net-next-2.6 PATCH] net: fast consecutive name allocation
2009-11-15 1:55 ` Denys Fedoryschenko
2009-11-15 7:48 ` Eric Dumazet
@ 2009-11-15 16:50 ` Benjamin LaHaise
1 sibling, 0 replies; 28+ messages in thread
From: Benjamin LaHaise @ 2009-11-15 16:50 UTC (permalink / raw)
To: Denys Fedoryschenko
Cc: Mark Smith, David Miller, shemminger, opurdila, eric.dumazet, netdev
Hi Denys,
On Sun, Nov 15, 2009 at 03:55:14AM +0200, Denys Fedoryschenko wrote:
> And interface creation speed is important for me, when electricity goes down
> here, many customers disconnects (up to 500 on single NAS), and then join
> again to NAS. Load average was jumping to sky on such situations, just option
> to not create sysfs entries helped me a lot (was posted recently).
> Electricity outage is usual here, happens 2-3 times daily.
This is exactly the type of scenario I'm looking at. The design of the
Babylon PPP stack is meant to scale somewhat better than pppd. It uses a
single process (although I'm starting to add threads to improve scaling on
SMP systems) for all PPP/L2TP sessions, and has rather lower connection
setup overhead (no fork()/exec() being the biggest one). With udev tuned,
irqbalance disabled and a few other tweaks, it gets >500 connections per
second in startup on a modern 2.6GHz processor for L2TP traffic. There
is PPPoE support, but it needs a bit more work done to scale automatically
(there are a few hardcoded limits in the PPPoE implementation).
-ben
^ permalink raw reply [flat|nested] 28+ messages in thread
end of thread, other threads:[~2009-11-15 16:50 UTC | newest]
Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-13 5:01 [net-next-2.6 PATCH] net: fast consecutive name allocation Octavian Purdila
2009-11-13 5:20 ` Octavian Purdila
2009-11-13 6:12 ` Eric Dumazet
2009-11-13 6:26 ` Stephen Hemminger
2009-11-13 7:09 ` Eric Dumazet
2009-11-13 9:51 ` Octavian Purdila
2009-11-13 22:29 ` Stephen Hemminger
2009-11-13 22:40 ` Benjamin LaHaise
2009-11-13 22:49 ` Stephen Hemminger
2009-11-13 23:35 ` Benjamin LaHaise
2009-11-13 23:39 ` Stephen Hemminger
2009-11-13 23:52 ` Benjamin LaHaise
2009-11-14 2:59 ` David Miller
2009-11-14 6:24 ` Benjamin LaHaise
2009-11-14 22:36 ` Mark Smith
2009-11-15 1:22 ` Stephen Hemminger
2009-11-15 1:49 ` Mark Smith
2009-11-15 1:55 ` Denys Fedoryschenko
2009-11-15 7:48 ` Eric Dumazet
2009-11-15 16:50 ` Benjamin LaHaise
2009-11-14 7:08 ` Benny Amorsen
2009-11-14 7:21 ` Eric Dumazet
2009-11-14 16:16 ` Ben Greear
2009-11-13 9:55 ` Octavian Purdila
2009-11-13 16:40 ` Ben Greear
2009-11-14 0:04 ` Stephen Hemminger
2009-11-14 0:14 ` Octavian Purdila
2009-11-14 0:20 ` Stephen Hemminger
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.