* [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
@ 2009-11-01 16:31 Benjamin LaHaise
  2009-11-01 16:32 ` [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents Benjamin LaHaise
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-01 16:31 UTC (permalink / raw)
  To: Eric Dumazet, Greg Kroah-Hartman
  Cc: Eric W. Biederman, Octavian Purdila, netdev, Cosmin Ratiu, linux-kernel

Use an rbtree in sysfs_dirent to speed up file lookup times

Systems with large numbers (tens of thousands and more) of network 
interfaces stress the sysfs code in ways that make the linear search for 
a name match take far too long.  Avoid this by using an rbtree.
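
For illustration, a minimal userspace analogue of the lookup-side change,
with POSIX tsearch()/tfind() standing in for the kernel rbtree (struct
dirent_name and the sample names are purely illustrative, not part of the
patch):

	#include <search.h>
	#include <stdio.h>
	#include <string.h>

	struct dirent_name { const char *name; };	/* stand-in for sysfs_dirent */

	static int cmp_name(const void *a, const void *b)
	{
		return strcmp(((const struct dirent_name *)a)->name,
			      ((const struct dirent_name *)b)->name);
	}

	int main(void)
	{
		static struct dirent_name ents[] = {
			{ "lo" }, { "eth0" }, { "dummy19999" },
		};
		struct dirent_name key = { "dummy19999" };
		struct dirent_name **found;
		void *root = NULL;
		size_t i;

		for (i = 0; i < sizeof(ents) / sizeof(ents[0]); i++)
			tsearch(&ents[i], &root, cmp_name);	/* O(log n) insert */

		found = tfind(&key, &root, cmp_name);		/* O(log n) lookup */
		printf("%s\n", found ? (*found)->name : "not found");
		return 0;
	}

With tens of thousands of entries, each lookup then walks ~log2(n) nodes
instead of, on average, n/2 list elements.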

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5fad489..30c3fc5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -44,6 +44,7 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *parent_sd = sd->s_parent;
 	struct sysfs_dirent **pos;
+	struct rb_node **new, *parent;
 
 	BUG_ON(sd->s_sibling);
 
@@ -57,6 +58,27 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
 	}
 	sd->s_sibling = *pos;
 	*pos = sd;
+
+	/* rbtree insert */
+	new = &(parent_sd->s_dir.child_rb_root.rb_node);
+	parent = NULL;
+
+	while (*new) {
+		struct sysfs_dirent *this =
+			container_of(*new, struct sysfs_dirent, s_rb_node);
+		int result = strcmp(sd->s_name, this->s_name);
+
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else if (result > 0)
+			new = &((*new)->rb_right);
+		else
+			BUG();
+	}
+
+	rb_link_node(&sd->s_rb_node, parent, new);
+	rb_insert_color(&sd->s_rb_node, &parent_sd->s_dir.child_rb_root);
 }
 
 /**
@@ -81,6 +103,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 			break;
 		}
 	}
+
+	rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
 }
 
 /**
@@ -331,6 +355,9 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 	sd->s_mode = mode;
 	sd->s_flags = type;
 
+	if (type == SYSFS_DIR)
+		sd->s_dir.child_rb_root = RB_ROOT;
+
 	return sd;
 
  err_out2:
@@ -630,11 +657,20 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 				       const unsigned char *name)
 {
-	struct sysfs_dirent *sd;
-
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
-		if (!strcmp(sd->s_name, name))
-			return sd;
+	struct rb_node *node = parent_sd->s_dir.child_rb_root.rb_node;
+
+	while (node) {
+		struct sysfs_dirent *data =
+			container_of(node, struct sysfs_dirent, s_rb_node);
+		int result;
+		result = strcmp(name, data->s_name);
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return data;
+	}
 	return NULL;
 }
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7..600109c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/rbtree.h>
 
 struct sysfs_open_dirent;
 
@@ -17,6 +18,7 @@ struct sysfs_elem_dir {
 	struct kobject		*kobj;
 	/* children list starts here and goes through sd->s_sibling */
 	struct sysfs_dirent	*children;
+	struct rb_root		child_rb_root;
 };
 
 struct sysfs_elem_symlink {
@@ -52,6 +54,7 @@ struct sysfs_dirent {
 	atomic_t		s_active;
 	struct sysfs_dirent	*s_parent;
 	struct sysfs_dirent	*s_sibling;
+	struct rb_node		s_rb_node;
 	const char		*s_name;
 
 	union {


* [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents
  2009-11-01 16:31 [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Benjamin LaHaise
@ 2009-11-01 16:32 ` Benjamin LaHaise
  2009-11-01 16:33   ` [PATCH 2/3] sysfs directory scaling: count number of children dirs Benjamin LaHaise
  2009-11-03  3:50 ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
  2009-11-03 10:41 ` Eric W. Biederman
  2 siblings, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-01 16:32 UTC (permalink / raw)
  To: Eric Dumazet, Greg Kroah-Hartman
  Cc: Eric W. Biederman, Octavian Purdila, netdev, Cosmin Ratiu, linux-kernel

When adding or removing large numbers of entries from sysfs directories, 
one hot spot is the link and unlink operations of sysfs.  Dirents are kept 
sorted by inode number, and a newly created dirent normally has a higher 
inode number than the current tail, so linking a new sysfs_dirent can be 
significantly sped up by starting at the end of the list rather than the 
beginning.  Unlink is made O(1) by turning the sibling list into a doubly 
linked list.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
diff -u b/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- b/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,11 +43,18 @@
 static void sysfs_link_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *parent_sd = sd->s_parent;
-	struct sysfs_dirent **pos;
+	struct sysfs_dirent **pos, *prev = NULL;
 	struct rb_node **new, *parent;
 
 	BUG_ON(sd->s_sibling);
 
+	if (parent_sd->s_dir.children_tail &&
+	    parent_sd->s_dir.children_tail->s_ino < sd->s_ino) {
+		prev = parent_sd->s_dir.children_tail;
+		pos = &prev->s_sibling;
+		goto got_it;
+	}
+
 	/* Store directory entries in order by ino.  This allows
 	 * readdir to properly restart without having to add a
 	 * cursor into the s_dir.children list.
@@ -55,8 +62,13 @@
 	for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
 		if (sd->s_ino < (*pos)->s_ino)
 			break;
+		prev = *pos;
 	}
+got_it:
+	if (prev == parent_sd->s_dir.children_tail)
+		parent_sd->s_dir.children_tail = sd;
 	sd->s_sibling = *pos;
+	sd->s_sibling_prev = prev;
 	*pos = sd;
 
 	/* rbtree insert */
@@ -93,17 +105,20 @@
  */
 static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 {
-	struct sysfs_dirent **pos;
-
-	for (pos = &sd->s_parent->s_dir.children; *pos;
-	     pos = &(*pos)->s_sibling) {
-		if (*pos == sd) {
-			*pos = sd->s_sibling;
-			sd->s_sibling = NULL;
-			break;
-		}
-	}
+	struct sysfs_dirent **pos, *prev = NULL;
 
+	prev = sd->s_sibling_prev;
+	if (prev)
+		pos = &prev->s_sibling;
+	else
+		pos = &sd->s_parent->s_dir.children;
+	if (sd == sd->s_parent->s_dir.children_tail)
+		sd->s_parent->s_dir.children_tail = prev;
+	*pos = sd->s_sibling;
+	if (sd->s_sibling)
+		sd->s_sibling->s_sibling_prev = prev;
+	
+	sd->s_sibling_prev = NULL;
 	rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
 }
 
diff -u b/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
--- b/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -17,7 +17,9 @@
 struct sysfs_elem_dir {
 	struct kobject		*kobj;
 	/* children list starts here and goes through sd->s_sibling */
+	
 	struct sysfs_dirent	*children;
+	struct sysfs_dirent	*children_tail;
 	struct rb_root		child_rb_root;
 };
 
@@ -54,6 +56,7 @@
 	atomic_t		s_active;
 	struct sysfs_dirent	*s_parent;
 	struct sysfs_dirent	*s_sibling;
+	struct sysfs_dirent	*s_sibling_prev;
 	struct rb_node		s_rb_node;
 	const char		*s_name;
 


* [PATCH 2/3] sysfs directory scaling: count number of children dirs
  2009-11-01 16:32 ` [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents Benjamin LaHaise
@ 2009-11-01 16:33   ` Benjamin LaHaise
  0 siblings, 0 replies; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-01 16:33 UTC (permalink / raw)
  To: Eric Dumazet, Greg Kroah-Hartman
  Cc: Eric W. Biederman, Octavian Purdila, netdev, Cosmin Ratiu, linux-kernel

sysfs_count_nlink() is a bottleneck during mass bring-up of network 
interfaces.  Eliminate this problem by tracking the number of child 
directories as they are linked and unlinked.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
diff -u b/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- b/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -70,6 +70,7 @@
 	sd->s_sibling = *pos;
 	sd->s_sibling_prev = prev;
 	*pos = sd;
+	parent_sd->s_nr_children_dir += (sysfs_type(sd) == SYSFS_DIR);
 
 	/* rbtree insert */
 	new = &(parent_sd->s_dir.child_rb_root.rb_node);
@@ -118,6 +119,7 @@
 	if (sd->s_sibling)
 		sd->s_sibling->s_sibling_prev = prev;
 	
+	sd->s_parent->s_nr_children_dir -= (sysfs_type(sd) == SYSFS_DIR);
 	sd->s_sibling_prev = NULL;
 	rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
 }
diff -u b/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
--- b/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -71,6 +71,8 @@
 	ino_t			s_ino;
 	umode_t			s_mode;
 	struct sysfs_inode_attrs *s_iattr;
+
+	int			s_nr_children_dir;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
only in patch2:
unchanged:
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -191,14 +191,7 @@ static struct lock_class_key sysfs_inode_imutex_key;
 
 static int sysfs_count_nlink(struct sysfs_dirent *sd)
 {
-	struct sysfs_dirent *child;
-	int nr = 0;
-
-	for (child = sd->s_dir.children; child; child = child->s_sibling)
-		if (sysfs_type(child) == SYSFS_DIR)
-			nr++;
-
-	return nr + 2;
+	return sd->s_nr_children_dir + 2;
 }
 
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-01 16:31 [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Benjamin LaHaise
  2009-11-01 16:32 ` [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents Benjamin LaHaise
@ 2009-11-03  3:50 ` Greg KH
  2009-11-03  6:14   ` Eric Dumazet
  2009-11-03 20:01   ` Benjamin LaHaise
  2009-11-03 10:41 ` Eric W. Biederman
  2 siblings, 2 replies; 20+ messages in thread
From: Greg KH @ 2009-11-03  3:50 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Eric Dumazet, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> Use an rbtree in sysfs_dirent to speed up file lookup times
> 
> Systems with large numbers (tens of thousands and more) of network 
> interfaces stress the sysfs code in ways that make the linear search for 
> a name match take far too long.  Avoid this by using an rbtree.

What kind of speedups are you seeing here?  And do these changes cause a
memory increase due to the structure changes that outweighs the
speedups?

What kind of test are you doing to reproduce this?

thanks,

greg k-h


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03  3:50 ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
@ 2009-11-03  6:14   ` Eric Dumazet
  2009-11-03  7:01     ` [PATCH] sysctl: reduce ram usage by 40 % Eric Dumazet
  2009-11-03 16:07     ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
  2009-11-03 20:01   ` Benjamin LaHaise
  1 sibling, 2 replies; 20+ messages in thread
From: Eric Dumazet @ 2009-11-03  6:14 UTC (permalink / raw)
  To: Greg KH
  Cc: Benjamin LaHaise, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

Greg KH wrote:
> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> Use an rbtree in sysfs_dirent to speed up file lookup times
>>
>> Systems with large numbers (tens of thousands and more) of network 
>> interfaces stress the sysfs code in ways that make the linear search for 
>> a name match take far too long.  Avoid this by using an rbtree.
> 
> What kind of speedups are you seeing here?  And do these changes cause a
> memory increase due to the structure changes that outweighs the
> speedups?
> 
> What kind of test are you doing to reproduce this?
> 

It's curious, because in my tests the biggest problems come from
kernel/sysctl.c (__register_sysctl_paths) consuming 80% of the CPU
in the following attempt to create 20,000 devices

(disable hotplug before trying this, and ipv6 too!)
modprobe dummy numdummies=20000

I believe we should address __register_sysctl_paths() scalability
problems too.

I don't know what the 'sentinel' that we allocate after each struct
ctl_table is, but I suspect we could reduce its size requirement to
include only the fields the sentinel needs (and move them to the start
of ctl_table).

        /*
         * For each path component, allocate a 2-element ctl_table array.
         * The first array element will be filled with the sysctl entry
         * for this, the second will be the sentinel (ctl_name == 0).
         *
         * We allocate everything in one go so that we don't have to
         * worry about freeing additional memory in unregister_sysctl_table.
         */
        header = kzalloc(sizeof(struct ctl_table_header) +
                         (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);

Then we could add an rb_node to ctl_table_header to speed up __register_sysctl_paths() a bit.


* [PATCH] sysctl: reduce ram usage by 40 %
  2009-11-03  6:14   ` Eric Dumazet
@ 2009-11-03  7:01     ` Eric Dumazet
  2009-11-03 10:23       ` Eric W. Biederman
  2009-11-03 16:07     ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
  1 sibling, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2009-11-03  7:01 UTC (permalink / raw)
  To: David S. Miller, Greg KH
  Cc: Benjamin LaHaise, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

Eric Dumazet wrote:

> It's curious, because in my tests the biggest problems come from
> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of the CPU
> in the following attempt to create 20,000 devices
> 
> (disable hotplug before trying this, and ipv6 too!)
> modprobe dummy numdummies=20000
> 
> I believe we should address __register_sysctl_paths() scalability
> problems too.
> 
> I don't know what the 'sentinel' that we allocate after each struct
> ctl_table is, but I suspect we could reduce its size requirement to
> include only the fields the sentinel needs (and move them to the start
> of ctl_table).
> 

Here is the patch to reduce the RAM usage of sysctl:

[PATCH] sysctl: reduce ram usage by 40 %

We currently reserve space for a so-called sentinel, a full struct ctl_table,
for each ctl_table. We can cheat a bit, since the only fields a sentinel
needs are ctl_name and procname. Add a new structure (struct ctl_table_sentinel)
that includes a full ctl_table plus only the required part of a sentinel.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/sysctl.h |   13 ++++++++++++-
 kernel/sysctl.c        |   19 ++++++++++---------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743e..6a1b1d5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1050,8 +1050,10 @@ extern ctl_handler sysctl_ms_jiffies;
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
-	int ctl_name;			/* Binary ID */
+	/* ctl_name and procname must be first fields (check sentinel) */
+	int	   ctl_name;		/* Binary ID */
 	const char *procname;		/* Text ID for /proc/sys, or zero */
+
 	void *data;
 	int maxlen;
 	mode_t mode;
@@ -1063,6 +1065,15 @@ struct ctl_table
 	void *extra2;
 };
 
+/* ctl_table_sentinel: a ctl_table followed by a sentinel
+ * (NULL ctl_name & procname)
+ */
+struct ctl_table_sentinel {
+	struct ctl_table table;
+	int		 ctl_name;
+	const char	 *procname;
+};
+
 struct ctl_table_root {
 	struct list_head root_list;
 	struct ctl_table_set default_set;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c5..5d29dd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2063,7 +2063,8 @@ struct ctl_table_header *__register_sysctl_paths(
 	const struct ctl_path *path, struct ctl_table *table)
 {
 	struct ctl_table_header *header;
-	struct ctl_table *new, **prevp;
+	struct ctl_table_sentinel *new;
+	struct ctl_table **prevp;
 	unsigned int n, npath;
 	struct ctl_table_set *set;
 
@@ -2080,24 +2081,24 @@ struct ctl_table_header *__register_sysctl_paths(
 	 * worry about freeing additional memory in unregister_sysctl_table.
 	 */
 	header = kzalloc(sizeof(struct ctl_table_header) +
-			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
+			 (npath * sizeof(struct ctl_table_sentinel)), GFP_KERNEL);
 	if (!header)
 		return NULL;
 
-	new = (struct ctl_table *) (header + 1);
+	new = (struct ctl_table_sentinel *) (header + 1);
 
 	/* Now connect the dots */
 	prevp = &header->ctl_table;
 	for (n = 0; n < npath; ++n, ++path) {
 		/* Copy the procname */
-		new->procname = path->procname;
-		new->ctl_name = path->ctl_name;
-		new->mode     = 0555;
+		new->table.procname = path->procname;
+		new->table.ctl_name = path->ctl_name;
+		new->table.mode     = 0555;
 
-		*prevp = new;
-		prevp = &new->child;
+		*prevp = &new->table;
+		prevp = &new->table.child;
 
-		new += 2;
+		new++;
 	}
 	*prevp = table;
 	header->ctl_table_arg = table;
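
A self-contained sketch of why this is safe and where the "40 %" comes
from: the terminator checks only ever do entry++ and test ctl_name and
procname, which are the leading fields of ctl_table, so the truncated
sentinel lands exactly where the next full array element would start.
(Field types below are stand-ins for the real sysctl.h definitions; the
byte counts assume LP64.)

	#include <stddef.h>
	#include <stdio.h>
	#include <sys/types.h>

	struct ctl_table {			/* stand-in layout */
		int ctl_name;			/* must stay first ... */
		const char *procname;		/* ... and second */
		void *data;
		int maxlen;
		mode_t mode;
		struct ctl_table *child;
		struct ctl_table *parent;
		void *proc_handler;
		void *strategy;
		void *extra1;
		void *extra2;
	};

	struct ctl_table_sentinel {
		struct ctl_table table;
		int ctl_name;
		const char *procname;
	};

	int main(void)
	{
		/* entry++ past &s->table lands on the sentinel pair */
		printf("sizeof(ctl_table)=%zu, sentinel ctl_name at %zu\n",
		       sizeof(struct ctl_table),
		       offsetof(struct ctl_table_sentinel, ctl_name));
		/* per path component: two full tables before,
		 * one table plus a short sentinel after */
		printf("before: %zu bytes, after: %zu bytes\n",
		       2 * sizeof(struct ctl_table),
		       sizeof(struct ctl_table_sentinel));
		return 0;
	}

On LP64 that prints 160 bytes before versus 96 bytes after per path
component, i.e. the 40 % in the subject line.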


* Re: [PATCH] sysctl: reduce ram usage by 40 %
  2009-11-03  7:01     ` [PATCH] sysctl: reduce ram usage by 40 % Eric Dumazet
@ 2009-11-03 10:23       ` Eric W. Biederman
  0 siblings, 0 replies; 20+ messages in thread
From: Eric W. Biederman @ 2009-11-03 10:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Greg KH, Benjamin LaHaise, Octavian Purdila,
	netdev, Cosmin Ratiu, linux-kernel

Eric Dumazet <eric.dumazet@gmail.com> writes:

> Eric Dumazet wrote:
>
>> It's curious, because in my tests the biggest problems come from
>> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of the CPU
>> in the following attempt to create 20,000 devices

I bet that is Al's cute 'glue all the sysctl data structures together'
patch.  It improves readdir and lookup at a small cost at registration
time.

>> (disable hotplug before trying this, and ipv6 too!)
>> modprobe dummy numdummies=20000


>> I believe we should address __register_sysctl_paths() scalability
>> problems too.

Agreed.

>> I don't know what the 'sentinel' that we allocate after each struct
>> ctl_table is, but I suspect we could reduce its size requirement to
>> include only the fields the sentinel needs (and move them to the start
>> of ctl_table).

The sentinel is just a NULL terminator.

> Here is the patch to reduce ram usage of sysctl :
>
> [PATCH] sysctl: reduce ram usage by 40 %
>
> We currently reserve space for a so-called sentinel, a full struct ctl_table,
> for each ctl_table. We can cheat a bit, since the only fields a sentinel
> needs are ctl_name and procname. Add a new structure (struct ctl_table_sentinel)
> that includes a full ctl_table plus only the required part of a sentinel.

Before we address sysctl I would like to get out my patchset that
makes sys_sysctl a wrapper around the ASCII version of
/proc/sys/net.  Once that goes in it becomes much easier to do things
and perform radical surgery on sysctl.  Little things like .ctl_name and
.strategy go away.

Have you happened to look at the other cost of /proc proper?  Hmm.
Except for /proc/net/dev_snmp6, it doesn't look like we keep
per-interface directories in proc, so without ipv6 you won't see the
proc generic code at all.

The practical consequence is that if /proc/net/dev_snmp6 is not painful
during registration right now, we can probably convert all of
/proc/sys/net to proc generic after my other changes are in.

Eric


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-01 16:31 [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Benjamin LaHaise
  2009-11-01 16:32 ` [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents Benjamin LaHaise
  2009-11-03  3:50 ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
@ 2009-11-03 10:41 ` Eric W. Biederman
  2 siblings, 0 replies; 20+ messages in thread
From: Eric W. Biederman @ 2009-11-03 10:41 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Eric Dumazet, Greg Kroah-Hartman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

Benjamin LaHaise <bcrl@lhnet.ca> writes:

> Use an rbtree in sysfs_dirent to speed up file lookup times
>
> Systems with large numbers (tens of thousands and more) of network 
> interfaces stress the sysfs code in ways that make the linear search for 
> a name match take far too long.  Avoid this by using an rbtree.

Please take a look at the cleanups_scaling branch at:
kernel.org:/pub/scm/linux/kernel/git/ebiederm/linux-2.6.32-rc5-sysfs-enhancements

I haven't spent a lot of time on it, but it is possible to get everything
except the rbtree without increasing the size of sysfs_dirent.  Also, we
don't need both the rbtree and a linked list.

In particular see:
commit 50623bbb82da3bd1d596b9173a91ed1b5aa168b8
Author: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>
Date:   Sat Oct 31 04:11:18 2009 -0700

    sysfs: Sort sysfs directories by name hash.
    
    This is a step in preparation for introducing a more efficient
    data structure than a linked list for sysfs entries.  By ordering
    by name hash instead of by inode, sysfs_lookup can be sped up
    as well as allowing restarting after seekdir.
    
    Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>

Meanwhile back to pushing the most important ones for real.

Eric


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03  6:14   ` Eric Dumazet
  2009-11-03  7:01     ` [PATCH] sysctl: reduce ram usage by 40 % Eric Dumazet
@ 2009-11-03 16:07     ` Greg KH
  2009-11-03 16:38       ` Octavian Purdila
                         ` (2 more replies)
  1 sibling, 3 replies; 20+ messages in thread
From: Greg KH @ 2009-11-03 16:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Benjamin LaHaise, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

On Tue, Nov 03, 2009 at 07:14:33AM +0100, Eric Dumazet wrote:
> Greg KH wrote:
> > On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> >> Use an rbtree in sysfs_dirent to speed up file lookup times
> >>
> >> Systems with large numbers (tens of thousands and more) of network 
> >> interfaces stress the sysfs code in ways that make the linear search for 
> >> a name match take far too long.  Avoid this by using an rbtree.
> > 
> > What kind of speedups are you seeing here?  And do these changes cause a
> > memory increase due to the structure changes that outweighs the
> > speedups?
> > 
> > What kind of test are you doing to reproduce this?
> > 
> 
> It's curious, because in my tests the biggest problems come from
> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of the CPU
> in the following attempt to create 20,000 devices
> 
> (disable hotplug before trying this, and ipv6 too!)
> modprobe dummy numdummies=20000
> 
> I believe we should address __register_sysctl_paths() scalability
> problems too.

But registering 20000 devices is a far different problem from using
those 20000 devices :)

I think the "use the device" path should be the one we care the most
about fixing up, as that is much more common than the register path for
all users.

thanks,

greg k-h


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 16:07     ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
@ 2009-11-03 16:38       ` Octavian Purdila
  2009-11-03 16:45       ` Benjamin LaHaise
  2009-11-03 22:28       ` Eric W. Biederman
  2 siblings, 0 replies; 20+ messages in thread
From: Octavian Purdila @ 2009-11-03 16:38 UTC (permalink / raw)
  To: Greg KH
  Cc: Eric Dumazet, Benjamin LaHaise, Eric W. Biederman, netdev,
	Cosmin Ratiu, linux-kernel

On Tuesday 03 November 2009 18:07:15 you wrote:

> > > What kind of test are you doing to reproduce this?
> >
> > It's curious, because in my tests the biggest problems come from
> > kernel/sysctl.c (__register_sysctl_paths) consuming 80% of the CPU
> > in the following attempt to create 20,000 devices
> >
> > (disable hotplug before trying this, and ipv6 too!)
> > modprobe dummy numdummies=20000
> >
> > I believe we should address __register_sysctl_paths() scalability
> > problems too.
> 
> But registering 20000 devices is a far different problem from using
> those 20000 devices :)
> 
> I think the "use the device" path should be the one we care the most
> about fixing up, as that is much more common than the register path for
> all users.
> 

For sysctl in general, probably, but I would argue that for dynamic network 
interfaces (ppp and other sorts of tunnels) the "use" and "register" paths 
are not that unbalanced.

For our case, where we use up to 128K interfaces, per-network-interface 
sysctl entries are pretty much unusable - but I agree that is not a very 
common case :)

However, [1] is not so far-fetched.

[1] http://www.spinics.net/lists/netdev/msg110392.html

Thanks,
tavi


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 16:07     ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
  2009-11-03 16:38       ` Octavian Purdila
@ 2009-11-03 16:45       ` Benjamin LaHaise
  2009-11-03 17:56         ` Greg KH
  2009-11-03 22:28       ` Eric W. Biederman
  2 siblings, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-03 16:45 UTC (permalink / raw)
  To: Greg KH
  Cc: Eric Dumazet, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

On Tue, Nov 03, 2009 at 08:07:15AM -0800, Greg KH wrote:
> But registering 20000 devices is a far different problem from using
> those 20000 devices :)

Registering 20,000 devices *is* a real-world problem (I'm actually aiming 
for 100,000, as that's roughly what fits in a single 10Gbps link -- something 
that a mid-range system can now route).  When an edge router comes up from 
reboot, or after a link has been down, the rate at which customers connect 
is important -- too slow, and you get a pile of support calls from customers 
complaining that their connection is down.  Because of the data structures 
used, there isn't even any improvement from an SMP system, so this needs 
to be addressed directly.

		-ben


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 16:45       ` Benjamin LaHaise
@ 2009-11-03 17:56         ` Greg KH
  0 siblings, 0 replies; 20+ messages in thread
From: Greg KH @ 2009-11-03 17:56 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Eric Dumazet, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

On Tue, Nov 03, 2009 at 11:45:50AM -0500, Benjamin LaHaise wrote:
> On Tue, Nov 03, 2009 at 08:07:15AM -0800, Greg KH wrote:
> > But registering 20000 devices is a far different problem from using
> > those 20000 devices :)
> 
> Registering 20,000 devices *is* a real-world problem (I'm actually aiming 
> for 100,000, as that's roughly what fits in a single 10Gbps link -- something 
> that a mid-range system can now route).  When an edge router comes up from 
> reboot, or after a link has been down, the rate at which customers connect 
> is important -- too slow, and you get a pile of support calls from customers 
> complaining that their connection is down.  Because of the data structures 
> used, there isn't even any improvement from an SMP system, so this needs 
> to be addressed directly.

Ok, how long are we talking about here?

thanks,

greg k-h


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03  3:50 ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
  2009-11-03  6:14   ` Eric Dumazet
@ 2009-11-03 20:01   ` Benjamin LaHaise
  2009-11-03 21:32     ` Eric W. Biederman
  1 sibling, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-03 20:01 UTC (permalink / raw)
  To: Greg KH
  Cc: Eric Dumazet, Eric W. Biederman, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> > Use an rbtree in sysfs_dirent to speed up file lookup times
> > 
> > Systems with large numbers (tens of thousands and more) of network 
> > interfaces stress the sysfs code in ways that make the linear search for 
> > a name match take far too long.  Avoid this by using an rbtree.
> 
> What kind of speedups are you seeing here?  And do these changes cause a
> memory increase due to the structure changes that outweighs the
> speedups?

Depends on the number of interfaces being created.  Without the patch, 
interface creation time tends to double or worse for every 5,000-10,000 
additional network interfaces.

> What kind of test are you doing to reproduce this?

I'm creating 30,000+ network interfaces, with the goal being 100,000.  
With other hacks in the tree to get around the sysctl and procfs scaling 
issues, as well as disabling things like NetworkManager, the results look 
as follows:

	Interfaces	no-rb	rbtree	rbtree+list
	0-5,000		13.8s	14.0s	13.0s
	5,000-10,000	20.0s	17.4s	14.4s
	10,000-15,000	27.3s	24.1s	16.9s
	15,000-20,000	36.3s	32.2s	19.7s
	20,000-25,000	45.2s	40.0s	22.9s
	25,000-30,000	54.2s	48.2s	26.6s
	30,000-35,000	63.9s	54.9s	30.7s

Thoughts?

		-ben


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 20:01   ` Benjamin LaHaise
@ 2009-11-03 21:32     ` Eric W. Biederman
  2009-11-03 21:43       ` Eric W. Biederman
  2009-11-03 21:52       ` Benjamin LaHaise
  0 siblings, 2 replies; 20+ messages in thread
From: Eric W. Biederman @ 2009-11-03 21:32 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Greg KH, Eric Dumazet, Octavian Purdila, netdev, Cosmin Ratiu,
	linux-kernel

Benjamin LaHaise <bcrl@lhnet.ca> writes:

> On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
>> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> > Use an rbtree in sysfs_dirent to speed up file lookup times
>> > 
>> > Systems with large numbers (tens of thousands and more) of network 
>> > interfaces stress the sysfs code in ways that make the linear search for 
>> > a name match take far too long.  Avoid this by using an rbtree.
>> 
>> What kind of speedups are you seeing here?  And do these changes cause a
>> memory increase due to the structure changes that outweighs the
>> speedups?
>
> Depends on the number of interfaces being created.  Without the patch, 
> interface creation time tends to double or worse for every 5,000-10,000 
> additional network interfaces.
>
>> What kind of test are you doing to reproduce this?
>
> I'm creating 30,000+ network interfaces, with the goal being 100,000.  
> With other hacks in the tree to get around the sysctl and procfs scaling 
> issues, as well as disabling things like NetworkManager, the results look 
> as follows:
>
> 	Interfaces	no-rb	rbtree	rbtree+list
> 	0-5,000		13.8s	14.0s	13.0s
> 	5,000-10,000	20.0s	17.4s	14.4s
> 	10,000-15,000	27.3s	24.1s	16.9s
> 	15,000-20,000	36.3s	32.2s	19.7s
> 	20,000-25,000	45.2s	40.0s	22.9s
> 	25,000-30,000	54.2s	48.2s	26.6s
> 	30,000-35,000	63.9s	54.9s	30.7s
>
> Thoughts?

Something is very weird.  I just took your no-rb numbers
and divided by the number of interfaces to see how well we are
scaling.  I got:

 	Interfaces	per-interface cost
 	5,000		0.002760s
 	10,000		0.002000s
 	15,000		0.001820s
 	20,000		0.001815s
 	25,000		0.001808s
 	30,000		0.001807s
 	35,000		0.001826s
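
(For reference, the arithmetic behind the table above: each posted no-rb
batch time divided by the cumulative interface count.  A sketch of one
reading of the numbers; as the follow-ups clarify, the posted times are
per-5,000-interface deltas, so this reading understates the scaling
problem.)

	#include <stdio.h>

	int main(void)
	{
		/* no-rb column from the table upthread */
		double batch[] = { 13.8, 20.0, 27.3, 36.3, 45.2, 54.2, 63.9 };
		int i;

		for (i = 0; i < 7; i++)
			printf("%6d  %.6fs\n", (i + 1) * 5000,
			       batch[i] / ((i + 1) * 5000));
		return 0;
	}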

I ran a variant of this test a long time ago, and at that time the
cost per interface grew with additional interfaces added.  This
jibes with the fact that the fundamental cost of adding N
network interfaces to sysfs is O(N^2).

Are your numbers from your application, and are they real world?
In that case they are interesting, but it would be good if
we could also have microbenchmark numbers that just measure
the sysfs costs.  If nothing else, I am seeing a big startup
overhead that isn't being subtracted out, which makes it hard
to see the real costs here.

Eric


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 21:32     ` Eric W. Biederman
@ 2009-11-03 21:43       ` Eric W. Biederman
  2009-11-03 21:56         ` Benjamin LaHaise
  2009-11-03 21:52       ` Benjamin LaHaise
  1 sibling, 1 reply; 20+ messages in thread
From: Eric W. Biederman @ 2009-11-03 21:43 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Greg KH, Eric Dumazet, Octavian Purdila, netdev, Cosmin Ratiu,
	linux-kernel

ebiederm@xmission.com (Eric W. Biederman) writes:

> Benjamin LaHaise <bcrl@lhnet.ca> writes:
>
>> On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
>>> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>>> > Use an rbtree in sysfs_dirent to speed up file lookup times
>>> > 
>>> > Systems with large numbers (tens of thousands and more) of network 
>>> > interfaces stress the sysfs code in ways that make the linear search for 
>>> > a name match take far too long.  Avoid this by using an rbtree.
>>> 
>>> What kind of speedups are you seeing here?  And do these changes cause a
>>> memory increase due to the structure changes that outweighs the
>>> speedups?
>>
>> Depends on the number of interfaces being created.  Without the patch, 
>> interface creation time tends to double or worse for every 5,000-10,000 
>> additional network interfaces.
>>
>>> What kind of test are you doing to reproduce this?
>>
>> I'm creating 30,000+ network interfaces, with the goal being 100,000.  
>> With other hacks in the tree to get around the sysctl and procfs scaling 
>> issues, as well as disabling things like NetworkManager, the results look 
>> as follows:
>>
>> 	Interfaces	no-rb	rbtree	rbtree+list
>> 	0-5,000		13.8s	14.0s	13.0s
>> 	5,000-10,000	20.0s	17.4s	14.4s
>> 	10,000-15,000	27.3s	24.1s	16.9s
>> 	15,000-20,000	36.3s	32.2s	19.7s
>> 	20,000-25,000	45.2s	40.0s	22.9s
>> 	25,000-30,000	54.2s	48.2s	26.6s
>> 	30,000-35,000	63.9s	54.9s	30.7s
>>
>> Thoughts?
>
> Something is very weird.  I just took your no-rb numbers
> and divided by the number of interfaces to see how well we are
> scaling.  I got:
>
>  	Interfaces	per-interface cost
>  	5,000		0.002760s
>  	10,000		0.002000s
>  	15,000		0.001820s
>  	20,000		0.001815s
>  	25,000		0.001808s
>  	30,000		0.001807s
>  	35,000		0.001826s
>
> I ran a variant of this test a long time ago, and at that time the
> cost per interface grew with additional interfaces added.  This
> jibes with the fact that the fundamental cost of adding N
> network interfaces to sysfs is O(N^2).
>
> Are your numbers from your application, and are they real world?
> In that case they are interesting, but it would be good if
> we could also have microbenchmark numbers that just measure
> the sysfs costs.  If nothing else, I am seeing a big startup
> overhead that isn't being subtracted out, which makes it hard
> to see the real costs here.

I guess in particular what I would expect is that if we can do 35,000
interfaces in 63s with an O(N^2) algorithm, then we should be able to
do 35,000 interfaces with an O(N log N) algorithm in under a second.
For your application that should make the time essentially flat in
the number of interfaces.
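
(Back-of-envelope, taking 35,000 interfaces at 63s at face value and
assuming the constant factor carries over between the two cost models:)

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double n = 35000.0, t_quadratic = 63.0;
		double c = t_quadratic / (n * n);	/* constant from the O(N^2) fit */

		/* same constant applied to an O(N log N) model */
		printf("estimated O(N log N) time: %.2fs\n", c * n * log2(n));
		return 0;
	}

That prints roughly 0.03s, comfortably consistent with the "under a
second" estimate above (build with -lm).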

Until we get close to that I figure we need to do more digging.

Eric


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 21:32     ` Eric W. Biederman
  2009-11-03 21:43       ` Eric W. Biederman
@ 2009-11-03 21:52       ` Benjamin LaHaise
  2009-11-03 22:18         ` Eric W. Biederman
  1 sibling, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-03 21:52 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Greg KH, Eric Dumazet, Octavian Purdila, netdev, Cosmin Ratiu,
	linux-kernel

On Tue, Nov 03, 2009 at 01:32:33PM -0800, Eric W. Biederman wrote:
> Are your numbers from your application, and are they real world?
> In that case they are interesting, but it would be good if
> we could also have microbenchmark numbers that just measure
> the sysfs costs.  If nothing else, I am seeing a big startup
> overhead that isn't being subtracted out, which makes it hard
> to see the real costs here.

They're application-based, so there's a bunch of other overhead included 
that won't show up in a microbenchmark.  Each interface requires a round 
trip between two L2TP daemons, so there are lots of syscalls and other 
cache-polluting effects that a microbenchmark won't capture.  One of the 
L2TP daemons is configured not to instantiate any kernel state -- running 
in this mode, it has very little overhead.

The other thing to note is that the costs posted are how long it takes to 
add an additional 5,000 interfaces in the given range, not the total time 
to add, say, 35,000 interfaces (I didn't feel like waiting that long).

		-ben


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 21:43       ` Eric W. Biederman
@ 2009-11-03 21:56         ` Benjamin LaHaise
  2009-11-03 22:06           ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2009-11-03 21:56 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Greg KH, Eric Dumazet, Octavian Purdila, netdev, Cosmin Ratiu,
	linux-kernel

On Tue, Nov 03, 2009 at 01:43:43PM -0800, Eric W. Biederman wrote:
> I guess in particular what I would expect is that if we can do 35,000
> interfaces in 63s with an O(N^2) algorithm, then we should be able to
> do 35,000 interfaces with an O(N log N) algorithm in under a second.
> For your application that should make the time essentially flat in
> the number of interfaces.

That's the wrong way to interpret the numbers.  The 63s at 35,000 is the 
time it takes to add 5,000 more interfaces in the 30,000 to 35,000 range.  
This includes the time required to add a point-to-point IP route on the 
interface and bring the interface up.

		-ben


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 21:56         ` Benjamin LaHaise
@ 2009-11-03 22:06           ` Eric Dumazet
  0 siblings, 0 replies; 20+ messages in thread
From: Eric Dumazet @ 2009-11-03 22:06 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Eric W. Biederman, Greg KH, Octavian Purdila, netdev, Cosmin Ratiu

Benjamin LaHaise wrote:
> On Tue, Nov 03, 2009 at 01:43:43PM -0800, Eric W. Biederman wrote:
>> I guess in particular what I would expect is that if we can do 35,000
>> interfaces in 63s with an O(N^2) algorithm, then we should be able to
>> do 35,000 interfaces with an O(N log N) algorithm in under a second.
>> For your application that should make the time essentially flat in
>> the number of interfaces.
> 
> That's the wrong way to interpret the numbers.  The 63s at 35,000 is the 
> time it takes to add 5,000 more interfaces in the 30,000 to 35,000 range.  
> This includes the time required to add a point-to-point IP route on the 
> interface and bring the interface up.

Speaking of pppol2tp, it seems /proc/net/pppol2tp is not safe,
with a strange two-phase locking...

(We keep, in struct pppol2tp_seq_data, pointers to structures
that might have been freed between two read() syscalls.)


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 21:52       ` Benjamin LaHaise
@ 2009-11-03 22:18         ` Eric W. Biederman
  0 siblings, 0 replies; 20+ messages in thread
From: Eric W. Biederman @ 2009-11-03 22:18 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Greg KH, Eric Dumazet, Octavian Purdila, netdev, Cosmin Ratiu,
	linux-kernel

Benjamin LaHaise <bcrl@lhnet.ca> writes:

> On Tue, Nov 03, 2009 at 01:32:33PM -0800, Eric W. Biederman wrote:
>> Are your numbers from your application, and are they real world?
>> In that case they are interesting, but it would be good if
>> we could also have microbenchmark numbers that just measure
>> the sysfs costs.  If nothing else, I am seeing a big startup
>> overhead that isn't being subtracted out, which makes it hard
>> to see the real costs here.
>
> They're application-based, so there's a bunch of other overhead included 
> that won't show up in a microbenchmark.  Each interface requires a round 
> trip between two L2TP daemons, so there are lots of syscalls and other 
> cache-polluting effects that a microbenchmark won't capture.  One of the 
> L2TP daemons is configured not to instantiate any kernel state -- running 
> in this mode, it has very little overhead.
>
> The other thing to note is that the costs posted are how long it takes to 
> add an additional 5,000 interfaces in the given range, not the total time 
> to add, say, 35,000 interfaces (I didn't feel like waiting that long).

Ok.  That makes a lot more sense.  The times you posted would ideally be
flat, but they go up from 12s to 60s.

Eric


* Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
  2009-11-03 16:07     ` [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups Greg KH
  2009-11-03 16:38       ` Octavian Purdila
  2009-11-03 16:45       ` Benjamin LaHaise
@ 2009-11-03 22:28       ` Eric W. Biederman
  2 siblings, 0 replies; 20+ messages in thread
From: Eric W. Biederman @ 2009-11-03 22:28 UTC (permalink / raw)
  To: Greg KH
  Cc: Eric Dumazet, Benjamin LaHaise, Octavian Purdila, netdev,
	Cosmin Ratiu, linux-kernel

Greg KH <greg@kroah.com> writes:

> On Tue, Nov 03, 2009 at 07:14:33AM +0100, Eric Dumazet wrote:
>> Greg KH wrote:
>> > On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> >> Use an rbtree in sysfs_dirent to speed up file lookup times
>> >>
>> >> Systems with large numbers (tens of thousands and more) of network 
>> >> interfaces stress the sysfs code in ways that make the linear search for 
>> >> a name match take far too long.  Avoid this by using an rbtree.
>> > 
>> > What kind of speedups are you seeing here?  And do these changes cause a
>> > memory increase due to the structure changes that outweighs the
>> > speedups?
>> > 
>> > What kind of test are you doing to reproduce this?
>> > 
>> 
>> It's curious, because in my tests the biggest problems come from
>> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of the CPU
>> in the following attempt to create 20,000 devices
>> 
>> (disable hotplug before trying this, and ipv6 too!)
>> modprobe dummy numdummies=20000
>> 
>> I believe we should address __register_sysctl_paths() scalability
>> problems too.
>
> But registering 20000 devices is a far different problem from using
> those 20000 devices :)
>
> I think the "use the device" path should be the one we care the most
> about fixing up, as that is much more common than the register path for
> all users.

Definitely.  Of the three (proc, sysctl and sysfs), sysctl tends to have
the worst costs across the board.  They are all rarely used, so a lot of
what gets hit when scaling are rare-path events where even the most
horrible code works fine on small systems.

Usually slow registration times indicate an O(N^2) or worse data
structure for filename lookup.

Eric

