All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next] dev: add support of flag IFF_NOPROC
@ 2013-10-03 13:28 Nicolas Dichtel
  2013-10-03 13:30 ` [PATCH iproute2 net-next-3.11] ip: add support of link " Nicolas Dichtel
  2013-10-03 17:46 ` [PATCH net-next] dev: add support of " Stephen Hemminger
  0 siblings, 2 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2013-10-03 13:28 UTC (permalink / raw)
  To: netdev; +Cc: davem, Nicolas Dichtel

This flag allows to create netdevices without creating directories in
/proc, ie no /proc/sys/net/ipv[4|6]/[conf|neigh]/<dev> and no
/proc/net/dev_snmp6/<dev>.

When a system creates a lot of virtual netdevices, this allows to speed up the
creation time. For systems which continuously create and destroy virtual
netdevices, proc entries for these netdevices may not be used, hence adding this
flag is interesting.

Note that the flag should be specified at the creation time (before calling
register_netdevice()) and cannot be removed during the life of the netdevice.

Here are some numbers:

dummy20000.batch contains 20 000 times 'link add type dummy' and
dummy20000-noproc.batch 20 000 times 'link add noproc type dummy'.

time ip -b dummy20000.batch
real    0m56.367s
user    0m0.200s
sys     0m53.070s

time ip -b dummy20000-noproc.batch
real    0m42.417s
user    0m0.310s
sys     0m38.470s

Suggested-by: Thierry Herbelot <thierry.herbelot@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/uapi/linux/if.h | 2 ++
 net/core/dev.c          | 2 +-
 net/core/rtnetlink.c    | 1 +
 net/ipv4/devinet.c      | 3 +++
 net/ipv6/addrconf.c     | 3 +++
 net/ipv6/proc.c         | 5 +++++
 6 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 1ec407b01e46..bb9fe5eb38bf 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -53,6 +53,8 @@
 
 #define IFF_ECHO	0x40000		/* echo sent packets		*/
 
+#define IFF_NOPROC	0x80000		/* no proc/sysctl directories	*/
+
 #define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
 		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c25db20a4246..13f6dd360c74 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5199,7 +5199,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 			       IFF_AUTOMEDIA)) |
 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
-				    IFF_ALLMULTI));
+				    IFF_ALLMULTI | IFF_NOPROC));
 
 	/*
 	 *	Load in the correct multicast list now the flags have changed.
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4aedf03da052..5bad28e66fa2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1860,6 +1860,7 @@ replay:
 		}
 
 		dev->ifindex = ifm->ifi_index;
+		dev->flags |= ifm->ifi_flags & IFF_NOPROC;
 
 		if (ops->newlink)
 			err = ops->newlink(net, dev, tb, data);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a1b5bcbd04ae..13b4089d8996 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2160,6 +2160,9 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
 
 static void devinet_sysctl_register(struct in_device *idev)
 {
+	if (idev->dev->flags & IFF_NOPROC)
+		return;
+
 	neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
 	__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
 					&idev->cnf);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cd3fb301da38..e06d15ea2dba 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5032,6 +5032,9 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
 
 static void addrconf_sysctl_register(struct inet6_dev *idev)
 {
+	if (idev->dev->flags & IFF_NOPROC)
+		return;
+
 	neigh_sysctl_register(idev->dev, idev->nd_parms, "ipv6",
 			      &ndisc_ifinfo_sysctl_change);
 	__addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 091d066a57b3..f89911116aa7 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -274,6 +274,9 @@ int snmp6_register_dev(struct inet6_dev *idev)
 	if (!idev || !idev->dev)
 		return -EINVAL;
 
+	if (idev->dev->flags & IFF_NOPROC)
+		return 0;
+
 	net = dev_net(idev->dev);
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
@@ -291,6 +294,8 @@ int snmp6_register_dev(struct inet6_dev *idev)
 int snmp6_unregister_dev(struct inet6_dev *idev)
 {
 	struct net *net = dev_net(idev->dev);
+	if (idev->dev->flags & IFF_NOPROC)
+		return 0;
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 	if (!idev->stats.proc_dir_entry)
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH iproute2 net-next-3.11] ip: add support of link flag IFF_NOPROC
  2013-10-03 13:28 [PATCH net-next] dev: add support of flag IFF_NOPROC Nicolas Dichtel
@ 2013-10-03 13:30 ` Nicolas Dichtel
  2013-10-03 17:46 ` [PATCH net-next] dev: add support of " Stephen Hemminger
  1 sibling, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2013-10-03 13:30 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, davem, Nicolas Dichtel

When this flag is specified, /proc/sys/net/ipv[4|6]/[conf|neigh]/<dev> and
/proc/net/dev_snmp6/<dev> directories are not created.

This flag cannot be removed.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/linux/if.h    | 2 ++
 ip/ipaddress.c        | 1 +
 ip/iplink.c           | 3 +++
 man/man8/ip-link.8.in | 8 ++++++++
 4 files changed, 14 insertions(+)

diff --git a/include/linux/if.h b/include/linux/if.h
index 7f261c08e816..5b8a5ebff599 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -53,6 +53,8 @@
 
 #define IFF_ECHO	0x40000		/* echo sent packets		*/
 
+#define IFF_NOPROC	0x80000		/* no proc/sysctl directories	*/
+
 #define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
 		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
 
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 1c3e4da0d0da..b2e35028c844 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -116,6 +116,7 @@ static void print_link_flags(FILE *fp, unsigned flags, unsigned mdown)
 	_PF(LOWER_UP);
 	_PF(DORMANT);
 	_PF(ECHO);
+	_PF(NOPROC);
 #undef _PF
 	if (flags)
 		fprintf(fp, "%x", flags);
diff --git a/ip/iplink.c b/ip/iplink.c
index ada9d4255ba2..253ed1cc3f6f 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -50,6 +50,7 @@ void iplink_usage(void)
 		fprintf(stderr, "                   [ mtu MTU ]\n");
 		fprintf(stderr, "                   [ numtxqueues QUEUE_COUNT ]\n");
 		fprintf(stderr, "                   [ numrxqueues QUEUE_COUNT ]\n");
+		fprintf(stderr, "                   [ noproc ]\n");
 		fprintf(stderr, "                   type TYPE [ ARGS ]\n");
 		fprintf(stderr, "       ip link delete DEV type TYPE [ ARGS ]\n");
 		fprintf(stderr, "\n");
@@ -480,6 +481,8 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
 				invarg("Invalid \"numrxqueues\" value\n", *argv);
 			addattr_l(&req->n, sizeof(*req), IFLA_NUM_RX_QUEUES,
 				  &numrxqueues, 4);
+		} else if (matches(*argv, "noproc") == 0) {
+			req->i.ifi_flags |= IFF_NOPROC;
 		} else {
 			if (strcmp(*argv, "dev") == 0) {
 				NEXT_ARG();
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 76f92ddbd82c..b16d1a1f8a41 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -45,6 +45,8 @@ ip-link \- network device configuration
 .RB "[ " numrxqueues
 .IR QUEUE_COUNT " ]"
 .br
+.RB "[ " noproc " ]"
+.br
 .BR type " TYPE"
 .RI "[ " ARGS " ]"
 
@@ -197,6 +199,12 @@ specifies the number of transmit queues for new device.
 specifies the number of receive queues for new device.
 
 .TP
+.BI noproc
+specifies to no create iface related directories under /proc
+(/proc/sys/net/ipv[4|6]/[conf|neigh]/<dev> and
+/proc/net/dev_snmp6/<dev>)
+
+.TP
 VXLAN Type Support
 For a link of type 
 .I VXLAN
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH net-next] dev: add support of flag IFF_NOPROC
  2013-10-03 13:28 [PATCH net-next] dev: add support of flag IFF_NOPROC Nicolas Dichtel
  2013-10-03 13:30 ` [PATCH iproute2 net-next-3.11] ip: add support of link " Nicolas Dichtel
@ 2013-10-03 17:46 ` Stephen Hemminger
  2013-10-03 19:09   ` David Miller
  1 sibling, 1 reply; 31+ messages in thread
From: Stephen Hemminger @ 2013-10-03 17:46 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: netdev, davem

On Thu,  3 Oct 2013 15:28:25 +0200
Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:

> This flag allows to create netdevices without creating directories in
> /proc, ie no /proc/sys/net/ipv[4|6]/[conf|neigh]/<dev> and no
> /proc/net/dev_snmp6/<dev>.
> 
> When a system creates a lot of virtual netdevices, this allows to speed up the
> creation time. For systems which continuously create and destroy virtual
> netdevices, proc entries for these netdevices may not be used, hence adding this
> flag is interesting.
> 
> Note that the flag should be specified at the creation time (before calling
> register_netdevice()) and cannot be removed during the life of the netdevice.
> 
> Here are some numbers:
> 
> dummy20000.batch contains 20 000 times 'link add type dummy' and
> dummy20000-noproc.batch 20 000 times 'link add noproc type dummy'.
> 
> time ip -b dummy20000.batch
> real    0m56.367s
> user    0m0.200s
> sys     0m53.070s
> 
> time ip -b dummy20000-noproc.batch
> real    0m42.417s
> user    0m0.310s
> sys     0m38.470s
> 
> Suggested-by: Thierry Herbelot <thierry.herbelot@6wind.com>
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

Seems like a special case. The problem is that you just created devices
that are unmanageable and might well break other tools in the system.
What about speeding up proc or sysfs? Or providing a bulk create/destroy.
Also if you used a custom program it could have seperate netlink send
and receive threads to pipeline the creation.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH net-next] dev: add support of flag IFF_NOPROC
  2013-10-03 17:46 ` [PATCH net-next] dev: add support of " Stephen Hemminger
@ 2013-10-03 19:09   ` David Miller
  2013-10-04 12:07     ` Nicolas Dichtel
  2014-10-02 15:24     ` [RFC PATCH linux 0/2] Optimize network interfaces creation Nicolas Dichtel
  0 siblings, 2 replies; 31+ messages in thread
From: David Miller @ 2013-10-03 19:09 UTC (permalink / raw)
  To: stephen; +Cc: nicolas.dichtel, netdev

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 3 Oct 2013 10:46:27 -0700

> What about speeding up proc or sysfs? Or providing a bulk create/destroy.

+1 +1 +1

This will benefit more people than the just the envisioned users for
this IFF_NOPROC thing.

I really don't want to take the IFF_NOPROC approach.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH net-next] dev: add support of flag IFF_NOPROC
  2013-10-03 19:09   ` David Miller
@ 2013-10-04 12:07     ` Nicolas Dichtel
  2013-10-04 17:29       ` David Miller
  2014-10-02 15:24     ` [RFC PATCH linux 0/2] Optimize network interfaces creation Nicolas Dichtel
  1 sibling, 1 reply; 31+ messages in thread
From: Nicolas Dichtel @ 2013-10-04 12:07 UTC (permalink / raw)
  To: David Miller, stephen; +Cc: netdev

Le 03/10/2013 21:09, David Miller a écrit :
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Thu, 3 Oct 2013 10:46:27 -0700
>
>> What about speeding up proc or sysfs? Or providing a bulk create/destroy.
>
> +1 +1 +1
>
> This will benefit more people than the just the envisioned users for
> this IFF_NOPROC thing.
>
> I really don't want to take the IFF_NOPROC approach.
>
Of course optimizing /proc and /sysfs is a good option, but any optimizations
will never be as fast as disabling them for some well known netdevices.

Note also that the memory consumption is significantly less with this flag:
for 20000 dummy interfaces:
without the flag: 463,84Mo
with the flag: 297,45Mo
the gain is 166Mo (35%)

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH net-next] dev: add support of flag IFF_NOPROC
  2013-10-04 12:07     ` Nicolas Dichtel
@ 2013-10-04 17:29       ` David Miller
  0 siblings, 0 replies; 31+ messages in thread
From: David Miller @ 2013-10-04 17:29 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: stephen, netdev

From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 04 Oct 2013 14:07:00 +0200

> Of course optimizing /proc and /sysfs is a good option, but any
> optimizations
> will never be as fast as disabling them for some well known
> netdevices.
> 
> Note also that the memory consumption is significantly less with this
> flag:

It potentially breaks tools, it's a non-starter, sorry.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [RFC PATCH linux 0/2] Optimize network interfaces creation
  2013-10-03 19:09   ` David Miller
  2013-10-04 12:07     ` Nicolas Dichtel
@ 2014-10-02 15:24     ` Nicolas Dichtel
  2014-10-02 15:25       ` [RFC PATCH linux 1/2] proc_net: declare /proc/net as a directory Nicolas Dichtel
  2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
  1 sibling, 2 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-02 15:24 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso

When a lot of netdevices are created, one of the bottleneck is the creation
of proc entries. This serie aims to accelerate this part.

The first patch only prepares the second one.

I'm not sure against which tree this patch should be done. I've done it against
linux.git.

 fs/proc/generic.c  | 100 +++++++++++++++++++++++++++++++++++++++++------------
 fs/proc/internal.h |  49 +++++++++++++++++++++++---
 fs/proc/proc_net.c |   8 +++++
 fs/proc/root.c     |   5 +++
 4 files changed, 135 insertions(+), 27 deletions(-)

Comments are welcome.

Regards,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [RFC PATCH linux 1/2] proc_net: declare /proc/net as a directory
  2014-10-02 15:24     ` [RFC PATCH linux 0/2] Optimize network interfaces creation Nicolas Dichtel
@ 2014-10-02 15:25       ` Nicolas Dichtel
  2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
  1 sibling, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-02 15:25 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso, Thierry Herbelot,
	Nicolas Dichtel

From: Thierry Herbelot <thierry.herbelot@6wind.com>

The mode bits are copied from those of "proc_root".

This commit prepares the next patch.

Signed-off-by: Thierry Herbelot <thierry.herbelot@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 fs/proc/proc_net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..6fc308ec105c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -193,6 +193,7 @@ static __net_init int proc_net_ns_init(struct net *net)
 		goto out;
 
 	netd->data = net;
+	netd->mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	netd->nlink = 2;
 	netd->namelen = 3;
 	netd->parent = &proc_root;
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 15:24     ` [RFC PATCH linux 0/2] Optimize network interfaces creation Nicolas Dichtel
  2014-10-02 15:25       ` [RFC PATCH linux 1/2] proc_net: declare /proc/net as a directory Nicolas Dichtel
@ 2014-10-02 15:25       ` Nicolas Dichtel
  2014-10-02 16:46         ` Stephen Hemminger
                           ` (3 more replies)
  1 sibling, 4 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-02 15:25 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso, Thierry Herbelot,
	Nicolas Dichtel

From: Thierry Herbelot <thierry.herbelot@6wind.com>

The current implementation for the directories in /proc is using a single
linked list. This is slow when handling directories with large numbers of
entries (eg netdevice-related entries when lots of tunnels are opened).

This patch enables multiple linked lists. A hash based on the entry name is
used to select the linked list for one given entry.

The speed creation of netdevices is faster as shorter linked lists must be
scanned when adding a new netdevice.

Here are some numbers:

dummy30000.batch contains 30 000 times 'link add type dummy'.

Before the patch:
time ip -b dummy30000.batch
real    2m32.221s
user    0m0.380s
sys     2m30.610s

After the patch:
time ip -b dummy30000.batch
real    1m57.190s
user    0m0.350s
sys     1m56.120s

The single 'subdir' list head is replaced by a subdir hash table. The subdir
hash buckets are only allocated for directories. The number of hash buckets
is a compile-time parameter.

For all functions which handle directory entries, an additional check on the
directory nature of the dir entry ensures that pde_hash_buckets was allocated.
This check was not needed as subdir was present for all dir entries, whether
actual directories or simple files.

Signed-off-by: Thierry Herbelot <thierry.herbelot@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 fs/proc/generic.c  | 100 +++++++++++++++++++++++++++++++++++++++++------------
 fs/proc/internal.h |  49 +++++++++++++++++++++++---
 fs/proc/proc_net.c |   7 ++++
 fs/proc/root.c     |   5 +++
 4 files changed, 134 insertions(+), 27 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 317b72641ebf..c3af8c289c7e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -81,10 +81,13 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 	const char     		*cp = name, *next;
 	struct proc_dir_entry	*de;
 	unsigned int		len;
+	unsigned int		name_hash;
 
 	de = *ret;
 	if (!de)
 		de = &proc_root;
+	if (!S_ISDIR(de->mode))
+		return -EINVAL;
 
 	while (1) {
 		next = strchr(cp, '/');
@@ -92,7 +95,9 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 			break;
 
 		len = next - cp;
-		for (de = de->subdir; de ; de = de->next) {
+		name_hash = proc_pde_name_hash(cp, len);
+		for (de = de->pde_hash_buckets[name_hash]; de;
+		     de = de->next) {
 			if (proc_match(len, cp, de))
 				break;
 		}
@@ -180,10 +185,15 @@ static const struct inode_operations proc_link_inode_operations = {
 struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 		struct dentry *dentry)
 {
+	unsigned int name_hash;
 	struct inode *inode;
 
+	if (!S_ISDIR(de->mode))
+		return NULL;
 	spin_lock(&proc_subdir_lock);
-	for (de = de->subdir; de ; de = de->next) {
+	name_hash = proc_pde_name_hash(dentry->d_name.name,
+				       dentry->d_name.len);
+	for (de = de->pde_hash_buckets[name_hash]; de ; de = de->next) {
 		if (de->namelen != dentry->d_name.len)
 			continue;
 		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
@@ -219,18 +229,30 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
 int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		    struct dir_context *ctx)
 {
+	struct proc_dir_entry *dir;
+	unsigned int hash_idx = 0;
 	int i;
 
+	dir = de;
+	if (!S_ISDIR(dir->mode))
+		return -EINVAL;
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
 	spin_lock(&proc_subdir_lock);
-	de = de->subdir;
+	/* try first hash bucket */
+	de = de->pde_hash_buckets[0];
+
 	i = ctx->pos - 2;
 	for (;;) {
 		if (!de) {
-			spin_unlock(&proc_subdir_lock);
-			return 0;
+			/* try next hash bucket if one is availalable */
+			hash_idx = find_next_hash_bucket(dir, hash_idx + 1);
+			if (hash_idx == PROC_PDE_HASH_BUCKETS) {
+				spin_unlock(&proc_subdir_lock);
+				return 0;
+			}
+			de = dir->pde_hash_buckets[hash_idx];
 		}
 		if (!i)
 			break;
@@ -250,6 +272,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		spin_lock(&proc_subdir_lock);
 		ctx->pos++;
 		next = de->next;
+		if (!next) {
+			/* try next hash bucket if one is availalable */
+			hash_idx = find_next_hash_bucket(dir, hash_idx + 1);
+			if (hash_idx != PROC_PDE_HASH_BUCKETS)
+				next = dir->pde_hash_buckets[hash_idx];
+		}
 		pde_put(de);
 		de = next;
 	} while (de);
@@ -287,8 +315,12 @@ static const struct inode_operations proc_dir_inode_operations = {
 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
 	struct proc_dir_entry *tmp;
+	unsigned int name_hash;
 	int ret;
-	
+
+	if (!S_ISDIR(dir->mode))
+		return -EINVAL;
+
 	ret = proc_alloc_inum(&dp->low_ino);
 	if (ret)
 		return ret;
@@ -309,16 +341,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 
 	spin_lock(&proc_subdir_lock);
 
-	for (tmp = dir->subdir; tmp; tmp = tmp->next)
+	name_hash = proc_pde_name_hash(dp->name, strlen(dp->name));
+	for (tmp = dir->pde_hash_buckets[name_hash]; tmp; tmp = tmp->next)
 		if (strcmp(tmp->name, dp->name) == 0) {
 			WARN(1, "proc_dir_entry '%s/%s' already registered\n",
 				dir->name, dp->name);
 			break;
 		}
 
-	dp->next = dir->subdir;
+	dp->next = dir->pde_hash_buckets[name_hash];
 	dp->parent = dir;
-	dir->subdir = dp;
+	dir->pde_hash_buckets[name_hash] = dp;
 	spin_unlock(&proc_subdir_lock);
 
 	return 0;
@@ -349,6 +382,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
 	if (!ent)
 		goto out;
+	if (S_ISDIR(mode)) {
+		ent->pde_hash_buckets = kzalloc(PROC_PDE_HASH_SIZE, GFP_KERNEL);
+		if (!ent->pde_hash_buckets) {
+			kfree(ent);
+			ent = NULL;
+			goto out;
+		}
+	}
 
 	memcpy(ent->name, fn, qstr.len + 1);
 	ent->namelen = qstr.len;
@@ -471,6 +512,8 @@ static void free_proc_entry(struct proc_dir_entry *de)
 
 	if (S_ISLNK(de->mode))
 		kfree(de->data);
+	if (S_ISDIR(de->mode))
+		kfree(de->pde_hash_buckets);
 	kfree(de);
 }
 
@@ -488,7 +531,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	struct proc_dir_entry **p;
 	struct proc_dir_entry *de = NULL;
 	const char *fn = name;
-	unsigned int len;
+	unsigned int len, name_hash;
+
+	if (!S_ISDIR(parent->mode))
+		return;
 
 	spin_lock(&proc_subdir_lock);
 	if (__xlate_proc_name(name, &parent, &fn) != 0) {
@@ -497,7 +543,8 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	}
 	len = strlen(fn);
 
-	for (p = &parent->subdir; *p; p=&(*p)->next ) {
+	name_hash = proc_pde_name_hash(fn, len);
+	for (p = &parent->pde_hash_buckets[name_hash]; *p; p = &(*p)->next) {
 		if (proc_match(len, fn, *p)) {
 			de = *p;
 			*p = de->next;
@@ -516,9 +563,11 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	if (S_ISDIR(de->mode))
 		parent->nlink--;
 	de->nlink = 0;
-	WARN(de->subdir, "%s: removing non-empty directory "
-			 "'%s/%s', leaking at least '%s'\n", __func__,
-			 de->parent->name, de->name, de->subdir->name);
+	if (S_ISDIR(de->mode))
+		WARN(de->pde_hash_buckets[name_hash],
+		     "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+		     __func__, de->parent->name, de->name,
+		     de->pde_hash_buckets[name_hash]->name);
 	pde_put(de);
 }
 EXPORT_SYMBOL(remove_proc_entry);
@@ -528,7 +577,10 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 	struct proc_dir_entry **p;
 	struct proc_dir_entry *root = NULL, *de, *next;
 	const char *fn = name;
-	unsigned int len;
+	unsigned int len, name_hash, hash_idx;
+
+	if (!S_ISDIR(parent->mode))
+		return -EINVAL;
 
 	spin_lock(&proc_subdir_lock);
 	if (__xlate_proc_name(name, &parent, &fn) != 0) {
@@ -536,8 +588,9 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 		return -ENOENT;
 	}
 	len = strlen(fn);
+	name_hash = proc_pde_name_hash(fn, len);
 
-	for (p = &parent->subdir; *p; p=&(*p)->next ) {
+	for (p = &parent->pde_hash_buckets[name_hash]; *p; p = &(*p)->next) {
 		if (proc_match(len, fn, *p)) {
 			root = *p;
 			*p = root->next;
@@ -551,12 +604,15 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 	}
 	de = root;
 	while (1) {
-		next = de->subdir;
-		if (next) {
-			de->subdir = next->next;
-			next->next = NULL;
-			de = next;
-			continue;
+		if (S_ISDIR(de->mode)) {
+			hash_idx = find_first_hash_bucket(de);
+			if (hash_idx < PROC_PDE_HASH_BUCKETS) {
+				next = de->pde_hash_buckets[hash_idx];
+				de->pde_hash_buckets[hash_idx] = next->next;
+				next->next = NULL;
+				de = next;
+				continue;
+			}
 		}
 		spin_unlock(&proc_subdir_lock);
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..7c32e7821453 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -18,16 +18,28 @@
 struct ctl_table_header;
 struct mempolicy;
 
+#define PROC_PDE_SHIFT		6
+#define PROC_PDE_HASH_BUCKETS	(1 << PROC_PDE_SHIFT)
+#define PROC_PDE_HASH_MASK	(PROC_PDE_HASH_BUCKETS - 1)
+#define PROC_PDE_HASH_SIZE	(PROC_PDE_HASH_BUCKETS * \
+				 sizeof(struct proc_dir_entry *))
+
+static inline unsigned int proc_pde_name_hash(const unsigned char *name,
+					      const unsigned int len)
+{
+	return full_name_hash(name, len) & PROC_PDE_HASH_MASK;
+}
+
 /*
  * This is not completely implemented yet. The idea is to
  * create an in-memory tree (like the actual /proc filesystem
  * tree) of these proc_dir_entries, so that we can dynamically
  * add new files to /proc.
  *
- * The "next" pointer creates a linked list of one /proc directory,
- * while parent/subdir create the directory structure (every
- * /proc file has a parent, but "subdir" is NULL for all
- * non-directory entries).
+ * The "pde_hash_buckets" pointer is a hash table of linked lists for
+ * one /proc directory (every /proc file has a parent, but it is NULL
+ * for all non-directory entries). The linked lists are implemented using
+ * the "next" fields of the proc_dir_entry.
  */
 struct proc_dir_entry {
 	unsigned int low_ino;
@@ -38,7 +50,7 @@ struct proc_dir_entry {
 	loff_t size;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
-	struct proc_dir_entry *next, *parent, *subdir;
+	struct proc_dir_entry *next, *parent;
 	void *data;
 	atomic_t count;		/* use count */
 	atomic_t in_use;	/* number of callers into module in progress; */
@@ -46,10 +58,37 @@ struct proc_dir_entry {
 	struct completion *pde_unload_completion;
 	struct list_head pde_openers;	/* who did ->open, but not ->release */
 	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	struct proc_dir_entry **pde_hash_buckets; /* hash table for dirs */
 	u8 namelen;
 	char name[];
 };
 
+/* index for the next non-NULL bucket in the hash table */
+static inline unsigned int find_next_hash_bucket(const struct proc_dir_entry *pde,
+						 const unsigned int start)
+{
+	unsigned int i;
+
+	if (start >= PROC_PDE_HASH_BUCKETS)
+		return PROC_PDE_HASH_BUCKETS;
+
+	for (i = start;
+	     (i < PROC_PDE_HASH_BUCKETS) && (!pde->pde_hash_buckets[i]);
+	     i++)
+		;
+
+	if (!pde->pde_hash_buckets[i])
+		i = PROC_PDE_HASH_BUCKETS;
+
+	return i;
+}
+
+/* index for the first non-NULL bucket in the hash table */
+static inline int find_first_hash_bucket(const struct proc_dir_entry *pde)
+{
+	return find_next_hash_bucket(pde, 0);
+}
+
 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_show)(struct seq_file *m,
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 6fc308ec105c..da2e9f4533ed 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -191,6 +191,12 @@ static __net_init int proc_net_ns_init(struct net *net)
 	netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
 	if (!netd)
 		goto out;
+	netd->pde_hash_buckets = kzalloc(PROC_PDE_HASH_SIZE, GFP_KERNEL);
+	if (!netd->pde_hash_buckets) {
+		kfree(netd);
+		netd = NULL;
+		goto out;
+	}
 
 	netd->data = net;
 	netd->mode = S_IFDIR | S_IRUGO | S_IXUGO;
@@ -217,6 +223,7 @@ out:
 static __net_exit void proc_net_ns_exit(struct net *net)
 {
 	remove_proc_entry("stat", net->proc_net);
+	kfree(net->proc_net->pde_hash_buckets);
 	kfree(net->proc_net);
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 094e44d4a6be..bcd830465871 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -20,6 +20,7 @@
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 #include <linux/parser.h>
+#include <linux/slab.h>
 
 #include "internal.h"
 
@@ -166,6 +167,10 @@ void __init proc_root_init(void)
 {
 	int err;
 
+	proc_root.pde_hash_buckets = kzalloc(PROC_PDE_HASH_SIZE, GFP_KERNEL);
+	if (!proc_root.pde_hash_buckets)
+		return;
+
 	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
@ 2014-10-02 16:46         ` Stephen Hemminger
  2014-10-03 13:10           ` Nicolas Dichtel
  2014-10-02 17:28         ` Alexey Dobriyan
                           ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Stephen Hemminger @ 2014-10-02 16:46 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: netdev, linux-kernel, davem, ebiederm, akpm, adobriyan,
	rui.xiang, viro, oleg, gorcunov, kirill.shutemov, grant.likely,
	tytso, Thierry Herbelot

On Thu,  2 Oct 2014 17:25:01 +0200
Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:

> From: Thierry Herbelot <thierry.herbelot@6wind.com>
> 
> The current implementation for the directories in /proc is using a single
> linked list. This is slow when handling directories with large numbers of
> entries (eg netdevice-related entries when lots of tunnels are opened).
> 
> This patch enables multiple linked lists. A hash based on the entry name is
> used to select the linked list for one given entry.
> 
> The speed creation of netdevices is faster as shorter linked lists must be
> scanned when adding a new netdevice.
> 
> Here are some numbers:
> 
> dummy30000.batch contains 30 000 times 'link add type dummy'.
> 
> Before the patch:
> time ip -b dummy30000.batch
> real    2m32.221s
> user    0m0.380s
> sys     2m30.610s
> 
> After the patch:
> time ip -b dummy30000.batch
> real    1m57.190s
> user    0m0.350s
> sys     1m56.120s
> 
> The single 'subdir' list head is replaced by a subdir hash table. The subdir
> hash buckets are only allocated for directories. The number of hash buckets
> is a compile-time parameter.
> 
> For all functions which handle directory entries, an additional check on the
> directory nature of the dir entry ensures that pde_hash_buckets was allocated.
> This check was not needed as subdir was present for all dir entries, whether
> actual directories or simple files.
> 
> Signed-off-by: Thierry Herbelot <thierry.herbelot@6wind.com>
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

I think the speed up is a good idea and makes sense.
It would be better to use exist hlist macros for hash table rather than
open coding it.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
  2014-10-02 16:46         ` Stephen Hemminger
@ 2014-10-02 17:28         ` Alexey Dobriyan
  2014-10-03 13:07           ` Nicolas Dichtel
  2014-10-02 18:01         ` Eric W. Biederman
  2014-10-03 10:55         ` [RFC PATCH linux 2/2] fs/proc: use a hash table " Alexey Dobriyan
  3 siblings, 1 reply; 31+ messages in thread
From: Alexey Dobriyan @ 2014-10-02 17:28 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: netdev, linux-kernel, davem, ebiederm, akpm, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Thierry Herbelot

On Thu, Oct 02, 2014 at 05:25:01PM +0200, Nicolas Dichtel wrote:
> +static inline unsigned int proc_pde_name_hash(const unsigned char *name,
> +					      const unsigned int len)
> +{
> +	return full_name_hash(name, len) & PROC_PDE_HASH_MASK;
> +}

PDE already stands for "proc dir entry" :^)

	Alexey

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
  2014-10-02 16:46         ` Stephen Hemminger
  2014-10-02 17:28         ` Alexey Dobriyan
@ 2014-10-02 18:01         ` Eric W. Biederman
  2014-10-02 20:06           ` Alexey Dobriyan
  2014-10-03 13:09           ` Nicolas Dichtel
  2014-10-03 10:55         ` [RFC PATCH linux 2/2] fs/proc: use a hash table " Alexey Dobriyan
  3 siblings, 2 replies; 31+ messages in thread
From: Eric W. Biederman @ 2014-10-02 18:01 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: netdev, linux-kernel, davem, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Thierry Herbelot

Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:

> From: Thierry Herbelot <thierry.herbelot@6wind.com>
>
> The current implementation for the directories in /proc is using a single
> linked list. This is slow when handling directories with large numbers of
> entries (eg netdevice-related entries when lots of tunnels are opened).
>
> This patch enables multiple linked lists. A hash based on the entry name is
> used to select the linked list for one given entry.
>
> The speed creation of netdevices is faster as shorter linked lists must be
> scanned when adding a new netdevice.

Is the directory of primary concern /proc/net/dev/snmp6 ?

Unless I have configured my networking stack weird by mistake that
is the only directory under /proc/net that grows when we add an
interface.

I just want to make certain I am seeing the same things that you are
seeing.

I feel silly for overlooking this directory when the rest of the
scalability work was done.

> Here are some numbers:
>
> dummy30000.batch contains 30 000 times 'link add type dummy'.
>
> Before the patch:
> time ip -b dummy30000.batch
> real    2m32.221s
> user    0m0.380s
> sys     2m30.610s
>
> After the patch:
> time ip -b dummy30000.batch
> real    1m57.190s
> user    0m0.350s
> sys     1m56.120s
>
> The single 'subdir' list head is replaced by a subdir hash table. The subdir
> hash buckets are only allocated for directories. The number of hash buckets
> is a compile-time parameter.

That looks like a nice speed up.  A couple of things.

With sysfs and sysctl when faced this class of challenge we used an
rbtree instead of a hash table.  That should use less memory and scale
better.

I am concerned about a fixed sized hash table moving the location where
we fall off a cliff but not removing the cliff itself.

I suppose it would be possible to use the new fancy resizable hash
tables but previous work on sysctl and sysfs suggests that we don't look
up these entries sufficiently to require a hash table.  We just need a
data structure that doesn't fall over at scale, and the rbtrees seem to
do that very nicely.

> For all functions which handle directory entries, an additional check on the
> directory nature of the dir entry ensures that pde_hash_buckets was allocated.
> This check was not needed as subdir was present for all dir entries, whether
> actual directories or simple files.

That bit of logic seems reasonable.

Eric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 18:01         ` Eric W. Biederman
@ 2014-10-02 20:06           ` Alexey Dobriyan
  2014-10-02 21:07             ` Eric W. Biederman
  2014-10-03 13:09           ` Nicolas Dichtel
  1 sibling, 1 reply; 31+ messages in thread
From: Alexey Dobriyan @ 2014-10-02 20:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Nicolas Dichtel, netdev, linux-kernel, davem, akpm, rui.xiang,
	viro, oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Thierry Herbelot

On Thu, Oct 02, 2014 at 11:01:50AM -0700, Eric W. Biederman wrote:
> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
> 
> > From: Thierry Herbelot <thierry.herbelot@6wind.com>
> >
> > The current implementation for the directories in /proc is using a single
> > linked list. This is slow when handling directories with large numbers of
> > entries (eg netdevice-related entries when lots of tunnels are opened).
> >
> > This patch enables multiple linked lists. A hash based on the entry name is
> > used to select the linked list for one given entry.
> >
> > The speed creation of netdevices is faster as shorter linked lists must be
> > scanned when adding a new netdevice.
> 
> Is the directory of primary concern /proc/net/dev/snmp6 ?
> 
> Unless I have configured my networking stack weird by mistake that
> is the only directory under /proc/net that grows when we add an
> interface.
> 
> I just want to make certain I am seeing the same things that you are
> seeing.
> 
> I feel silly for overlooking this directory when the rest of the
> scalability work was done.

Slowdown comes from "duplicate name" check:

        for (tmp = dir->subdir; tmp; tmp = tmp->next)
                if (strcmp(tmp->name, dp->name) == 0) {
                        WARN(1, "proc_dir_entry '%s/%s' already registered\n",
                                dir->name, dp->name);
                        break;
                }

Removal can be made O(1) after switching to doubly-linked list.

	Alexey

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 20:06           ` Alexey Dobriyan
@ 2014-10-02 21:07             ` Eric W. Biederman
  2014-10-02 21:27               ` Stephen Hemminger
  2014-10-03  7:28               ` Nicolas Dichtel
  0 siblings, 2 replies; 31+ messages in thread
From: Eric W. Biederman @ 2014-10-02 21:07 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Nicolas Dichtel, netdev, linux-kernel, davem, akpm, rui.xiang,
	viro, oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Thierry Herbelot

Alexey Dobriyan <adobriyan@gmail.com> writes:

> On Thu, Oct 02, 2014 at 11:01:50AM -0700, Eric W. Biederman wrote:
>> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
>> 
>> > From: Thierry Herbelot <thierry.herbelot@6wind.com>
>> >
>> > The current implementation for the directories in /proc is using a single
>> > linked list. This is slow when handling directories with large numbers of
>> > entries (eg netdevice-related entries when lots of tunnels are opened).
>> >
>> > This patch enables multiple linked lists. A hash based on the entry name is
>> > used to select the linked list for one given entry.
>> >
>> > The speed creation of netdevices is faster as shorter linked lists must be
>> > scanned when adding a new netdevice.
>> 
>> Is the directory of primary concern /proc/net/dev/snmp6 ?
>> 
>> Unless I have configured my networking stack weird by mistake that
>> is the only directory under /proc/net that grows when we add an
>> interface.
>> 
>> I just want to make certain I am seeing the same things that you are
>> seeing.
>> 
>> I feel silly for overlooking this directory when the rest of the
>> scalability work was done.
>
> Slowdown comes from "duplicate name" check:
>
>         for (tmp = dir->subdir; tmp; tmp = tmp->next)
>                 if (strcmp(tmp->name, dp->name) == 0) {
>                         WARN(1, "proc_dir_entry '%s/%s' already registered\n",
>                                 dir->name, dp->name);
>                         break;
>                 }
>
> Removal can be made O(1) after switching to doubly-linked list.

Yes.  There is the however unfortunate fact that proc directories exist
to be used.  If we don't switch to a better data structure than a linked
list the actual use will then opening of the files under
/proc/net/dev/snmp6/ will become O(N^2).  Which doesn't help much
(assuming those files are good for something).

If those files aren't actually useful we should just make registering
them a config option.  Deprecate them strongly and let only people who
need extreme backwards compatibility enable them.

Alexey do you know that those files aren't useful?  Unless we know
otherwise we should make those files useful.

Eric


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 21:07             ` Eric W. Biederman
@ 2014-10-02 21:27               ` Stephen Hemminger
  2014-10-03  7:28               ` Nicolas Dichtel
  1 sibling, 0 replies; 31+ messages in thread
From: Stephen Hemminger @ 2014-10-02 21:27 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexey Dobriyan, Nicolas Dichtel, netdev, linux-kernel, davem,
	akpm, rui.xiang, viro, oleg, gorcunov, kirill.shutemov,
	grant.likely, tytso, Thierry Herbelot

On Thu, 02 Oct 2014 14:07:37 -0700
ebiederm@xmission.com (Eric W. Biederman) wrote:

> Alexey Dobriyan <adobriyan@gmail.com> writes:
> 
> > On Thu, Oct 02, 2014 at 11:01:50AM -0700, Eric W. Biederman wrote:
> >> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
> >> 
> >> > From: Thierry Herbelot <thierry.herbelot@6wind.com>
> >> >
> >> > The current implementation for the directories in /proc is using a single
> >> > linked list. This is slow when handling directories with large numbers of
> >> > entries (eg netdevice-related entries when lots of tunnels are opened).
> >> >
> >> > This patch enables multiple linked lists. A hash based on the entry name is
> >> > used to select the linked list for one given entry.
> >> >
> >> > The speed creation of netdevices is faster as shorter linked lists must be
> >> > scanned when adding a new netdevice.
> >> 
> >> Is the directory of primary concern /proc/net/dev/snmp6 ?
> >> 
> >> Unless I have configured my networking stack weird by mistake that
> >> is the only directory under /proc/net that grows when we add an
> >> interface.
> >> 
> >> I just want to make certain I am seeing the same things that you are
> >> seeing.
> >> 
> >> I feel silly for overlooking this directory when the rest of the
> >> scalability work was done.
> >
> > Slowdown comes from "duplicate name" check:
> >
> >         for (tmp = dir->subdir; tmp; tmp = tmp->next)
> >                 if (strcmp(tmp->name, dp->name) == 0) {
> >                         WARN(1, "proc_dir_entry '%s/%s' already registered\n",
> >                                 dir->name, dp->name);
> >                         break;
> >                 }
> >
> > Removal can be made O(1) after switching to doubly-linked list.
> 
> Yes.  There is the however unfortunate fact that proc directories exist
> to be used.  If we don't switch to a better data structure than a linked
> list the actual use will then opening of the files under
> /proc/net/dev/snmp6/ will become O(N^2).  Which doesn't help much
> (assuming those files are good for something).
> 
> If those files aren't actually useful we should just make registering
> them a config option.  Deprecate them strongly and let only people who
> need extreme backwards compatibility enable them.

Net-snmp uses them (agent/mibgroup/mibII/kernel_linux.c)


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 21:07             ` Eric W. Biederman
  2014-10-02 21:27               ` Stephen Hemminger
@ 2014-10-03  7:28               ` Nicolas Dichtel
  1 sibling, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-03  7:28 UTC (permalink / raw)
  To: Eric W. Biederman, Alexey Dobriyan
  Cc: netdev, linux-kernel, davem, akpm, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso, Thierry Herbelot

Le 02/10/2014 23:07, Eric W. Biederman a écrit :
> Alexey Dobriyan <adobriyan@gmail.com> writes:
>
>> On Thu, Oct 02, 2014 at 11:01:50AM -0700, Eric W. Biederman wrote:
>>> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
>>>
>>>> From: Thierry Herbelot <thierry.herbelot@6wind.com>
>>>>
>>>> The current implementation for the directories in /proc is using a single
>>>> linked list. This is slow when handling directories with large numbers of
>>>> entries (eg netdevice-related entries when lots of tunnels are opened).
>>>>
>>>> This patch enables multiple linked lists. A hash based on the entry name is
>>>> used to select the linked list for one given entry.
>>>>
>>>> The speed creation of netdevices is faster as shorter linked lists must be
>>>> scanned when adding a new netdevice.
>>>
>>> Is the directory of primary concern /proc/net/dev/snmp6 ?
>>>
>>> Unless I have configured my networking stack weird by mistake that
>>> is the only directory under /proc/net that grows when we add an
>>> interface.
>>>
>>> I just want to make certain I am seeing the same things that you are
>>> seeing.
>>>
>>> I feel silly for overlooking this directory when the rest of the
>>> scalability work was done.
>>
>> Slowdown comes from "duplicate name" check:
>>
>>          for (tmp = dir->subdir; tmp; tmp = tmp->next)
>>                  if (strcmp(tmp->name, dp->name) == 0) {
>>                          WARN(1, "proc_dir_entry '%s/%s' already registered\n",
>>                                  dir->name, dp->name);
>>                          break;
>>                  }
>>
>> Removal can be made O(1) after switching to doubly-linked list.
>
> Yes.  There is the however unfortunate fact that proc directories exist
> to be used.  If we don't switch to a better data structure than a linked
> list the actual use will then opening of the files under
> /proc/net/dev/snmp6/ will become O(N^2).  Which doesn't help much
> (assuming those files are good for something).
>
> If those files aren't actually useful we should just make registering
> them a config option.  Deprecate them strongly and let only people who
> need extreme backwards compatibility enable them.
>
> Alexey do you know that those files aren't useful?  Unless we know
> otherwise we should make those files useful.
This was proposed and nacked in a first version:
http://thread.gmane.org/gmane.linux.network/285840/


Regards,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
                           ` (2 preceding siblings ...)
  2014-10-02 18:01         ` Eric W. Biederman
@ 2014-10-03 10:55         ` Alexey Dobriyan
  2014-10-03 13:07           ` Nicolas Dichtel
  3 siblings, 1 reply; 31+ messages in thread
From: Alexey Dobriyan @ 2014-10-03 10:55 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: netdev, Linux Kernel, David Miller, Eric W. Biederman,
	Andrew Morton, rui.xiang, Al Viro, Oleg Nesterov,
	Cyrill Gorcunov, kirill.shutemov, Grant Likely,
	Theodore Ts'o, Thierry Herbelot

On Thu, Oct 2, 2014 at 6:25 PM, Nicolas Dichtel
<nicolas.dichtel@6wind.com> wrote:
> --- a/fs/proc/generic.c
> +++ b/fs/proc/generic.c
> @@ -81,10 +81,13 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,

> +       if (!S_ISDIR(de->mode))
> +               return -EINVAL;

There are way too many S_ISDIR checks.
In lookup and readdir, it is guaranteed that PDE is directory.

I'd say all of them aren't needed because non-directories have
->subdir = NULL and
directories have ->subdir != NULL which transforms into hashtable or
rbtree or whatever,
so you only need to guarantee only directories appear where they are
expected and
fearlessly use your new data structure traversing directories.

    Alexey

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 17:28         ` Alexey Dobriyan
@ 2014-10-03 13:07           ` Nicolas Dichtel
  0 siblings, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-03 13:07 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: netdev, linux-kernel, davem, ebiederm, akpm, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Thierry Herbelot

Le 02/10/2014 19:28, Alexey Dobriyan a écrit :
> On Thu, Oct 02, 2014 at 05:25:01PM +0200, Nicolas Dichtel wrote:
>> +static inline unsigned int proc_pde_name_hash(const unsigned char *name,
>> +					      const unsigned int len)
>> +{
>> +	return full_name_hash(name, len) & PROC_PDE_HASH_MASK;
>> +}
>
> PDE already stands for "proc dir entry" :^)
>
> 	Alexey
>
Oops, will fix it in v2.


Thank you,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-03 10:55         ` [RFC PATCH linux 2/2] fs/proc: use a hash table " Alexey Dobriyan
@ 2014-10-03 13:07           ` Nicolas Dichtel
  0 siblings, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-03 13:07 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: netdev, Linux Kernel, David Miller, Eric W. Biederman,
	Andrew Morton, rui.xiang, Al Viro, Oleg Nesterov,
	Cyrill Gorcunov, kirill.shutemov, Grant Likely,
	Theodore Ts'o, Thierry Herbelot

Le 03/10/2014 12:55, Alexey Dobriyan a écrit :
> On Thu, Oct 2, 2014 at 6:25 PM, Nicolas Dichtel
> <nicolas.dichtel@6wind.com> wrote:
>> --- a/fs/proc/generic.c
>> +++ b/fs/proc/generic.c
>> @@ -81,10 +81,13 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
>
>> +       if (!S_ISDIR(de->mode))
>> +               return -EINVAL;
>
> There are way too many S_ISDIR checks.
> In lookup and readdir, it is guaranteed that PDE is directory.
>
> I'd say all of them aren't needed because non-directories have
> ->subdir = NULL and
> directories have ->subdir != NULL which transforms into hashtable or
> rbtree or whatever,
> so you only need to guarantee only directories appear where they are
> expected and
> fearlessly use your new data structure traversing directories.
To be honest, they was put during the debug stage and I hesitated to remove
them at the end.
I will remove them!


Thank you,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 18:01         ` Eric W. Biederman
  2014-10-02 20:06           ` Alexey Dobriyan
@ 2014-10-03 13:09           ` Nicolas Dichtel
  2014-10-06 14:30             ` [PATCH linux v2 0/1] Optimize network interfaces creation Nicolas Dichtel
  1 sibling, 1 reply; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-03 13:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev, linux-kernel, davem, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Thierry Herbelot

Le 02/10/2014 20:01, Eric W. Biederman a écrit :
> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
>
>> From: Thierry Herbelot <thierry.herbelot@6wind.com>
>>
>> The current implementation for the directories in /proc is using a single
>> linked list. This is slow when handling directories with large numbers of
>> entries (eg netdevice-related entries when lots of tunnels are opened).
>>
>> This patch enables multiple linked lists. A hash based on the entry name is
>> used to select the linked list for one given entry.
>>
>> The speed creation of netdevices is faster as shorter linked lists must be
>> scanned when adding a new netdevice.
>
> Is the directory of primary concern /proc/net/dev/snmp6 ?
Yes.

>
> Unless I have configured my networking stack weird by mistake that
> is the only directory under /proc/net that grows when we add an
> interface.
>
> I just want to make certain I am seeing the same things that you are
> seeing.
>
> I feel silly for overlooking this directory when the rest of the
> scalability work was done.
>
>> Here are some numbers:
>>
>> dummy30000.batch contains 30 000 times 'link add type dummy'.
>>
>> Before the patch:
>> time ip -b dummy30000.batch
>> real    2m32.221s
>> user    0m0.380s
>> sys     2m30.610s
>>
>> After the patch:
>> time ip -b dummy30000.batch
>> real    1m57.190s
>> user    0m0.350s
>> sys     1m56.120s
>>
>> The single 'subdir' list head is replaced by a subdir hash table. The subdir
>> hash buckets are only allocated for directories. The number of hash buckets
>> is a compile-time parameter.
>
> That looks like a nice speed up.  A couple of things.
>
> With sysfs and sysctl when faced this class of challenge we used an
> rbtree instead of a hash table.  That should use less memory and scale
> better.
>
> I am concerned about a fixed sized hash table moving the location where
> we fall off a cliff but not removing the cliff itself.
>
> I suppose it would be possible to use the new fancy resizable hash
> tables but previous work on sysctl and sysfs suggests that we don't look
> up these entries sufficiently to require a hash table.  We just need a
> data structure that doesn't fall over at scale, and the rbtrees seem to
> do that very nicely.
Ok, I will have a look at it.



Thank you,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries
  2014-10-02 16:46         ` Stephen Hemminger
@ 2014-10-03 13:10           ` Nicolas Dichtel
  0 siblings, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-03 13:10 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, linux-kernel, davem, ebiederm, akpm, adobriyan,
	rui.xiang, viro, oleg, gorcunov, kirill.shutemov, grant.likely,
	tytso, Thierry Herbelot

Le 02/10/2014 18:46, Stephen Hemminger a écrit :
> On Thu,  2 Oct 2014 17:25:01 +0200
> Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:
>
>> From: Thierry Herbelot <thierry.herbelot@6wind.com>
>>
>> The current implementation for the directories in /proc is using a single
>> linked list. This is slow when handling directories with large numbers of
>> entries (eg netdevice-related entries when lots of tunnels are opened).
>>
>> This patch enables multiple linked lists. A hash based on the entry name is
>> used to select the linked list for one given entry.
>>
>> The speed creation of netdevices is faster as shorter linked lists must be
>> scanned when adding a new netdevice.
>>
>> Here are some numbers:
>>
>> dummy30000.batch contains 30 000 times 'link add type dummy'.
>>
>> Before the patch:
>> time ip -b dummy30000.batch
>> real    2m32.221s
>> user    0m0.380s
>> sys     2m30.610s
>>
>> After the patch:
>> time ip -b dummy30000.batch
>> real    1m57.190s
>> user    0m0.350s
>> sys     1m56.120s
>>
>> The single 'subdir' list head is replaced by a subdir hash table. The subdir
>> hash buckets are only allocated for directories. The number of hash buckets
>> is a compile-time parameter.
>>
>> For all functions which handle directory entries, an additional check on the
>> directory nature of the dir entry ensures that pde_hash_buckets was allocated.
>> This check was not needed as subdir was present for all dir entries, whether
>> actual directories or simple files.
>>
>> Signed-off-by: Thierry Herbelot <thierry.herbelot@6wind.com>
>> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
>
> I think the speed up is a good idea and makes sense.
> It would be better to use exist hlist macros for hash table rather than
> open coding it.
>
Right, I will rework this after looking at rbtree.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH linux v2 0/1] Optimize network interfaces creation
  2014-10-03 13:09           ` Nicolas Dichtel
@ 2014-10-06 14:30             ` Nicolas Dichtel
  2014-10-06 14:30               ` [PATCH linux v2 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
  0 siblings, 1 reply; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-06 14:30 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso

When a lot of netdevices are created, one of the bottleneck is the creation
of proc entries. This serie aims to accelerate this part.

I'm not sure against which tree this patch should be done. I've done it against
linux.git.

RFCv1 -> v2
 - use a red-black tree instead of a hash list

 fs/proc/generic.c  | 164 ++++++++++++++++++++++++++++++++++-------------------
 fs/proc/internal.h |  11 ++--
 fs/proc/proc_net.c |   1 +
 fs/proc/root.c     |   1 +
 4 files changed, 113 insertions(+), 64 deletions(-)

Comments are welcome.

Regards,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH linux v2 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-06 14:30             ` [PATCH linux v2 0/1] Optimize network interfaces creation Nicolas Dichtel
@ 2014-10-06 14:30               ` Nicolas Dichtel
  2014-10-06 22:14                 ` David Miller
  0 siblings, 1 reply; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-06 14:30 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso, Nicolas Dichtel

The current implementation for the directories in /proc is using a single
linked list. This is slow when handling directories with large numbers of
entries (eg netdevice-related entries when lots of tunnels are opened).

This patch replaces this linked list by a red-black tree.

Here are some numbers:

dummy30000.batch contains 30 000 times 'link add type dummy'.

Before the patch:
$ time ip -b dummy30000.batch
real	2m31.950s
user	0m0.440s
sys	2m21.440s
$ time rmmod dummy
real	1m35.764s
user	0m0.000s
sys	1m24.088s

After the patch:
$ time ip -b dummy30000.batch
real	2m0.874s
user	0m0.448s
sys	1m49.720s
$ time rmmod dummy
real	1m13.988s
user	0m0.000s
sys	1m1.008s

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 fs/proc/generic.c  | 164 ++++++++++++++++++++++++++++++++++-------------------
 fs/proc/internal.h |  11 ++--
 fs/proc/proc_net.c |   1 +
 fs/proc/root.c     |   1 +
 4 files changed, 113 insertions(+), 64 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 317b72641ebf..9f8fa1e5e8aa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -31,9 +31,81 @@ static DEFINE_SPINLOCK(proc_subdir_lock);
 
 static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
 {
-	if (de->namelen != len)
-		return 0;
-	return !memcmp(name, de->name, len);
+	if (len < de->namelen)
+		return -1;
+	if (len > de->namelen)
+		return 1;
+
+	return memcmp(name, de->name, len);
+}
+
+static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
+{
+	struct rb_node *node = rb_first(&dir->subdir);
+
+	if (node == NULL)
+		return NULL;
+
+	return rb_entry(node, struct proc_dir_entry, subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
+{
+	struct rb_node *node = rb_next(&dir->subdir_node);
+
+	if (node == NULL)
+		return NULL;
+
+	return rb_entry(node, struct proc_dir_entry, subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+					      const char *name,
+					      unsigned int len)
+{
+	struct rb_node *node = dir->subdir.rb_node;
+
+	while (node) {
+		struct proc_dir_entry *de = container_of(node,
+							 struct proc_dir_entry,
+							 subdir_node);
+		int result = proc_match(len, name, de);
+
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return de;
+	}
+	return NULL;
+}
+
+static bool pde_subdir_insert(struct proc_dir_entry *dir,
+			      struct proc_dir_entry *de)
+{
+	struct rb_root *root = &dir->subdir;
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct proc_dir_entry *this =
+			container_of(*new, struct proc_dir_entry, subdir_node);
+		int result = proc_match(de->namelen, de->name, this);
+
+		parent = *new;
+		if (result < 0)
+			new = &(*new)->rb_left;
+		else if (result > 0)
+			new = &(*new)->rb_right;
+		else
+			return false;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&de->subdir_node, parent, new);
+	rb_insert_color(&de->subdir_node, root);
+	return true;
 }
 
 static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
@@ -92,10 +164,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 			break;
 
 		len = next - cp;
-		for (de = de->subdir; de ; de = de->next) {
-			if (proc_match(len, cp, de))
-				break;
-		}
+		de = pde_subdir_find(de, cp, len);
 		if (!de) {
 			WARN(1, "name '%s'\n", name);
 			return -ENOENT;
@@ -183,19 +252,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 	struct inode *inode;
 
 	spin_lock(&proc_subdir_lock);
-	for (de = de->subdir; de ; de = de->next) {
-		if (de->namelen != dentry->d_name.len)
-			continue;
-		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-			pde_get(de);
-			spin_unlock(&proc_subdir_lock);
-			inode = proc_get_inode(dir->i_sb, de);
-			if (!inode)
-				return ERR_PTR(-ENOMEM);
-			d_set_d_op(dentry, &simple_dentry_operations);
-			d_add(dentry, inode);
-			return NULL;
-		}
+	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
+	if (de) {
+		pde_get(de);
+		spin_unlock(&proc_subdir_lock);
+		inode = proc_get_inode(dir->i_sb, de);
+		if (!inode)
+			return ERR_PTR(-ENOMEM);
+		d_set_d_op(dentry, &simple_dentry_operations);
+		d_add(dentry, inode);
+		return NULL;
 	}
 	spin_unlock(&proc_subdir_lock);
 	return ERR_PTR(-ENOENT);
@@ -225,7 +291,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		return 0;
 
 	spin_lock(&proc_subdir_lock);
-	de = de->subdir;
+	de = pde_subdir_first(de);
 	i = ctx->pos - 2;
 	for (;;) {
 		if (!de) {
@@ -234,7 +300,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		}
 		if (!i)
 			break;
-		de = de->next;
+		de = pde_subdir_next(de);
 		i--;
 	}
 
@@ -249,7 +315,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		}
 		spin_lock(&proc_subdir_lock);
 		ctx->pos++;
-		next = de->next;
+		next = pde_subdir_next(de);
 		pde_put(de);
 		de = next;
 	} while (de);
@@ -286,9 +352,8 @@ static const struct inode_operations proc_dir_inode_operations = {
 
 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
-	struct proc_dir_entry *tmp;
 	int ret;
-	
+
 	ret = proc_alloc_inum(&dp->low_ino);
 	if (ret)
 		return ret;
@@ -308,17 +373,10 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 	}
 
 	spin_lock(&proc_subdir_lock);
-
-	for (tmp = dir->subdir; tmp; tmp = tmp->next)
-		if (strcmp(tmp->name, dp->name) == 0) {
-			WARN(1, "proc_dir_entry '%s/%s' already registered\n",
-				dir->name, dp->name);
-			break;
-		}
-
-	dp->next = dir->subdir;
 	dp->parent = dir;
-	dir->subdir = dp;
+	if (pde_subdir_insert(dir, dp) == false)
+		WARN(1, "proc_dir_entry '%s/%s' already registered\n",
+		     dir->name, dp->name);
 	spin_unlock(&proc_subdir_lock);
 
 	return 0;
@@ -354,6 +412,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	ent->namelen = qstr.len;
 	ent->mode = mode;
 	ent->nlink = nlink;
+	ent->subdir = RB_ROOT;
 	atomic_set(&ent->count, 1);
 	spin_lock_init(&ent->pde_unload_lock);
 	INIT_LIST_HEAD(&ent->pde_openers);
@@ -485,7 +544,6 @@ void pde_put(struct proc_dir_entry *pde)
  */
 void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 {
-	struct proc_dir_entry **p;
 	struct proc_dir_entry *de = NULL;
 	const char *fn = name;
 	unsigned int len;
@@ -497,14 +555,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	}
 	len = strlen(fn);
 
-	for (p = &parent->subdir; *p; p=&(*p)->next ) {
-		if (proc_match(len, fn, *p)) {
-			de = *p;
-			*p = de->next;
-			de->next = NULL;
-			break;
-		}
-	}
+	de = pde_subdir_find(parent, fn, len);
+	if (de)
+		rb_erase(&de->subdir_node, &parent->subdir);
 	spin_unlock(&proc_subdir_lock);
 	if (!de) {
 		WARN(1, "name '%s'\n", name);
@@ -516,16 +569,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	if (S_ISDIR(de->mode))
 		parent->nlink--;
 	de->nlink = 0;
-	WARN(de->subdir, "%s: removing non-empty directory "
-			 "'%s/%s', leaking at least '%s'\n", __func__,
-			 de->parent->name, de->name, de->subdir->name);
+	WARN(pde_subdir_first(de),
+	     "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+	     __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
 	pde_put(de);
 }
 EXPORT_SYMBOL(remove_proc_entry);
 
 int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 {
-	struct proc_dir_entry **p;
 	struct proc_dir_entry *root = NULL, *de, *next;
 	const char *fn = name;
 	unsigned int len;
@@ -537,24 +589,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 	}
 	len = strlen(fn);
 
-	for (p = &parent->subdir; *p; p=&(*p)->next ) {
-		if (proc_match(len, fn, *p)) {
-			root = *p;
-			*p = root->next;
-			root->next = NULL;
-			break;
-		}
-	}
+	root = pde_subdir_find(parent, fn, len);
 	if (!root) {
 		spin_unlock(&proc_subdir_lock);
 		return -ENOENT;
 	}
+	rb_erase(&root->subdir_node, &parent->subdir);
+
 	de = root;
 	while (1) {
-		next = de->subdir;
+		next = pde_subdir_first(de);
 		if (next) {
-			de->subdir = next->next;
-			next->next = NULL;
+			rb_erase(&next->subdir_node, &de->subdir);
 			de = next;
 			continue;
 		}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..433557634c1b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -24,10 +24,9 @@ struct mempolicy;
  * tree) of these proc_dir_entries, so that we can dynamically
  * add new files to /proc.
  *
- * The "next" pointer creates a linked list of one /proc directory,
- * while parent/subdir create the directory structure (every
- * /proc file has a parent, but "subdir" is NULL for all
- * non-directory entries).
+ * parent/subdir are used for the directory structure (every /proc file has a
+ * parent, but "subdir" is empty for all non-directory entries).
+ * subdir_node is used to build the rb tree "subdir" of the parent.
  */
 struct proc_dir_entry {
 	unsigned int low_ino;
@@ -38,7 +37,9 @@ struct proc_dir_entry {
 	loff_t size;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
-	struct proc_dir_entry *next, *parent, *subdir;
+	struct proc_dir_entry *parent;
+	struct rb_root subdir;
+	struct rb_node subdir_node;
 	void *data;
 	atomic_t count;		/* use count */
 	atomic_t in_use;	/* number of callers into module in progress; */
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..1bde894bc624 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net)
 	if (!netd)
 		goto out;
 
+	netd->subdir = RB_ROOT;
 	netd->data = net;
 	netd->nlink = 2;
 	netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 094e44d4a6be..4eae849baedd 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -166,6 +166,7 @@ void __init proc_root_init(void)
 {
 	int err;
 
+	proc_root.subdir = RB_ROOT;
 	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH linux v2 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-06 14:30               ` [PATCH linux v2 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
@ 2014-10-06 22:14                 ` David Miller
  2014-10-07  9:02                   ` [PATCH linux v3 0/1] Optimize network interfaces creation Nicolas Dichtel
  0 siblings, 1 reply; 31+ messages in thread
From: David Miller @ 2014-10-06 22:14 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: netdev, linux-kernel, ebiederm, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso

From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon,  6 Oct 2014 16:30:34 +0200

> The current implementation for the directories in /proc is using a single
> linked list. This is slow when handling directories with large numbers of
> entries (eg netdevice-related entries when lots of tunnels are opened).
> 
> This patch replaces this linked list by a red-black tree.
> 
> Here are some numbers:
> 
> dummy30000.batch contains 30 000 times 'link add type dummy'.
 ...
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

FWIW:

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH linux v3 0/1] Optimize network interfaces creation
  2014-10-06 22:14                 ` David Miller
@ 2014-10-07  9:02                   ` Nicolas Dichtel
  2014-10-07  9:02                     ` [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
  0 siblings, 1 reply; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-07  9:02 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso

When a lot of netdevices are created, one of the bottleneck is the creation
of proc entries. This serie aims to accelerate this part.

I'm not sure against which tree this patch should be done. I've done it against
linux.git.

v2 -> v3
 - restore credit to Thierry Herbelot for the initial idea.

RFCv1 -> v2
 - use a red-black tree instead of a hash list

 fs/proc/generic.c  | 164 ++++++++++++++++++++++++++++++++++-------------------
 fs/proc/internal.h |  11 ++--
 fs/proc/proc_net.c |   1 +
 fs/proc/root.c     |   1 +
 4 files changed, 113 insertions(+), 64 deletions(-)

Comments are welcome.

Regards,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-07  9:02                   ` [PATCH linux v3 0/1] Optimize network interfaces creation Nicolas Dichtel
@ 2014-10-07  9:02                     ` Nicolas Dichtel
  2014-10-13 11:14                       ` Nicolas Dichtel
  2014-10-15 21:37                       ` Andrew Morton
  0 siblings, 2 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-07  9:02 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso, Nicolas Dichtel

The current implementation for the directories in /proc is using a single
linked list. This is slow when handling directories with large numbers of
entries (eg netdevice-related entries when lots of tunnels are opened).

This patch replaces this linked list by a red-black tree.

Here are some numbers:

dummy30000.batch contains 30 000 times 'link add type dummy'.

Before the patch:
$ time ip -b dummy30000.batch
real	2m31.950s
user	0m0.440s
sys	2m21.440s
$ time rmmod dummy
real	1m35.764s
user	0m0.000s
sys	1m24.088s

After the patch:
$ time ip -b dummy30000.batch
real	2m0.874s
user	0m0.448s
sys	1m49.720s
$ time rmmod dummy
real	1m13.988s
user	0m0.000s
sys	1m1.008s

The idea of improving this part was suggested by
Thierry Herbelot <thierry.herbelot@6wind.com>.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/generic.c  | 164 ++++++++++++++++++++++++++++++++++-------------------
 fs/proc/internal.h |  11 ++--
 fs/proc/proc_net.c |   1 +
 fs/proc/root.c     |   1 +
 4 files changed, 113 insertions(+), 64 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 317b72641ebf..9f8fa1e5e8aa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -31,9 +31,81 @@ static DEFINE_SPINLOCK(proc_subdir_lock);
 
 static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
 {
-	if (de->namelen != len)
-		return 0;
-	return !memcmp(name, de->name, len);
+	if (len < de->namelen)
+		return -1;
+	if (len > de->namelen)
+		return 1;
+
+	return memcmp(name, de->name, len);
+}
+
+static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
+{
+	struct rb_node *node = rb_first(&dir->subdir);
+
+	if (node == NULL)
+		return NULL;
+
+	return rb_entry(node, struct proc_dir_entry, subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
+{
+	struct rb_node *node = rb_next(&dir->subdir_node);
+
+	if (node == NULL)
+		return NULL;
+
+	return rb_entry(node, struct proc_dir_entry, subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+					      const char *name,
+					      unsigned int len)
+{
+	struct rb_node *node = dir->subdir.rb_node;
+
+	while (node) {
+		struct proc_dir_entry *de = container_of(node,
+							 struct proc_dir_entry,
+							 subdir_node);
+		int result = proc_match(len, name, de);
+
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return de;
+	}
+	return NULL;
+}
+
+static bool pde_subdir_insert(struct proc_dir_entry *dir,
+			      struct proc_dir_entry *de)
+{
+	struct rb_root *root = &dir->subdir;
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct proc_dir_entry *this =
+			container_of(*new, struct proc_dir_entry, subdir_node);
+		int result = proc_match(de->namelen, de->name, this);
+
+		parent = *new;
+		if (result < 0)
+			new = &(*new)->rb_left;
+		else if (result > 0)
+			new = &(*new)->rb_right;
+		else
+			return false;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&de->subdir_node, parent, new);
+	rb_insert_color(&de->subdir_node, root);
+	return true;
 }
 
 static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
@@ -92,10 +164,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 			break;
 
 		len = next - cp;
-		for (de = de->subdir; de ; de = de->next) {
-			if (proc_match(len, cp, de))
-				break;
-		}
+		de = pde_subdir_find(de, cp, len);
 		if (!de) {
 			WARN(1, "name '%s'\n", name);
 			return -ENOENT;
@@ -183,19 +252,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 	struct inode *inode;
 
 	spin_lock(&proc_subdir_lock);
-	for (de = de->subdir; de ; de = de->next) {
-		if (de->namelen != dentry->d_name.len)
-			continue;
-		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-			pde_get(de);
-			spin_unlock(&proc_subdir_lock);
-			inode = proc_get_inode(dir->i_sb, de);
-			if (!inode)
-				return ERR_PTR(-ENOMEM);
-			d_set_d_op(dentry, &simple_dentry_operations);
-			d_add(dentry, inode);
-			return NULL;
-		}
+	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
+	if (de) {
+		pde_get(de);
+		spin_unlock(&proc_subdir_lock);
+		inode = proc_get_inode(dir->i_sb, de);
+		if (!inode)
+			return ERR_PTR(-ENOMEM);
+		d_set_d_op(dentry, &simple_dentry_operations);
+		d_add(dentry, inode);
+		return NULL;
 	}
 	spin_unlock(&proc_subdir_lock);
 	return ERR_PTR(-ENOENT);
@@ -225,7 +291,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		return 0;
 
 	spin_lock(&proc_subdir_lock);
-	de = de->subdir;
+	de = pde_subdir_first(de);
 	i = ctx->pos - 2;
 	for (;;) {
 		if (!de) {
@@ -234,7 +300,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		}
 		if (!i)
 			break;
-		de = de->next;
+		de = pde_subdir_next(de);
 		i--;
 	}
 
@@ -249,7 +315,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
 		}
 		spin_lock(&proc_subdir_lock);
 		ctx->pos++;
-		next = de->next;
+		next = pde_subdir_next(de);
 		pde_put(de);
 		de = next;
 	} while (de);
@@ -286,9 +352,8 @@ static const struct inode_operations proc_dir_inode_operations = {
 
 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
-	struct proc_dir_entry *tmp;
 	int ret;
-	
+
 	ret = proc_alloc_inum(&dp->low_ino);
 	if (ret)
 		return ret;
@@ -308,17 +373,10 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 	}
 
 	spin_lock(&proc_subdir_lock);
-
-	for (tmp = dir->subdir; tmp; tmp = tmp->next)
-		if (strcmp(tmp->name, dp->name) == 0) {
-			WARN(1, "proc_dir_entry '%s/%s' already registered\n",
-				dir->name, dp->name);
-			break;
-		}
-
-	dp->next = dir->subdir;
 	dp->parent = dir;
-	dir->subdir = dp;
+	if (pde_subdir_insert(dir, dp) == false)
+		WARN(1, "proc_dir_entry '%s/%s' already registered\n",
+		     dir->name, dp->name);
 	spin_unlock(&proc_subdir_lock);
 
 	return 0;
@@ -354,6 +412,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	ent->namelen = qstr.len;
 	ent->mode = mode;
 	ent->nlink = nlink;
+	ent->subdir = RB_ROOT;
 	atomic_set(&ent->count, 1);
 	spin_lock_init(&ent->pde_unload_lock);
 	INIT_LIST_HEAD(&ent->pde_openers);
@@ -485,7 +544,6 @@ void pde_put(struct proc_dir_entry *pde)
  */
 void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 {
-	struct proc_dir_entry **p;
 	struct proc_dir_entry *de = NULL;
 	const char *fn = name;
 	unsigned int len;
@@ -497,14 +555,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	}
 	len = strlen(fn);
 
-	for (p = &parent->subdir; *p; p=&(*p)->next ) {
-		if (proc_match(len, fn, *p)) {
-			de = *p;
-			*p = de->next;
-			de->next = NULL;
-			break;
-		}
-	}
+	de = pde_subdir_find(parent, fn, len);
+	if (de)
+		rb_erase(&de->subdir_node, &parent->subdir);
 	spin_unlock(&proc_subdir_lock);
 	if (!de) {
 		WARN(1, "name '%s'\n", name);
@@ -516,16 +569,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	if (S_ISDIR(de->mode))
 		parent->nlink--;
 	de->nlink = 0;
-	WARN(de->subdir, "%s: removing non-empty directory "
-			 "'%s/%s', leaking at least '%s'\n", __func__,
-			 de->parent->name, de->name, de->subdir->name);
+	WARN(pde_subdir_first(de),
+	     "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+	     __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
 	pde_put(de);
 }
 EXPORT_SYMBOL(remove_proc_entry);
 
 int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 {
-	struct proc_dir_entry **p;
 	struct proc_dir_entry *root = NULL, *de, *next;
 	const char *fn = name;
 	unsigned int len;
@@ -537,24 +589,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 	}
 	len = strlen(fn);
 
-	for (p = &parent->subdir; *p; p=&(*p)->next ) {
-		if (proc_match(len, fn, *p)) {
-			root = *p;
-			*p = root->next;
-			root->next = NULL;
-			break;
-		}
-	}
+	root = pde_subdir_find(parent, fn, len);
 	if (!root) {
 		spin_unlock(&proc_subdir_lock);
 		return -ENOENT;
 	}
+	rb_erase(&root->subdir_node, &parent->subdir);
+
 	de = root;
 	while (1) {
-		next = de->subdir;
+		next = pde_subdir_first(de);
 		if (next) {
-			de->subdir = next->next;
-			next->next = NULL;
+			rb_erase(&next->subdir_node, &de->subdir);
 			de = next;
 			continue;
 		}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..433557634c1b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -24,10 +24,9 @@ struct mempolicy;
  * tree) of these proc_dir_entries, so that we can dynamically
  * add new files to /proc.
  *
- * The "next" pointer creates a linked list of one /proc directory,
- * while parent/subdir create the directory structure (every
- * /proc file has a parent, but "subdir" is NULL for all
- * non-directory entries).
+ * parent/subdir are used for the directory structure (every /proc file has a
+ * parent, but "subdir" is empty for all non-directory entries).
+ * subdir_node is used to build the rb tree "subdir" of the parent.
  */
 struct proc_dir_entry {
 	unsigned int low_ino;
@@ -38,7 +37,9 @@ struct proc_dir_entry {
 	loff_t size;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
-	struct proc_dir_entry *next, *parent, *subdir;
+	struct proc_dir_entry *parent;
+	struct rb_root subdir;
+	struct rb_node subdir_node;
 	void *data;
 	atomic_t count;		/* use count */
 	atomic_t in_use;	/* number of callers into module in progress; */
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..1bde894bc624 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net)
 	if (!netd)
 		goto out;
 
+	netd->subdir = RB_ROOT;
 	netd->data = net;
 	netd->nlink = 2;
 	netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 094e44d4a6be..4eae849baedd 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -166,6 +166,7 @@ void __init proc_root_init(void)
 {
 	int err;
 
+	proc_root.subdir = RB_ROOT;
 	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-07  9:02                     ` [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
@ 2014-10-13 11:14                       ` Nicolas Dichtel
  2014-10-14 19:30                         ` David Miller
  2014-10-14 19:56                         ` Eric W. Biederman
  2014-10-15 21:37                       ` Andrew Morton
  1 sibling, 2 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-13 11:14 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, ebiederm, akpm, adobriyan, rui.xiang, viro, oleg,
	gorcunov, kirill.shutemov, grant.likely, tytso, Linus Torvalds

Le 07/10/2014 11:02, Nicolas Dichtel a écrit :
> The current implementation for the directories in /proc is using a single
> linked list. This is slow when handling directories with large numbers of
> entries (eg netdevice-related entries when lots of tunnels are opened).
>
> This patch replaces this linked list by a red-black tree.
>
> Here are some numbers:
>
> dummy30000.batch contains 30 000 times 'link add type dummy'.
>
> Before the patch:
> $ time ip -b dummy30000.batch
> real	2m31.950s
> user	0m0.440s
> sys	2m21.440s
> $ time rmmod dummy
> real	1m35.764s
> user	0m0.000s
> sys	1m24.088s
>
> After the patch:
> $ time ip -b dummy30000.batch
> real	2m0.874s
> user	0m0.448s
> sys	1m49.720s
> $ time rmmod dummy
> real	1m13.988s
> user	0m0.000s
> sys	1m1.008s
>
> The idea of improving this part was suggested by
> Thierry Herbelot <thierry.herbelot@6wind.com>.
>
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
> Acked-by: David S. Miller <davem@davemloft.net>
> ---

I'm not sure who is in charge of taking this patch. Should I resend it to
someone else or is it already included in a tree?


Thank you,
Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-13 11:14                       ` Nicolas Dichtel
@ 2014-10-14 19:30                         ` David Miller
  2014-10-14 19:56                         ` Eric W. Biederman
  1 sibling, 0 replies; 31+ messages in thread
From: David Miller @ 2014-10-14 19:30 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: netdev, linux-kernel, ebiederm, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso, torvalds

From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 13 Oct 2014 13:14:51 +0200

> I'm not sure who is in charge of taking this patch. Should I resend
> it to someone else or is it already included in a tree?

Just want to make it clear that I don't intend to take this via
the networking tree.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-13 11:14                       ` Nicolas Dichtel
  2014-10-14 19:30                         ` David Miller
@ 2014-10-14 19:56                         ` Eric W. Biederman
  2014-10-15  9:02                           ` Nicolas Dichtel
  1 sibling, 1 reply; 31+ messages in thread
From: Eric W. Biederman @ 2014-10-14 19:56 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: netdev, linux-kernel, davem, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Linus Torvalds, Andrew Morton

Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:

> Le 07/10/2014 11:02, Nicolas Dichtel a écrit :
>> The current implementation for the directories in /proc is using a single
>> linked list. This is slow when handling directories with large numbers of
>> entries (eg netdevice-related entries when lots of tunnels are opened).
>>
>> This patch replaces this linked list by a red-black tree.
>>
>> Here are some numbers:
>>
>> dummy30000.batch contains 30 000 times 'link add type dummy'.
>>
>> Before the patch:
>> $ time ip -b dummy30000.batch
>> real	2m31.950s
>> user	0m0.440s
>> sys	2m21.440s
>> $ time rmmod dummy
>> real	1m35.764s
>> user	0m0.000s
>> sys	1m24.088s
>>
>> After the patch:
>> $ time ip -b dummy30000.batch
>> real	2m0.874s
>> user	0m0.448s
>> sys	1m49.720s
>> $ time rmmod dummy
>> real	1m13.988s
>> user	0m0.000s
>> sys	1m1.008s
>>
>> The idea of improving this part was suggested by
>> Thierry Herbelot <thierry.herbelot@6wind.com>.
>>
>> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
>> Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>
> I'm not sure who is in charge of taking this patch. Should I resend it to
> someone else or is it already included in a tree?

There are a couple of things going on here.

This patch came out at the beginning of the merge window which is a time
when everything that was ready and well tested ahead of time gets
merged.

Your numbers don't look too bad, so I expect this code is ready to go
(although I am a smidge disappointed in the small size of the
performance improvement), my quick read through earlier did not show
anything scary.   Do you have any idea why going from O(N^2) algorithm
to a O(NlogN) algorithm showed such a small performance improvement with
30,000 entries?

Normally proc is looked at by a group of folks me, Alexey Dobriyan, and
Al Viro all sort of tag team taking care of the proc infrastructure with
(except for Al) Andrew Morton typically taking the patches and merging
them.

I am currently in the middle of a move so I don't have the time to carry
this change or do much of anything until I am settled again.

What I would recommend is verifying your patch works against v3.18-rc1
at the begginning of next week and sending the code to Andrew Morton.

Eric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-14 19:56                         ` Eric W. Biederman
@ 2014-10-15  9:02                           ` Nicolas Dichtel
  0 siblings, 0 replies; 31+ messages in thread
From: Nicolas Dichtel @ 2014-10-15  9:02 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev, linux-kernel, davem, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Linus Torvalds

Le 14/10/2014 21:56, Eric W. Biederman a écrit :
> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
>
>> Le 07/10/2014 11:02, Nicolas Dichtel a écrit :
>>> The current implementation for the directories in /proc is using a single
>>> linked list. This is slow when handling directories with large numbers of
>>> entries (eg netdevice-related entries when lots of tunnels are opened).
>>>
>>> This patch replaces this linked list by a red-black tree.
>>>
>>> Here are some numbers:
>>>
>>> dummy30000.batch contains 30 000 times 'link add type dummy'.
>>>
>>> Before the patch:
>>> $ time ip -b dummy30000.batch
>>> real	2m31.950s
>>> user	0m0.440s
>>> sys	2m21.440s
>>> $ time rmmod dummy
>>> real	1m35.764s
>>> user	0m0.000s
>>> sys	1m24.088s
>>>
>>> After the patch:
>>> $ time ip -b dummy30000.batch
>>> real	2m0.874s
>>> user	0m0.448s
>>> sys	1m49.720s
>>> $ time rmmod dummy
>>> real	1m13.988s
>>> user	0m0.000s
>>> sys	1m1.008s
>>>
>>> The idea of improving this part was suggested by
>>> Thierry Herbelot <thierry.herbelot@6wind.com>.
>>>
>>> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
>>> Acked-by: David S. Miller <davem@davemloft.net>
> Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>> ---
>>
>> I'm not sure who is in charge of taking this patch. Should I resend it to
>> someone else or is it already included in a tree?
>
> There are a couple of things going on here.
>
> This patch came out at the beginning of the merge window which is a time
> when everything that was ready and well tested ahead of time gets
> merged.
>
> Your numbers don't look too bad, so I expect this code is ready to go
> (although I am a smidge disappointed in the small size of the
> performance improvement), my quick read through earlier did not show
> anything scary.   Do you have any idea why going from O(N^2) algorithm
> to a O(NlogN) algorithm showed such a small performance improvement with
> 30,000 entries?
perf top shows that a lot of time was wasted in vsscanf().
With my previous test file (dummy30000.batch), kernel needs to calculate
the interface name (iproute2 command was : 'link add type dummy'). Here is
another bench:

Files dummywithnameX.batch are created like this:
for i in `seq 1 X`; do echo "link add dummy$i type dummy" >> 
dummywithnameX.batch; done

Before the patch:
$ time ip -b dummywithname10000.batch
real	0m22.496s
user	0m0.196s
sys	0m5.924s
$ rmmod dummy
$ time ip -b dummywithname15000.batch
real	0m37.903s
user	0m0.348s
sys	0m13.160s
$ rmmod dummy
$ time ip -b dummywithname20000.batch
real	0m52.941s
user	0m0.396s
sys	0m20.164s
$ rmmod dummy
$ time ip -b dummywithname30000.batch
real	1m32.447s
user	0m0.660s
sys	0m43.436s

After the patch:
$ time ip -b dummywithname10000.batch
real	0m19.648s
user	0m0.180s
sys	0m2.260s
$ rmmod dummy
$ time ip -b dummywithname15000.batch
real	0m29.149s
user	0m0.256s
sys	0m3.616s
$ rmmod dummy
$ time ip -b dummywithname20000.batch
real	0m39.133s
user	0m0.440s
sys	0m4.756s
$ rmmod dummy
$ time ip -b dummywithname30000.batch
real    0m59.837s
user    0m0.520s
sys     0m7.780s

Thus an improvement of ~35% for 30k ifaces, but more importantly, after the
patch, it scales linearly.

perf top output after the patch:
      4.30%  [kernel]          [k] arch_local_irq_restore
      2.92%  [kernel]          [k] format_decode
      2.10%  [kernel]          [k] vsnprintf
      2.08%  [kernel]          [k] arch_local_irq_enable
      1.82%  [kernel]          [k] number.isra.1
      1.81%  [kernel]          [k] arch_local_irq_enable
      1.41%  [kernel]          [k] kmem_cache_alloc
      1.33%  [kernel]          [k] unmap_single_vma
      1.10%  [kernel]          [k] do_raw_spin_lock
      1.09%  [kernel]          [k] clear_page
      1.00%  [kernel]          [k] arch_local_irq_enable

>
> Normally proc is looked at by a group of folks me, Alexey Dobriyan, and
> Al Viro all sort of tag team taking care of the proc infrastructure with
> (except for Al) Andrew Morton typically taking the patches and merging
> them.
>
> I am currently in the middle of a move so I don't have the time to carry
> this change or do much of anything until I am settled again.
>
> What I would recommend is verifying your patch works against v3.18-rc1
> at the begginning of next week and sending the code to Andrew Morton.
Ok, thank you. I will do this.

Nicolas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
  2014-10-07  9:02                     ` [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
  2014-10-13 11:14                       ` Nicolas Dichtel
@ 2014-10-15 21:37                       ` Andrew Morton
  1 sibling, 0 replies; 31+ messages in thread
From: Andrew Morton @ 2014-10-15 21:37 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: netdev, linux-kernel, davem, ebiederm, adobriyan, rui.xiang,
	viro, oleg, gorcunov, kirill.shutemov, grant.likely, tytso

On Tue,  7 Oct 2014 11:02:39 +0200 Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:

> The current implementation for the directories in /proc is using a single
> linked list. This is slow when handling directories with large numbers of
> entries (eg netdevice-related entries when lots of tunnels are opened).
> 
> This patch replaces this linked list by a red-black tree.
> 
> ...
>
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -166,6 +166,7 @@ void __init proc_root_init(void)
>  {
>  	int err;
>  
> +	proc_root.subdir = RB_ROOT;
>  	proc_init_inodecache();
>  	err = register_filesystem(&proc_fs_type);
>  	if (err)

This can be done at compile time can't it?

--- a/fs/proc/root.c~fs-proc-use-a-rb-tree-for-the-directory-entries-fix
+++ a/fs/proc/root.c
@@ -166,7 +166,6 @@ void __init proc_root_init(void)
 {
 	int err;
 
-	proc_root.subdir = RB_ROOT;
 	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
@@ -252,6 +251,7 @@ struct proc_dir_entry proc_root = {
 	.proc_iops	= &proc_root_inode_operations, 
 	.proc_fops	= &proc_root_operations,
 	.parent		= &proc_root,
+	.subdir		= RB_ROOT,
 	.name		= "/proc",
 };
 
_


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2014-10-15 21:37 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-03 13:28 [PATCH net-next] dev: add support of flag IFF_NOPROC Nicolas Dichtel
2013-10-03 13:30 ` [PATCH iproute2 net-next-3.11] ip: add support of link " Nicolas Dichtel
2013-10-03 17:46 ` [PATCH net-next] dev: add support of " Stephen Hemminger
2013-10-03 19:09   ` David Miller
2013-10-04 12:07     ` Nicolas Dichtel
2013-10-04 17:29       ` David Miller
2014-10-02 15:24     ` [RFC PATCH linux 0/2] Optimize network interfaces creation Nicolas Dichtel
2014-10-02 15:25       ` [RFC PATCH linux 1/2] proc_net: declare /proc/net as a directory Nicolas Dichtel
2014-10-02 15:25       ` [RFC PATCH linux 2/2] fs/proc: use a hash table for the directory entries Nicolas Dichtel
2014-10-02 16:46         ` Stephen Hemminger
2014-10-03 13:10           ` Nicolas Dichtel
2014-10-02 17:28         ` Alexey Dobriyan
2014-10-03 13:07           ` Nicolas Dichtel
2014-10-02 18:01         ` Eric W. Biederman
2014-10-02 20:06           ` Alexey Dobriyan
2014-10-02 21:07             ` Eric W. Biederman
2014-10-02 21:27               ` Stephen Hemminger
2014-10-03  7:28               ` Nicolas Dichtel
2014-10-03 13:09           ` Nicolas Dichtel
2014-10-06 14:30             ` [PATCH linux v2 0/1] Optimize network interfaces creation Nicolas Dichtel
2014-10-06 14:30               ` [PATCH linux v2 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
2014-10-06 22:14                 ` David Miller
2014-10-07  9:02                   ` [PATCH linux v3 0/1] Optimize network interfaces creation Nicolas Dichtel
2014-10-07  9:02                     ` [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries Nicolas Dichtel
2014-10-13 11:14                       ` Nicolas Dichtel
2014-10-14 19:30                         ` David Miller
2014-10-14 19:56                         ` Eric W. Biederman
2014-10-15  9:02                           ` Nicolas Dichtel
2014-10-15 21:37                       ` Andrew Morton
2014-10-03 10:55         ` [RFC PATCH linux 2/2] fs/proc: use a hash table " Alexey Dobriyan
2014-10-03 13:07           ` Nicolas Dichtel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.