linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open
       [not found]         ` <20071126201717.450e44eb.akpm@linux-foundation.org>
@ 2007-11-27  7:09           ` Eric Dumazet
  2007-11-27  7:25             ` Valdis.Kletnieks
  2007-11-28  7:52             ` [PATCH, v2] " Eric Dumazet
  0 siblings, 2 replies; 4+ messages in thread
From: Eric Dumazet @ 2007-11-27  7:09 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Alan Cox, Linux kernel

[-- Attachment #1: Type: text/plain, Size: 1239 bytes --]

As changing NR_OPEN from 1024*1024 to 16*1024*1024 was considered a litle
bit dangerous, just let it default to 1024*1024 but adds a new sysctl
to let sysadmins change this value.

Thank you

[PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open

NR_OPEN (historically set to 1024*1024) actually forbids processes to open 
more than 1024*1024 handles.

Unfortunatly some production servers hit the not so 'ridiculously high value' 
of 1024*1024 file descriptors per process.

Changing NR_OPEN is not considered safe because of vmalloc space potential 
exhaust.

This patch introduces a new sysctl (/proc/sys/fs/nr_open) wich defaults to 
1024*1024, so that admins can decide to change this limit if their workload 
needs it.


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

  Documentation/filesystems/proc.txt |    8 ++++++++
  Documentation/sysctl/fs.txt        |   10 ++++++++++
  fs/file.c                          |    8 +++++---
  include/linux/fs.h                 |    2 +-
  kernel/sys.c                       |    2 +-
  kernel/sysctl.c                    |    8 ++++++++
  6 files changed, 33 insertions(+), 5 deletions(-)

[-- Attachment #2: nr_open.patch --]
[-- Type: text/plain, Size: 3799 bytes --]

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dec9945..9b390d7 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -989,6 +989,14 @@ nr_inodes
 Denotes the  number  of  inodes the system has allocated. This number will
 grow and shrink dynamically.
 
+nr_open
+-------
+
+Denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
 nr_free_inodes
 --------------
 
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index aa986a3..f992543 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -23,6 +23,7 @@ Currently, these files are in /proc/sys/fs:
 - inode-max
 - inode-nr
 - inode-state
+- nr_open
 - overflowuid
 - overflowgid
 - suid_dumpable
@@ -91,6 +92,15 @@ usage of file handles and you don't need to increase the maximum.
 
 ==============================================================
 
+nr_open:
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+==============================================================
+
 inode-max, inode-nr & inode-state:
 
 As with file handles, the kernel allocates the inode structures
diff --git a/fs/file.c b/fs/file.c
index c5575de..5110acb 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -24,6 +24,8 @@ struct fdtable_defer {
 	struct fdtable *next;
 };
 
+int sysctl_nr_open __read_mostly = 1024*1024;
+
 /*
  * We use this list to defer free fdtables that have vmalloced
  * sets/arrays. By keeping a per-cpu list, we avoid having to embed
@@ -147,8 +149,8 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
-	if (nr > NR_OPEN)
-		nr = NR_OPEN;
+	if (nr > sysctl_nr_open)
+		nr = sysctl_nr_open;
 
 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 	if (!fdt)
@@ -233,7 +235,7 @@ int expand_files(struct files_struct *files, int nr)
 	if (nr < fdt->max_fds)
 		return 0;
 	/* Can we expand? */
-	if (nr >= NR_OPEN)
+	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
 	/* All good, so we try */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a4..1cda287 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -21,7 +21,7 @@
 
 /* Fixed constants first: */
 #undef NR_OPEN
-#define NR_OPEN (1024*1024)	/* Absolute upper limit on fd num */
+extern int sysctl_nr_open;
 #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
 
 #define BLOCK_SIZE_BITS 10
diff --git a/kernel/sys.c b/kernel/sys.c
index d1fe71e..99c6ce1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1472,7 +1472,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
 	    !capable(CAP_SYS_RESOURCE))
 		return -EPERM;
-	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
+	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
 		return -EPERM;
 
 	retval = security_task_setrlimit(resource, &new_rlim);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0deed82..de22f7b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1127,6 +1127,14 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_open",
+		.data		= &sysctl_nr_open,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= FS_DENTRY,
 		.procname	= "dentry-state",
 		.data		= &dentry_stat,

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open
  2007-11-27  7:09           ` [PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open Eric Dumazet
@ 2007-11-27  7:25             ` Valdis.Kletnieks
  2007-11-27  7:58               ` Eric Dumazet
  2007-11-28  7:52             ` [PATCH, v2] " Eric Dumazet
  1 sibling, 1 reply; 4+ messages in thread
From: Valdis.Kletnieks @ 2007-11-27  7:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andrew Morton, Alan Cox, Linux kernel

[-- Attachment #1: Type: text/plain, Size: 1010 bytes --]

On Tue, 27 Nov 2007 08:09:19 +0100, Eric Dumazet said:

> Changing NR_OPEN is not considered safe because of vmalloc space potential 
> exhaust.

Verbiage about this point...


> +nr_open
> +-------
> +
> +Denotes the maximum number of file-handles a process can
> +allocate. Default value is 1024*1024 (1048576) which should be
> +enough for most machines. Actual limit depends on RLIMIT_NOFILE
> +resource limit.
> +

should probably be in here - can you add something of the form "Setting this
too high can cause vmalloc failures, especially on smaller-RAM machines",
and/or *say* how much RAM the default takes?  Sure, it's 1M entries, but
my tuning on a 2G-RAM machine will differ if these are byte-sized, or 128-byte
sized - one is off in a corner, the other is 1/16th of my entire memory.

Also, would it be useful to *lower* the value drastically, if you know a priori
that no process should get up to 1K file handles, much less 1M? Does that
buy me anything different than setting RLIMIT_NOFILE=1024?

[-- Attachment #2: Type: application/pgp-signature, Size: 226 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open
  2007-11-27  7:25             ` Valdis.Kletnieks
@ 2007-11-27  7:58               ` Eric Dumazet
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2007-11-27  7:58 UTC (permalink / raw)
  To: Valdis.Kletnieks; +Cc: Andrew Morton, Alan Cox, Linux kernel

Valdis.Kletnieks@vt.edu a écrit :
> On Tue, 27 Nov 2007 08:09:19 +0100, Eric Dumazet said:
> 
>> Changing NR_OPEN is not considered safe because of vmalloc space potential 
>> exhaust.
> 
> Verbiage about this point...
> 
> 
>> +nr_open
>> +-------
>> +
>> +Denotes the maximum number of file-handles a process can
>> +allocate. Default value is 1024*1024 (1048576) which should be
>> +enough for most machines. Actual limit depends on RLIMIT_NOFILE
>> +resource limit.
>> +
> 
> should probably be in here - can you add something of the form "Setting this
> too high can cause vmalloc failures, especially on smaller-RAM machines",
> and/or *say* how much RAM the default takes?  Sure, it's 1M entries, but
> my tuning on a 2G-RAM machine will differ if these are byte-sized, or 128-byte
> sized - one is off in a corner, the other is 1/16th of my entire memory.

vmalloc failures can already happen if you start 32 processes on i386 kernels, 
each of them wanting to open file handle number 600.000 (if their 
RLIMIT_NOFILE >= 600000)

fcntl(0, F_DUPFD, 600000);

We are not going to add warnings about vmalloc on every sysctl around there 
that could allow a root user to exhaust vmalloc space. This is a vmalloc issue 
on 32bit kernel, and quite frankly I never hit this limit.

If you take a look at vmalloc() implementation, fact that it uses a 'struct 
vm_struct *vmlist;' to track all active zones show that vmalloc() is not used 
that much.

> 
> Also, would it be useful to *lower* the value drastically, if you know a priori
> that no process should get up to 1K file handles, much less 1M? Does that
> buy me anything different than setting RLIMIT_NOFILE=1024?

NR_OPEN is the max value that RLIMIT_NOFILE can reach, nothing more.

You can set it to 256*1024*1024 or 4*1024 it wont change memory needs on your 
machine, unless you raise RLIMIT_NOFILE and one of your program leaks file 
handles, or really want to open simultaneously many of them.

Most programs wont open more than 500 files, so their file table is allocated 
via kmalloc()


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH, v2] get rid of NR_OPEN and introduce a sysctl_nr_open
  2007-11-27  7:09           ` [PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open Eric Dumazet
  2007-11-27  7:25             ` Valdis.Kletnieks
@ 2007-11-28  7:52             ` Eric Dumazet
  1 sibling, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2007-11-28  7:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Alan Cox, Linux kernel

[-- Attachment #1: Type: text/plain, Size: 1502 bytes --]

V1->V2

Some NR_OPEN were left unchanged for alpha, mips & sparc64.

As changing NR_OPEN from 1024*1024 to 16*1024*1024 was considered a litle
bit dangerous, just let it default to 1024*1024 but adds a new sysctl
to let sysadmins change this value.

Thank you

[PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open

NR_OPEN (historically set to 1024*1024) actually forbids processes to open 
more than 1024*1024 handles.

Unfortunatly some production servers hit the not so 'ridiculously high value' 
of 1024*1024 file descriptors per process.

Changing NR_OPEN is not considered safe because of vmalloc space potential 
exhaust.

This patch introduces a new sysctl (/proc/sys/fs/nr_open) wich defaults to 
1024*1024, so that admins can decide to change this limit if their workload 
needs it.


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

  Documentation/filesystems/proc.txt |    8 ++++++++
  Documentation/sysctl/fs.txt        |   10 ++++++++++
  arch/alpha/kernel/osf_sys.c        |    2 +-
  arch/mips/kernel/sysirix.c         |    2 +-
  arch/sparc64/solaris/fs.c          |    2 +-
  arch/sparc64/solaris/timod.c       |    6 ++++--
  fs/file.c                          |    8 +++++---
  include/linux/fs.h                 |    2 +-
  kernel/sys.c                       |    2 +-
  kernel/sysctl.c                    |    8 ++++++++
  10 files changed, 40 insertions(+), 10 deletions(-)

[-- Attachment #2: nr_open.patch --]
[-- Type: text/plain, Size: 5608 bytes --]

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dec9945..9b390d7 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -989,6 +989,14 @@ nr_inodes
 Denotes the  number  of  inodes the system has allocated. This number will
 grow and shrink dynamically.
 
+nr_open
+-------
+
+Denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
 nr_free_inodes
 --------------
 
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index aa986a3..f992543 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -23,6 +23,7 @@ Currently, these files are in /proc/sys/fs:
 - inode-max
 - inode-nr
 - inode-state
+- nr_open
 - overflowuid
 - overflowgid
 - suid_dumpable
@@ -91,6 +92,15 @@ usage of file handles and you don't need to increase the maximum.
 
 ==============================================================
 
+nr_open:
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+==============================================================
+
 inode-max, inode-nr & inode-state:
 
 As with file handles, the kernel allocates the inode structures
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 6413c5f..72f9a61 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -430,7 +430,7 @@ sys_getpagesize(void)
 asmlinkage unsigned long
 sys_getdtablesize(void)
 {
-	return NR_OPEN;
+	return sysctl_nr_open;
 }
 
 /*
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 4c477c7..22fd41e 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -356,7 +356,7 @@ asmlinkage int irix_syssgi(struct pt_regs *regs)
 			retval = NGROUPS_MAX;
 			goto out;
 		case 5:
-			retval = NR_OPEN;
+			retval = sysctl_nr_open;
 			goto out;
 		case 6:
 			retval = 1;
diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c
index 61be597..9311bfe 100644
--- a/arch/sparc64/solaris/fs.c
+++ b/arch/sparc64/solaris/fs.c
@@ -624,7 +624,7 @@ asmlinkage int solaris_ulimit(int cmd, int val)
 	case 3: /* UL_GMEMLIM */
 		return current->signal->rlim[RLIMIT_DATA].rlim_cur;
 	case 4: /* UL_GDESLIM */
-		return NR_OPEN;
+		return sysctl_nr_open;
 	}
 	return -EINVAL;
 }
diff --git a/arch/sparc64/solaris/timod.c b/arch/sparc64/solaris/timod.c
index a9d32ce..f53123c 100644
--- a/arch/sparc64/solaris/timod.c
+++ b/arch/sparc64/solaris/timod.c
@@ -859,7 +859,8 @@ asmlinkage int solaris_getmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
 
 	SOLD("entry");
 	lock_kernel();
-	if(fd >= NR_OPEN) goto out;
+	if (fd >= sysctl_nr_open)
+		goto out;
 
 	fdt = files_fdtable(current->files);
 	filp = fdt->fd[fd];
@@ -927,7 +928,8 @@ asmlinkage int solaris_putmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
 
 	SOLD("entry");
 	lock_kernel();
-	if(fd >= NR_OPEN) goto out;
+	if (fd >= sysctl_nr_open)
+		goto out;
 
 	fdt = files_fdtable(current->files);
 	filp = fdt->fd[fd];
diff --git a/fs/file.c b/fs/file.c
index c5575de..5110acb 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -24,6 +24,8 @@ struct fdtable_defer {
 	struct fdtable *next;
 };
 
+int sysctl_nr_open __read_mostly = 1024*1024;
+
 /*
  * We use this list to defer free fdtables that have vmalloced
  * sets/arrays. By keeping a per-cpu list, we avoid having to embed
@@ -147,8 +149,8 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
-	if (nr > NR_OPEN)
-		nr = NR_OPEN;
+	if (nr > sysctl_nr_open)
+		nr = sysctl_nr_open;
 
 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 	if (!fdt)
@@ -233,7 +235,7 @@ int expand_files(struct files_struct *files, int nr)
 	if (nr < fdt->max_fds)
 		return 0;
 	/* Can we expand? */
-	if (nr >= NR_OPEN)
+	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
 	/* All good, so we try */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a4..1cda287 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -21,7 +21,7 @@
 
 /* Fixed constants first: */
 #undef NR_OPEN
-#define NR_OPEN (1024*1024)	/* Absolute upper limit on fd num */
+extern int sysctl_nr_open;
 #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
 
 #define BLOCK_SIZE_BITS 10
diff --git a/kernel/sys.c b/kernel/sys.c
index d1fe71e..99c6ce1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1472,7 +1472,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
 	    !capable(CAP_SYS_RESOURCE))
 		return -EPERM;
-	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
+	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
 		return -EPERM;
 
 	retval = security_task_setrlimit(resource, &new_rlim);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0deed82..de22f7b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1127,6 +1127,14 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_open",
+		.data		= &sysctl_nr_open,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= FS_DENTRY,
 		.procname	= "dentry-state",
 		.data		= &dentry_stat,

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2007-11-28  7:52 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <200711200023.lAK0NtEe004292@imap1.linux-foundation.org>
     [not found] ` <20071120004832.393cc5a8@the-village.bc.nu>
     [not found]   ` <474291E7.5060700@cosmosbay.com>
     [not found]     ` <20071120001411.4db1eb04.akpm@linux-foundation.org>
     [not found]       ` <474420EE.4060603@cosmosbay.com>
     [not found]         ` <20071126201717.450e44eb.akpm@linux-foundation.org>
2007-11-27  7:09           ` [PATCH] get rid of NR_OPEN and introduce a sysctl_nr_open Eric Dumazet
2007-11-27  7:25             ` Valdis.Kletnieks
2007-11-27  7:58               ` Eric Dumazet
2007-11-28  7:52             ` [PATCH, v2] " Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).