From: Andrew Morton <akpm@linux-foundation.org>
To: Alexander Shishkin <virtuoso@slind.org>
Cc: Valdis.Kletnieks@vt.edu, linux-fsdevel@vger.kernel.org,
	viro@zeniv.linux.org.uk
Subject: Re: [RFC][PATCHv2] List per-process file descriptor consumption when hitting file-max
Date: Wed, 13 Jan 2010 14:12:39 -0800	[thread overview]
Message-ID: <20100113141239.b79aa7c3.akpm@linux-foundation.org> (raw)
In-Reply-To: <1263141242-23440-1-git-send-email-virtuoso@slind.org>

On Sun, 10 Jan 2010 18:34:02 +0200
Alexander Shishkin <virtuoso@slind.org> wrote:

> When a file descriptor limit is hit, display the top consumers of
> descriptors so that it is possible to identify and fix those which
> leak them.
> 
> Two new sysctl tunables are introduced:
>   * file-max-consumers -- number of processes to display (defaults
>     to 10);
>   * file-max-rate-limit -- time interval between subsequent dumps
>     (defaults to 10 seconds).
> 
> Signed-off-by: Alexander Shishkin <virtuoso@slind.org>
> CC: viro@zeniv.linux.org.uk
> CC: linux-fsdevel@vger.kernel.org
> ---
>  fs/file_table.c    |   90 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/fs.h |    5 +++
>  kernel/sysctl.c    |   14 ++++++++
>  3 files changed, 108 insertions(+), 1 deletions(-)
> 

Seems to be a lot of code for a fairly obscure requirement.  Can we
justify adding it?  Can't the operator use lsof(8)?

> +struct fd_consumer
> +{

Please use scripts/checkpatch.pl.

> +	struct task_struct *task;
> +	int fd_count;
> +};
> +
> +static int cmp_fd_consumers(const void *a, const void *b)
> +{
> +	const struct fd_consumer *x = a, *y = b;
> +
> +	return y->fd_count - x->fd_count;
> +}
> +
> +static void dump_fd_consumers(void)
> +{
> +	struct task_struct *p;
> +	struct files_struct *files;
> +	struct fdtable *fdt;
> +	int proc_limit = files_stat.max_consumers;
> +	int i, nproc;
> +	struct fd_consumer *procs, *tmp;

Please avoid variables called `tmp'.  Think up a meaningful name.

> +	if (!files_stat.max_consumers)
> +		return;
> +
> +	read_lock(&tasklist_lock);
> +
> +	/* build an array of per-task file descriptor usage */
> +	nproc = nr_processes();
> +	procs = kzalloc(nproc * sizeof(struct fd_consumer), GFP_KERNEL);

GFP_KERNEL allocation under tasklist_lock is a bug.  A bug which would
have been revealed during runtime testing had you employed the wise
advice in Documentation/SubmitChecklist!
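
Something along these lines (untested sketch; nr_processes() can still
grow between the allocation and the walk, so the loop has to stay
within the array it allocated) would avoid sleeping under the lock:

	nproc = nr_processes();
	procs = kzalloc(nproc * sizeof(*procs), GFP_KERNEL);
	if (!procs)
		return;

	read_lock(&tasklist_lock);
	n = 0;
	for_each_process(p) {
		if (n >= nproc)		/* don't run off the array */
			break;
		procs[n].task = p;
		/* ... count this task's fds as below ... */
		n++;
	}
	read_unlock(&tasklist_lock);

(GFP_ATOMIC would also "work", but having the dump fail because atomic
memory was tight is worse than simply allocating up front.)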

> +	if (!procs)
> +		goto out;
> +
> +	tmp = procs;
> +
> +	for_each_process(p) {
> +		tmp->task = p;
> +
> +		files = get_files_struct(p);

This does task_lock() under tasklist_lock.  Is that correct?

> +		if (!files)
> +			continue;
> +
> +		spin_lock(&files->file_lock);

Similarly, is this the correct ranking?

Also, is it a _new_ ranking?  Is there any place where we
simultaneously take tasklist_lock and file_lock?  If so, that's a bit
undesirable.

> +		fdt = files_fdtable(files);
> +
> +		/* we have to actually *count* the fds */
> +		for (tmp->fd_count = i = 0; i < fdt->max_fds; i++)
> +			tmp->fd_count += !!fcheck_files(files, i);
> +
> +		spin_unlock(&files->file_lock);
> +		put_files_struct(files);
> +
> +		tmp++;
> +	}
> +
> +	/* sort by number of used descriptor in descending order */
> +	sort(procs, nproc, sizeof(struct fd_consumer), cmp_fd_consumers, NULL);

This code can hold tasklist lock for quite a long time.
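
Taking the lock only for long enough to grab task references, and doing
the counting, sorting and printing outside it, would fix that (and the
lock-nesting questions above).  Rough, untested sketch:

	read_lock(&tasklist_lock);
	n = 0;
	for_each_process(p) {
		if (n >= nproc)
			break;
		get_task_struct(p);
		procs[n++].task = p;
	}
	read_unlock(&tasklist_lock);

	/* count fds, sort and printk here, outside tasklist_lock */

	for (i = 0; i < n; i++)
		put_task_struct(procs[i].task);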

> +	if (proc_limit > nproc)
> +		proc_limit = nproc;
> +
> +	/* output the 'proc_limit' first entries */
> +	for (i = 0, tmp = procs; i < proc_limit; i++, tmp++)
> +		printk(KERN_INFO "=> %s [%d]: open=%d\n", tmp->task->comm,
> +		       tmp->task->pid, tmp->fd_count);
> +
> +	kfree(procs);
> +
> +out:
> +	read_unlock(&tasklist_lock);
> +}
> +
>  /* Find an unused file structure and return a pointer to it.
>   * Returns NULL, if there are no more free file structures or
>   * we run out of memory.
> @@ -105,6 +182,7 @@ struct file *get_empty_filp(void)
>  	const struct cred *cred = current_cred();
>  	static int old_max;
>  	struct file * f;
> +	static unsigned long next_dump = 0;
>  
>  	/*
>  	 * Privileged users can go above max_files
> @@ -140,6 +218,14 @@ over:
>  	if (get_nr_files() > old_max) {
>  		printk(KERN_INFO "VFS: file-max limit %d reached\n",
>  					get_max_files());
> +
> +		/* dump the biggest file descriptor users */
> +		if (!next_dump || time_after(jiffies, next_dump)) {
> +			next_dump = jiffies + files_stat.rate_limit;
> +
> +			dump_fd_consumers();
> +		}

Use of __ratelimit() would be appropriate here.
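
linux/ratelimit.h gives you the state and the test for free - e.g.
(untested, names invented; the interval could still be fed from the
sysctl if you want to keep the tunable):

	#include <linux/ratelimit.h>

	static DEFINE_RATELIMIT_STATE(fd_dump_ratelimit, 10 * HZ, 1);
	...
	if (__ratelimit(&fd_dump_ratelimit))
		dump_fd_consumers();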

>  		old_max = get_nr_files();
>  	}
>  	goto fail;
> @@ -425,6 +511,8 @@ void __init files_init(unsigned long mempages)
>  	files_stat.max_files = n; 
>  	if (files_stat.max_files < NR_FILE)
>  		files_stat.max_files = NR_FILE;
> +
> +	files_stat.rate_limit = DUMP_RATE_LIMIT;
>  	files_defer_init();
>  	percpu_counter_init(&nr_files, 0);
>  } 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9147ca8..291beb3 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -36,6 +36,8 @@ struct files_stat_struct {
>  	int nr_files;		/* read only */
>  	int nr_free_files;	/* read only */
>  	int max_files;		/* tunable */
> +	int max_consumers;	/* tunable */
> +	unsigned long rate_limit;	/* tunable */
>  };
>  
>  struct inodes_stat_t {
> @@ -46,6 +48,9 @@ struct inodes_stat_t {
>  
>  
>  #define NR_FILE  8192	/* this can well be larger on a larger system */
> +#define NR_CONSUMERS 10 /* dump this many tasks when file-max is hit */
> +#define DUMP_RATE_LIMIT msecs_to_jiffies(10000) /* wait this long between
> +						   dumps */
>  
>  #define MAY_EXEC 1
>  #define MAY_WRITE 2
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 8a68b24..dfb08fc 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1325,6 +1325,20 @@ static struct ctl_table fs_table[] = {
>  		.proc_handler	= proc_dointvec,
>  	},
>  	{
> +		.procname	= "file-max-consumers",
> +		.data		= &files_stat.max_consumers,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "file-max-rate-limit",
> +		.data		= &files_stat.rate_limit,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
> +	},
> +	{
>  		.procname	= "nr_open",
>  		.data		= &sysctl_nr_open,
>  		.maxlen		= sizeof(int),

