* [patch 0/2] RCU: fix various latency/oom issues @ 2006-01-26 18:40 Dipankar Sarma 2006-01-26 18:41 ` [patch 1/2] rcu batch tuning Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-01-26 18:40 UTC (permalink / raw) To: Andrew Morton; +Cc: Linus Torvalds, Paul E.McKenney, linux-kernel Here are the patches that I have been testing that should help some of the latency and OOM issues (file limit) that we had discussed in the past. If the patchset looks ok, we should probably queue them up in -mm for some testing before merging. I have lightly tested the patchset on both ppc64 and x86_64 using ltp, dbench etc. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/2] rcu batch tuning 2006-01-26 18:40 [patch 0/2] RCU: fix various latency/oom issues Dipankar Sarma @ 2006-01-26 18:41 ` Dipankar Sarma 2006-01-26 18:42 ` [patch 2/2] fix file counting Dipankar Sarma 2006-01-26 19:33 ` [patch 1/2] rcu batch tuning Paul E. McKenney 0 siblings, 2 replies; 24+ messages in thread From: Dipankar Sarma @ 2006-01-26 18:41 UTC (permalink / raw) To: Andrew Morton; +Cc: Linus Torvalds, Paul E.McKenney, linux-kernel This patch adds new tunables for RCU queue and finished batches. There are two types of controls - number of completed RCU updates invoked in a batch (blimit) and monitoring for high rate of incoming RCUs on a cpu (qhimark, qlowmark). By default, the per-cpu batch limit is set to a small value. If the input RCU rate exceeds the high watermark, we do two things - force quiescent state on all cpus and set the batch limit of the CPU to INTMAX. Setting batch limit to INTMAX forces all finished RCUs to be processed in one shot. If we have more than INTMAX RCUs queued up, then we have bigger problems anyway. Once the incoming queued RCUs fall below the low watermark, the batch limit is set to the default. 
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> --- include/linux/rcupdate.h | 6 +++ kernel/rcupdate.c | 76 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 19 deletions(-) diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h --- linux-2.6.16-rc1-rcu/include/linux/rcupdate.h~rcu-batch-tuning 2006-01-25 00:09:54.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/include/linux/rcupdate.h 2006-01-25 01:07:39.000000000 +0530 @@ -98,13 +98,17 @@ struct rcu_data { long batch; /* Batch # for current RCU batch */ struct rcu_head *nxtlist; struct rcu_head **nxttail; - long count; /* # of queued items */ + long qlen; /* # of queued callbacks */ struct rcu_head *curlist; struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ int cpu; struct rcu_head barrier; +#ifdef CONFIG_SMP + long last_rs_qlen; /* qlen during the last resched */ +#endif }; DECLARE_PER_CPU(struct rcu_data, rcu_data); diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c --- linux-2.6.16-rc1-rcu/kernel/rcupdate.c~rcu-batch-tuning 2006-01-25 00:09:54.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/kernel/rcupdate.c 2006-01-25 23:08:03.000000000 +0530 @@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int maxbatch = 10000; +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; +#ifdef CONFIG_SMP +static int rsinterval = 1000; +#endif + +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { + 
rdp->last_rs_qlen = rdp->qlen; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif /** * call_rcu - Queue an RCU callback for invocation after a grace period. @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head * rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - - if (unlikely(++rdp->count > 10000)) - set_need_resched(); - + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } local_irq_restore(flags); } -static atomic_t rcu_barrier_cpu_count; -static struct semaphore rcu_barrier_sema; -static struct completion rcu_barrier_completion; - /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_hea rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - rdp->count++; -/* - * Should we directly call rcu_do_batch() here ? 
- * if (unlikely(rdp->count > 10000)) - * rcu_do_batch(rdp); - */ + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + local_irq_restore(flags); } @@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data next = rdp->donelist = list->next; list->func(list); list = next; - rdp->count--; - if (++count >= maxbatch) + rdp->qlen--; + if (++count >= rdp->blimit) break; } + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; if (!rdp->donelist) rdp->donetail = &rdp->donelist; else @@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; + rdp->blimit = blimit; } static void __devinit rcu_online_cpu(int cpu) @@ -567,7 +602,12 @@ void synchronize_kernel(void) synchronize_rcu(); } -module_param(maxbatch, int, 0); +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); +#ifdef CONFIG_SMP +module_param(rsinterval, int, 0); +#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ _ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-26 18:41 ` [patch 1/2] rcu batch tuning Dipankar Sarma @ 2006-01-26 18:42 ` Dipankar Sarma 2006-01-26 20:15 ` Eric Dumazet 2006-01-26 19:33 ` [patch 1/2] rcu batch tuning Paul E. McKenney 1 sibling, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-01-26 18:42 UTC (permalink / raw) To: Andrew Morton; +Cc: Linus Torvalds, Paul E.McKenney, linux-kernel, Al Viro The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. With RCU based file structure, consequent batched freeing and a test program like Serge's, we just speed this up and end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only 2000+ objects in filp cache. The following patch fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate atomic_t, nr_files, for now and all accesses to it are through get_nr_files() api. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to user. Counting files as and when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. 
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> --- fs/dcache.c | 2 - fs/file_table.c | 79 ++++++++++++++++++++++++++++++--------------------- include/linux/file.h | 2 - include/linux/fs.h | 2 + kernel/sysctl.c | 5 ++- net/unix/af_unix.c | 2 - 6 files changed, 56 insertions(+), 36 deletions(-) diff -puN fs/dcache.c~fix-file-counting fs/dcache.c --- linux-2.6.16-rc1-rcu/fs/dcache.c~fix-file-counting 2006-01-26 00:38:47.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/fs/dcache.c 2006-01-26 00:38:47.000000000 +0530 @@ -1734,7 +1734,7 @@ void __init vfs_caches_init(unsigned lon SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff -puN fs/file_table.c~fix-file-counting fs/file_table.c --- linux-2.6.16-rc1-rcu/fs/file_table.c~fix-file-counting 2006-01-26 00:38:47.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/fs/file_table.c 2006-01-26 01:23:28.000000000 +0530 @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include <linux/config.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> @@ -19,52 +20,67 @@ #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> - +#include <linux/sysctl.h> +#include <asm/atomic.h> + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! 
*/ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); + +static atomic_t nr_files __cacheline_aligned_in_smp; -static DEFINE_SPINLOCK(filp_count_lock); +static inline void file_free_rcu(struct rcu_head *head) +{ + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); +} -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files +static inline void file_free(struct file *f) +{ + atomic_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); +} + +/* + * Return the total number of open files in the system */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +int get_nr_files(void) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + return atomic_read(&nr_files); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + return files_stat.max_files; } -static inline void file_free_rcu(struct rcu_head *head) +EXPORT_SYMBOL(get_nr_files); +EXPORT_SYMBOL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); } - 
-static inline void file_free(struct file *f) +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return -ENOSYS; } +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,7 +94,7 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) goto over; @@ -97,14 +113,15 @@ struct file *get_empty_filp(void) rwlock_init(&f->f_owner.lock); /* f->f_version: 0 */ INIT_LIST_HEAD(&f->f_u.fu_list); + atomic_inc(&nr_files); return f; over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; diff -puN include/linux/file.h~fix-file-counting include/linux/file.h --- linux-2.6.16-rc1-rcu/include/linux/file.h~fix-file-counting 2006-01-26 00:38:47.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/include/linux/file.h 2006-01-26 01:17:36.000000000 +0530 @@ -60,8 +60,6 @@ extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); struct kmem_cache; -extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags); -extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern void free_fd_array(struct file **, int); diff -puN include/linux/fs.h~fix-file-counting include/linux/fs.h --- linux-2.6.16-rc1-rcu/include/linux/fs.h~fix-file-counting 2006-01-26 00:38:47.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/include/linux/fs.h 2006-01-26 
00:38:48.000000000 +0530 @@ -35,6 +35,8 @@ struct files_stat_struct { int max_files; /* tunable */ }; extern struct files_stat_struct files_stat; +extern int get_nr_files(void); +extern int get_max_files(void); struct inodes_stat_t { int nr_inodes; diff -puN kernel/sysctl.c~fix-file-counting kernel/sysctl.c --- linux-2.6.16-rc1-rcu/kernel/sysctl.c~fix-file-counting 2006-01-26 00:38:47.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/kernel/sysctl.c 2006-01-26 00:38:48.000000000 +0530 @@ -52,6 +52,9 @@ #include <linux/nfs_fs.h> #endif +extern int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -900,7 +903,7 @@ static ctl_table fs_table[] = { .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_files, }, { .ctl_name = FS_MAXFILE, diff -puN net/unix/af_unix.c~fix-file-counting net/unix/af_unix.c --- linux-2.6.16-rc1-rcu/net/unix/af_unix.c~fix-file-counting 2006-01-26 00:38:47.000000000 +0530 +++ linux-2.6.16-rc1-rcu-dipankar/net/unix/af_unix.c 2006-01-26 00:38:48.000000000 +0530 @@ -547,7 +547,7 @@ static struct sock * unix_create1(struct struct sock *sk = NULL; struct unix_sock *u; - if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + if (atomic_read(&unix_nr_socks) >= 2*get_max_files()) goto out; sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); _ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-26 18:42 ` [patch 2/2] fix file counting Dipankar Sarma @ 2006-01-26 20:15 ` Eric Dumazet 2006-01-27 22:54 ` Andrew Morton 0 siblings, 1 reply; 24+ messages in thread From: Eric Dumazet @ 2006-01-26 20:15 UTC (permalink / raw) To: dipankar Cc: Andrew Morton, Linus Torvalds, Paul E.McKenney, linux-kernel, Al Viro, Nick Piggin, Christoph Hellwig [-- Attachment #1: Type: text/plain, Size: 1912 bytes --] Dipankar Sarma a écrit : > The way we do file struct accounting is not very suitable for batched > freeing. For scalability reasons, file accounting was constructor/destructor > based. This meant that nr_files was decremented only when > the object was removed from the slab cache. This is > susceptible to slab fragmentation. With RCU based file structure, > consequent batched freeing and a test program like Serge's, > we just speed this up and end up with a very fragmented slab - > > llm22:~ # cat /proc/sys/fs/file-nr > 587730 0 758844 > > At the same time, I see only a 2000+ objects in filp cache. > The following patch I fixes this problem. > > This patch changes the file counting by removing the filp_count_lock. > Instead we use a separate atomic_t, nr_files, for now and all > accesses to it are through get_nr_files() api. In the sysctl > handler for nr_files, we populate files_stat.nr_files before returning > to user. > > Counting files as an when they are created and destroyed (as opposed > to inside slab) allows us to correctly count open files with RCU. > > Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> > --- Well... I am using a patch that seems slightly better : It removes the filp_count_lock as yours but introduces a percpu variable, and a lazy nr_files . (Its value can be off with a delta of +/- 16*num_possible_cpus() Pros : - No more delay caused by the slab dtor hack (like your patch) - No more ping pongs in SMP/NUMA machines because of the nr_files being constantly modified. 
- little memory footprint of NR_CPUS*4 bytes - No more cli/sti games (because of filp ctor/dtor spinlock_bh) - Moves files_stat definition in a new file (include/linux/files_stat.h) Cons : The lazy nr_files value in SMP, but who cares ? Do we want the exact value displayed in /proc/sys/fs/file-nr ? Thank you Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> [-- Attachment #2: nr_files.patch --] [-- Type: text/plain, Size: 7972 bytes --] --- linux-2.6.16-rc1-mm3/include/linux/fs.h 2006-01-26 21:42:54.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/include/linux/fs.h 2006-01-26 21:46:44.000000000 +0100 @@ -29,12 +29,6 @@ #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) /* And dynamically-tunable limits and defaults: */ -struct files_stat_struct { - int nr_files; /* read only */ - int nr_free_files; /* read only */ - int max_files; /* tunable */ -}; -extern struct files_stat_struct files_stat; struct inodes_stat_t { int nr_inodes; --- linux-2.6.16-rc1-mm3/include/linux/files_stat.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/include/linux/files_stat.h 2006-01-26 21:26:08.000000000 +0100 @@ -0,0 +1,34 @@ +#ifndef _LINUX_FILES_STAT_H +#define _LINUX_FILES_STAT_H +/* + * nr_files uses a percpu_counter to reduce atomic ops and/or cli/sti + * Using a percpu_counter is lazy, but we dont need exact values, + * and can consider max_files as a lazy limit, at least for SMP machines. + */ +#include <linux/percpu_counter.h> + +struct files_stat_struct { + /* + * WARNING : The 3 first fields of this structure + * must be of int type :_nr_files, nr_free_files, max_files + * in this exact order. 
+ * (see kernel/sysctl.c ) + */ + int _nr_files; /* rd only, updated when reading /proc/sys/fs/file-nr */ + int nr_free_files; /* rd only , obsolete */ + int max_files; /* rw (/proc/sys/fs/file-max) */ + atomic_t nr_files; +}; +extern struct files_stat_struct files_stat; + +static inline int get_nr_files(void) +{ + return atomic_read(&files_stat.nr_files); +} + +static inline int get_max_files(void) +{ + return files_stat.max_files; +} + +#endif /* _LINUX_FILES_STAT_H */ --- linux-2.6.16-rc1-mm3/fs/file_table.c 2006-01-26 21:42:53.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/fs/file_table.c 2006-01-26 21:48:27.000000000 +0100 @@ -19,10 +19,12 @@ #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> +#include <linux/files_stat.h> /* sysctl tunables... */ -struct files_stat_struct files_stat = { - .max_files = NR_FILE +__cacheline_aligned_in_smp struct files_stat_struct files_stat = { + .max_files = NR_FILE, + .nr_files = ATOMIC_INIT(0) }; EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ @@ -30,35 +32,33 @@ /* public. Not pretty! 
*/ __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); -static DEFINE_SPINLOCK(filp_count_lock); - -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files - */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) -{ - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); +#ifdef CONFIG_SMP +#define NR_FILES_THRESHOLD 16 +static DEFINE_PER_CPU(int, nr_files); +static void add_nr_files(int delta) +{ + int *local; + preempt_disable(); + local = &__get_cpu_var(nr_files); + *local += delta; + if (*local > NR_FILES_THRESHOLD || *local < -NR_FILES_THRESHOLD) { + atomic_add(*local, &files_stat.nr_files); + *local = 0; } + preempt_enable(); } - -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) -{ - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); -} +#define dec_nr_files() add_nr_files(-1) +#define inc_nr_files() add_nr_files(1) +#else +# define dec_nr_files() atomic_dec(&files_stat.nr_files) +# define inc_nr_files() atomic_inc(&files_stat.nr_files) +#endif static inline void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); kmem_cache_free(filp_cachep, f); + dec_nr_files(); } static inline void file_free(struct file *f) @@ -75,11 +75,13 @@ struct task_struct *tsk; static int old_max; struct file * f; + int nr_files; /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && + nr_files = get_nr_files(); + if (nr_files >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) goto over; @@ -91,6 +93,7 @@ if (security_file_alloc(f)) goto fail_sec; + 
inc_nr_files(); tsk = current; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_set(&f->f_count, 1); @@ -103,10 +106,10 @@ over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (nr_files > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); - old_max = files_stat.nr_files; + old_max = nr_files; } goto fail; --- linux-2.6.16-rc1-mm3/kernel/sysctl.c 2006-01-26 21:42:54.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/kernel/sysctl.c 2006-01-26 19:40:04.000000000 +0100 @@ -46,6 +46,7 @@ #include <linux/syscalls.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> +#include <linux/files_stat.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -131,6 +132,9 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dointvec_files_stat(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + static ctl_table root_table[]; static struct ctl_table_header root_table_header = { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; @@ -920,7 +924,7 @@ .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_files_stat, }, { .ctl_name = FS_MAXFILE, @@ -1754,6 +1758,12 @@ return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, NULL,NULL); } +static int proc_dointvec_files_stat(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat._nr_files = get_nr_files(); + return proc_dointvec(table,write,filp,buffer,lenp,ppos); +} #define OP_SET 0 #define OP_AND 1 --- linux-2.6.16-rc1-mm3/net/unix/af_unix.c 2006-01-26 21:42:54.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/net/unix/af_unix.c 2006-01-26 19:19:07.000000000 +0100 @@ -100,6 +100,7 @@ #include <linux/net.h> #include <linux/in.h> #include <linux/fs.h> +#include <linux/files_stat.h> #include <linux/slab.h> 
#include <asm/uaccess.h> #include <linux/skbuff.h> @@ -547,7 +548,7 @@ struct sock *sk = NULL; struct unix_sock *u; - if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + if (atomic_read(&unix_nr_socks) >= 2*get_max_files()) goto out; sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); --- linux-2.6.16-rc1-mm3/fs/dcache.c 2006-01-26 21:42:53.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/fs/dcache.c 2006-01-26 21:15:11.000000000 +0100 @@ -33,6 +33,7 @@ #include <linux/seqlock.h> #include <linux/swap.h> #include <linux/bootmem.h> +#include <linux/files_stat.h> /* #define DCACHE_DEBUG 1 */ @@ -1738,7 +1739,7 @@ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); --- linux-2.6.16-rc1-mm3/include/linux/file.h 2006-01-26 21:42:54.000000000 +0100 +++ linux-2.6.16-rc1-mm3-ed/include/linux/file.h 2006-01-26 21:45:23.000000000 +0100 @@ -79,9 +79,6 @@ extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); -struct kmem_cache; -extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags); -extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern void free_fd_array(struct file **, int); ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-26 20:15 ` Eric Dumazet @ 2006-01-27 22:54 ` Andrew Morton 2006-01-27 23:14 ` Paul E. McKenney 0 siblings, 1 reply; 24+ messages in thread From: Andrew Morton @ 2006-01-27 22:54 UTC (permalink / raw) To: Eric Dumazet Cc: dipankar, torvalds, paulmck, linux-kernel, viro, nickpiggin, hch Eric Dumazet <dada1@cosmosbay.com> wrote: > > > This patch changes the file counting by removing the filp_count_lock. > > Instead we use a separate atomic_t, nr_files, for now and all > > accesses to it are through get_nr_files() api. In the sysctl > > handler for nr_files, we populate files_stat.nr_files before returning > > to user. > > > > Counting files as an when they are created and destroyed (as opposed > > to inside slab) allows us to correctly count open files with RCU. > > > > Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> > > --- > > Well... > > I am using a patch that seems sligthly better : It removes the filp_count_lock > as yours but introduces a percpu variable, and a lazy nr_files . (Its value > can be off with a delta of +/- 16*num_possible_cpus() Yes, I think that is better. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-27 22:54 ` Andrew Morton @ 2006-01-27 23:14 ` Paul E. McKenney 2006-01-27 23:28 ` Andrew Morton 0 siblings, 1 reply; 24+ messages in thread From: Paul E. McKenney @ 2006-01-27 23:14 UTC (permalink / raw) To: Andrew Morton Cc: Eric Dumazet, dipankar, torvalds, linux-kernel, viro, nickpiggin, hch On Fri, Jan 27, 2006 at 02:54:12PM -0800, Andrew Morton wrote: > Eric Dumazet <dada1@cosmosbay.com> wrote: > > > > > This patch changes the file counting by removing the filp_count_lock. > > > Instead we use a separate atomic_t, nr_files, for now and all > > > accesses to it are through get_nr_files() api. In the sysctl > > > handler for nr_files, we populate files_stat.nr_files before returning > > > to user. > > > > > > Counting files as an when they are created and destroyed (as opposed > > > to inside slab) allows us to correctly count open files with RCU. > > > > > > Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> > > > --- > > > > Well... > > > > I am using a patch that seems sligthly better : It removes the filp_count_lock > > as yours but introduces a percpu variable, and a lazy nr_files . (Its value > > can be off with a delta of +/- 16*num_possible_cpus() > > Yes, I think that is better. I agree that Eric's approach likely improves performance on large systems due to decreased cache thrashing. However, the real problem is getting both good throughput and good latency in RCU callback processing, given Lee Revell's latency testing results. Once we get that in hand, then we should consider Eric's approach. It might be that Lee needs to use -rt CONFIG_PREEMPT_RT to get the latencies he needs, but given that he is seeing several milliseconds of delay, I hope we can do better. ;-) Thanx, Paul ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-27 23:14 ` Paul E. McKenney @ 2006-01-27 23:28 ` Andrew Morton 2006-01-28 18:42 ` Dipankar Sarma 2006-01-30 17:00 ` Dipankar Sarma 0 siblings, 2 replies; 24+ messages in thread From: Andrew Morton @ 2006-01-27 23:28 UTC (permalink / raw) To: paulmck; +Cc: dada1, dipankar, torvalds, linux-kernel, viro, nickpiggin, hch "Paul E. McKenney" <paulmck@us.ibm.com> wrote: > > > > I am using a patch that seems sligthly better : It removes the filp_count_lock > > > as yours but introduces a percpu variable, and a lazy nr_files . (Its value > > > can be off with a delta of +/- 16*num_possible_cpus() > > > > Yes, I think that is better. > > I agree that Eric's approach likely improves performance on large systems > due to decreased cache thrashing. However, the real problem is getting > both good throughput and good latency in RCU callback processing, given > Lee Revell's latency testing results. Once we get that in hand, then > we should consider Eric's approach. Dipankar's patch risks worsening large-SMP scalability, doesn't it? Putting an atomic op into the file_free path? And afaict it fixes up the skew in the nr_files accounting but we're still exposed to the risk of large amounts of memory getting chewed up due to RCU latencies? (And it forgot to initialise the atomic_t) (And has a couple of suspicious-looking module exports. We don't support CONFIG_PROC_FS=m). ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-27 23:28 ` Andrew Morton @ 2006-01-28 18:42 ` Dipankar Sarma 2006-01-28 18:51 ` Andrew Morton 2006-01-30 17:00 ` Dipankar Sarma 1 sibling, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-01-28 18:42 UTC (permalink / raw) To: Andrew Morton Cc: paulmck, dada1, torvalds, linux-kernel, viro, nickpiggin, hch On Fri, Jan 27, 2006 at 03:28:57PM -0800, Andrew Morton wrote: > "Paul E. McKenney" <paulmck@us.ibm.com> wrote: > > > > > > I am using a patch that seems sligthly better : It removes the filp_count_lock > > > > as yours but introduces a percpu variable, and a lazy nr_files . (Its value > > > > can be off with a delta of +/- 16*num_possible_cpus() > > > > > > Yes, I think that is better. > > > > I agree that Eric's approach likely improves performance on large systems > > due to decreased cache thrashing. However, the real problem is getting > > both good throughput and good latency in RCU callback processing, given > > Lee Revell's latency testing results. Once we get that in hand, then > > we should consider Eric's approach. Lee's problem now seems to be fixed with my rcu-rt-flush-list patch. So, at least for now we can keep that issue aside. > Dipankar's patch risks worsening large-SMP scalability, doesn't it? > Putting an atomic op into the file_free path? It does. However I didn't see any degradation running kernbench on a 4-way box a few months ago when I had originally written this patch. It would be nice if someone from SGI could give this a spin on a really big machine. It is not as if we didn't have costly operations. Under memory pressure, we would probably have been acquiring the file_count_lock quite often. That lock is now gone. That said, I would like to get a lazy percpu counter implementation done at some point in time. So far, I have just kept the things simple. 
> And afaict it fixes up the skew in the nr_files accounting but we're still > exposed to the risk of large amounts of memory getting chewed up due to RCU > latencies? That is hopefully fixed by my rcu-batch-tuning patch. I tested it using a program that does open()/close() of /dev/null in a tight loop. [x86_64 3.6GHz] > (And it forgot to initialise the atomic_t) I declared it static. Isn't that sufficient ? > (And has a couple of suspicious-looking module exports. We don't support > CONFIG_PROC_FS=m). Where ? All proc functions are wrapped in #ifdef CONFIG_PROC_FS and that is what I have done. What am I missing here ? Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-28 18:42 ` Dipankar Sarma @ 2006-01-28 18:51 ` Andrew Morton 2006-01-28 19:10 ` Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Andrew Morton @ 2006-01-28 18:51 UTC (permalink / raw) To: dipankar; +Cc: paulmck, dada1, torvalds, linux-kernel, viro, nickpiggin, hch Dipankar Sarma <dipankar@in.ibm.com> wrote: > > > (And it forgot to initialise the atomic_t) > > I declared it static. Isn't that sufficient ? ATOMIC_INIT(0); > > (And has a couple of suspicious-looking module exports. We don't support > > CONFIG_PROC_FS=m). > > Where ? +EXPORT_SYMBOL(get_nr_files); +EXPORT_SYMBOL(get_max_files); Why are these needed? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-28 18:51 ` Andrew Morton @ 2006-01-28 19:10 ` Dipankar Sarma 0 siblings, 0 replies; 24+ messages in thread From: Dipankar Sarma @ 2006-01-28 19:10 UTC (permalink / raw) To: Andrew Morton Cc: paulmck, dada1, torvalds, linux-kernel, viro, nickpiggin, hch On Sat, Jan 28, 2006 at 10:51:08AM -0800, Andrew Morton wrote: > Dipankar Sarma <dipankar@in.ibm.com> wrote: > > > > > (And it forgot to initialise the atomic_t) > > > > I declared it static. Isn't that sufficient ? > > ATOMIC_INIT(0); OK, I will put an explicit initializer for that static variable. > > > > (And has a couple of suspicious-looking module exports. We don't support > > > CONFIG_PROC_FS=m). > > > > Where ? > > +EXPORT_SYMBOL(get_nr_files); > +EXPORT_SYMBOL(get_max_files); > > Why are these needed? get_max_files() is needed by unix sockets which can be a module. IIRC, xfs needed get_nr_files() when I originally wrote this patch. There doesn't seem to be any in-tree use now, but files_stat was earlier exported and we should probably mark this for unexport at a later time. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-27 23:28 ` Andrew Morton 2006-01-28 18:42 ` Dipankar Sarma @ 2006-01-30 17:00 ` Dipankar Sarma 2006-01-31 10:33 ` Eric Dumazet 1 sibling, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-01-30 17:00 UTC (permalink / raw) To: Andrew Morton Cc: paulmck, dada1, torvalds, linux-kernel, viro, nickpiggin, hch On Fri, Jan 27, 2006 at 03:28:57PM -0800, Andrew Morton wrote: > "Paul E. McKenney" <paulmck@us.ibm.com> wrote: > > > > > > I am using a patch that seems sligthly better : It removes the filp_count_lock > > > > as yours but introduces a percpu variable, and a lazy nr_files . (Its value > > > > can be off with a delta of +/- 16*num_possible_cpus() > > > > > > Yes, I think that is better. > > > > I agree that Eric's approach likely improves performance on large systems > > due to decreased cache thrashing. However, the real problem is getting > > both good throughput and good latency in RCU callback processing, given > > Lee Revell's latency testing results. Once we get that in hand, then > > we should consider Eric's approach. > > Dipankar's patch risks worsening large-SMP scalability, doesn't it? > Putting an atomic op into the file_free path? 
Here are some numbers from a 32-way (HT) P4 xeon box with kernbench - 2.6.16-rc1 vanilla ------------------ kernbench 32 5 -m 2 Completed successfully Elapsed: 109.346s User: 1426.506s System: 178.71s CPU: 1467.6% 1426.63user 177.89system 1:49.07elapsed 1471%CPU (0avgtext+0avgdata 0maxresident)k 1425.48user 179.08system 1:49.09elapsed 1470%CPU (0avgtext+0avgdata 0maxresident)k 1427.02user 179.17system 1:49.82elapsed 1462%CPU (0avgtext+0avgdata 0maxresident)k 1427.13user 179.47system 1:50.00elapsed 1460%CPU (0avgtext+0avgdata 0maxresident)k 1426.27user 177.94system 1:48.75elapsed 1475%CPU (0avgtext+0avgdata 0maxresident)k 2.6.16-rc1+fix-file-counting.patch (with atomic_t nr_files) ----------------------------------------------------------- kernbench 32 5 -m 2 Completed successfully Elapsed: 109.338s User: 1427.554s System: 179.764s CPU: 1469.4% 1425.98user 179.32system 1:49.18elapsed 1470%CPU (0avgtext+0avgdata 0maxresident)k 1427.36user 179.60system 1:49.34elapsed 1469%CPU (0avgtext+0avgdata 0maxresident)k 1428.19user 179.92system 1:49.97elapsed 1462%CPU (0avgtext+0avgdata 0maxresident)k 1426.53user 180.45system 1:49.02elapsed 1473%CPU (0avgtext+0avgdata 0maxresident)k 1429.71user 179.53system 1:49.18elapsed 1473%CPU (0avgtext+0avgdata 0maxresident)k 2.6.16-rc1+nr-files.patch (with percpu counters [Eric's patch]) ------------------------------------------------ kernbench 32 5 -m 2 Completed successfully Elapsed: 109.38s User: 1427.684s System: 179.372s CPU: 1468.8% 1429.92user 179.37system 1:48.64elapsed 1481%CPU (0avgtext+0avgdata 0maxresident)k 1427.10user 178.60system 1:48.83elapsed 1475%CPU (0avgtext+0avgdata 0maxresident)k 1425.99user 177.75system 1:49.81elapsed 1460%CPU (0avgtext+0avgdata 0maxresident)k 1426.65user 181.33system 1:49.95elapsed 1462%CPU (0avgtext+0avgdata 0maxresident)k 1428.76user 179.81system 1:49.67elapsed 1466%CPU (0avgtext+0avgdata 0maxresident)k The difference between atomic_t nr_files and percpu counters is statistically insignificant. 
That said, there could be other workloads where we may get hit badly by the global atomic_t, or where big SGI boxen can be a problem. I will poke around with this a little bit more to see what else I can analyze. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-30 17:00 ` Dipankar Sarma @ 2006-01-31 10:33 ` Eric Dumazet 2006-01-31 20:19 ` Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Eric Dumazet @ 2006-01-31 10:33 UTC (permalink / raw) To: dipankar Cc: Andrew Morton, paulmck, torvalds, linux-kernel, viro, nickpiggin, hch Dipankar Sarma a écrit : > On Fri, Jan 27, 2006 at 03:28:57PM -0800, Andrew Morton wrote: >> "Paul E. McKenney" <paulmck@us.ibm.com> wrote: >>>>> I am using a patch that seems sligthly better : It removes the filp_count_lock >>>>> as yours but introduces a percpu variable, and a lazy nr_files . (Its value >>>>> can be off with a delta of +/- 16*num_possible_cpus() >>>> Yes, I think that is better. >>> I agree that Eric's approach likely improves performance on large systems >>> due to decreased cache thrashing. However, the real problem is getting >>> both good throughput and good latency in RCU callback processing, given >>> Lee Revell's latency testing results. Once we get that in hand, then >>> we should consider Eric's approach. >> Dipankar's patch risks worsening large-SMP scalability, doesn't it? >> Putting an atomic op into the file_free path? > > Here are some numbers from a 32-way (HT) P4 xeon box with kernbench - > Hi Dipankar, thank you very much for doing these tests. To have an idea of the number of files that are allocated/freed during a kernel build, I added 4 fields in struct files_stat. 4th field is the central atomic_t nr_files. 5th field is the counter of allocated files. 6th field is the counter of freed files. 7th field is the counter of changes done on central atomic_t. 
(Test machine is a dual Xeon HT, 2 GHz) # make clean # cat /proc/sys/fs/file-nr 119 0 206071 119 39169 39022 1131 [root@localhost linux-2.6.16-rc1-mm4]# time make -j4 bzImage 798.49user 82.36system 4:56.56elapsed 297%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (62major+7223263minor)pagefaults 0swaps # cat /proc/sys/fs/file-nr 153 0 206071 153 755767 755620 5119 So thats (755767-39169)/(4*60+56) = 2420 files opened per second. Number of changes on central atomic_t was : (5119-1131)/(4*60+56) = 13 per second. 13 atomic ops per second (instead of 2420*2 per second if I had one atomic_t as in your patch), this is certainly not something we can notice in a typical SMP machine... maybe on a big NUMA machine it could be different ? Eric ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 2/2] fix file counting 2006-01-31 10:33 ` Eric Dumazet @ 2006-01-31 20:19 ` Dipankar Sarma 0 siblings, 0 replies; 24+ messages in thread From: Dipankar Sarma @ 2006-01-31 20:19 UTC (permalink / raw) To: Eric Dumazet Cc: Andrew Morton, paulmck, torvalds, linux-kernel, nickpiggin, hch On Tue, Jan 31, 2006 at 11:33:49AM +0100, Eric Dumazet wrote: > Dipankar Sarma a écrit : > >>Putting an atomic op into the file_free path? > > > >Here are some numbers from a 32-way (HT) P4 xeon box with kernbench - > > > > Hi Dipankar, thank you very much for doing these tests. > > To have an idea of the number of files that are allocated/freed during a > kernel build, I added 4 fields in struct files_stat. > > # cat /proc/sys/fs/file-nr > 153 0 206071 153 755767 755620 5119 > > So thats (755767-39169)/(4*60+56) = 2420 files opened per second. > > Number of changes on central atomic_t was : > (5119-1131)/(4*60+56) = 13 per second. > > 13 atomic ops per second (instead of 2420*2 per second if I had one > atomic_t as in your patch), this is certainly not something we can notice > in a typical SMP machine... maybe on a big NUMA machine it could be > different ? Depends on what you call big :) The one I ran on is a 2-node NUMA 16(32)-way with HT. I can't find anything bigger than this in our server farm. On a 8-way ppc64 box too, there was no difference at all. Can you think of some non-microbenchish workload with more open/sec ? I privately pointed out this thread to John Hawkes from SGI to see if they care about the potential issues. In the mean while, I will investigate some more. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/2] rcu batch tuning 2006-01-26 18:41 ` [patch 1/2] rcu batch tuning Dipankar Sarma 2006-01-26 18:42 ` [patch 2/2] fix file counting Dipankar Sarma @ 2006-01-26 19:33 ` Paul E. McKenney 2006-01-26 19:42 ` Dipankar Sarma 1 sibling, 1 reply; 24+ messages in thread From: Paul E. McKenney @ 2006-01-26 19:33 UTC (permalink / raw) To: Dipankar Sarma; +Cc: Andrew Morton, Linus Torvalds, linux-kernel On Fri, Jan 27, 2006 at 12:11:27AM +0530, Dipankar Sarma wrote: > > This patch adds new tunables for RCU queue and finished batches. > There are two types of controls - number of completed RCU updates > invoked in a batch (blimit) and monitoring for high rate of > incoming RCUs on a cpu (qhimark, qlowmark). By default, > the per-cpu batch limit is set to a small value. If > the input RCU rate exceeds the high watermark, we do two things - > force quiescent state on all cpus and set the batch limit > of the CPU to INTMAX. Setting batch limit to INTMAX forces all > finished RCUs to be processed in one shot. If we have more than > INTMAX RCUs queued up, then we have bigger problems anyway. > Once the incoming queued RCUs fall below the low watermark, the batch limit > is set to the default. Looks good to me! We might have to have more sophisticated adjustment of blimit, but starting simple is definitely the right way to go. 
Thanx, Paul Acked-by: <paulmck@us.ibm.com> > Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> > --- > > > include/linux/rcupdate.h | 6 +++ > kernel/rcupdate.c | 76 +++++++++++++++++++++++++++++++++++------------ > 2 files changed, 63 insertions(+), 19 deletions(-) > > diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h > --- linux-2.6.16-rc1-rcu/include/linux/rcupdate.h~rcu-batch-tuning 2006-01-25 00:09:54.000000000 +0530 > +++ linux-2.6.16-rc1-rcu-dipankar/include/linux/rcupdate.h 2006-01-25 01:07:39.000000000 +0530 > @@ -98,13 +98,17 @@ struct rcu_data { > long batch; /* Batch # for current RCU batch */ > struct rcu_head *nxtlist; > struct rcu_head **nxttail; > - long count; /* # of queued items */ > + long qlen; /* # of queued callbacks */ > struct rcu_head *curlist; > struct rcu_head **curtail; > struct rcu_head *donelist; > struct rcu_head **donetail; > + long blimit; /* Upper limit on a processed batch */ > int cpu; > struct rcu_head barrier; > +#ifdef CONFIG_SMP > + long last_rs_qlen; /* qlen during the last resched */ > +#endif > }; > > DECLARE_PER_CPU(struct rcu_data, rcu_data); > diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c > --- linux-2.6.16-rc1-rcu/kernel/rcupdate.c~rcu-batch-tuning 2006-01-25 00:09:54.000000000 +0530 > +++ linux-2.6.16-rc1-rcu-dipankar/kernel/rcupdate.c 2006-01-25 23:08:03.000000000 +0530 > @@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d > > /* Fake initialization required by compiler */ > static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; > -static int maxbatch = 10000; > +static int blimit = 10; > +static int qhimark = 10000; > +static int qlowmark = 100; > +#ifdef CONFIG_SMP > +static int rsinterval = 1000; > +#endif > + > +static atomic_t rcu_barrier_cpu_count; > +static struct semaphore rcu_barrier_sema; > +static struct completion rcu_barrier_completion; > + > +#ifdef CONFIG_SMP > +static void force_quiescent_state(struct rcu_data *rdp, > + struct 
rcu_ctrlblk *rcp) > +{ > + int cpu; > + cpumask_t cpumask; > + set_need_resched(); > + if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { > + rdp->last_rs_qlen = rdp->qlen; > + /* > + * Don't send IPI to itself. With irqs disabled, > + * rdp->cpu is the current cpu. > + */ > + cpumask = rcp->cpumask; > + cpu_clear(rdp->cpu, cpumask); > + for_each_cpu_mask(cpu, cpumask) > + smp_send_reschedule(cpu); > + } > +} > +#else > +static inline void force_quiescent_state(struct rcu_data *rdp, > + struct rcu_ctrlblk *rcp) > +{ > + set_need_resched(); > +} > +#endif > > /** > * call_rcu - Queue an RCU callback for invocation after a grace period. > @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head * > rdp = &__get_cpu_var(rcu_data); > *rdp->nxttail = head; > rdp->nxttail = &head->next; > - > - if (unlikely(++rdp->count > 10000)) > - set_need_resched(); > - > + if (unlikely(++rdp->qlen > qhimark)) { > + rdp->blimit = INT_MAX; > + force_quiescent_state(rdp, &rcu_ctrlblk); > + } > local_irq_restore(flags); > } > > -static atomic_t rcu_barrier_cpu_count; > -static struct semaphore rcu_barrier_sema; > -static struct completion rcu_barrier_completion; > - > /** > * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. > * @head: structure to be used for queueing the RCU updates. > @@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_hea > rdp = &__get_cpu_var(rcu_bh_data); > *rdp->nxttail = head; > rdp->nxttail = &head->next; > - rdp->count++; > -/* > - * Should we directly call rcu_do_batch() here ? 
> - * if (unlikely(rdp->count > 10000)) > - * rcu_do_batch(rdp); > - */ > + > + if (unlikely(++rdp->qlen > qhimark)) { > + rdp->blimit = INT_MAX; > + force_quiescent_state(rdp, &rcu_bh_ctrlblk); > + } > + > local_irq_restore(flags); > } > > @@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data > next = rdp->donelist = list->next; > list->func(list); > list = next; > - rdp->count--; > - if (++count >= maxbatch) > + rdp->qlen--; > + if (++count >= rdp->blimit) > break; > } > + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) > + rdp->blimit = blimit; > if (!rdp->donelist) > rdp->donetail = &rdp->donelist; > else > @@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu > rdp->quiescbatch = rcp->completed; > rdp->qs_pending = 0; > rdp->cpu = cpu; > + rdp->blimit = blimit; > } > > static void __devinit rcu_online_cpu(int cpu) > @@ -567,7 +602,12 @@ void synchronize_kernel(void) > synchronize_rcu(); > } > > -module_param(maxbatch, int, 0); > +module_param(blimit, int, 0); > +module_param(qhimark, int, 0); > +module_param(qlowmark, int, 0); > +#ifdef CONFIG_SMP > +module_param(rsinterval, int, 0); > +#endif > EXPORT_SYMBOL_GPL(rcu_batches_completed); > EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ > EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ > > _ > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [patch 1/2] rcu batch tuning 2006-01-26 19:33 ` [patch 1/2] rcu batch tuning Paul E. McKenney @ 2006-01-26 19:42 ` Dipankar Sarma 0 siblings, 0 replies; 24+ messages in thread From: Dipankar Sarma @ 2006-01-26 19:42 UTC (permalink / raw) To: Paul E. McKenney; +Cc: Andrew Morton, Linus Torvalds, linux-kernel On Thu, Jan 26, 2006 at 11:33:17AM -0800, Paul E. McKenney wrote: > On Fri, Jan 27, 2006 at 12:11:27AM +0530, Dipankar Sarma wrote: > > > > This patch adds new tunables for RCU queue and finished batches. > > There are two types of controls - number of completed RCU updates > > invoked in a batch (blimit) and monitoring for high rate of > > incoming RCUs on a cpu (qhimark, qlowmark). By default, > > the per-cpu batch limit is set to a small value. If > > the input RCU rate exceeds the high watermark, we do two things - > > force quiescent state on all cpus and set the batch limit > > of the CPU to INTMAX. Setting batch limit to INTMAX forces all > > finished RCUs to be processed in one shot. If we have more than > > INTMAX RCUs queued up, then we have bigger problems anyway. > > Once the incoming queued RCUs fall below the low watermark, the batch limit > > is set to the default. > > Looks good to me! We might have to have more sophisticated adjustment > of blimit, but starting simple is definitely the right way to go. Yes. Once I am set up to do latency measurements, I will look at the possible need for more sophisticated adjustment of blimit. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH 0/2] RCU updates @ 2006-02-17 15:41 Dipankar Sarma 2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-02-17 15:41 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney Here are the patches that I have been testing that should help some of the latency and OOM issues (file limit) that we had discussed in the past. If the patchset looks ok, we should queue them up in -mm for some testing before merging. I have lightly tested the patchset on both ppc64 and x86_64 using ltp, dbench etc. Update since the last time I published - file counting stuff uses percpu_counter. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 1/2] rcu batch tuning 2006-02-17 15:41 [PATCH 0/2] RCU updates Dipankar Sarma @ 2006-02-17 15:43 ` Dipankar Sarma 2006-02-17 15:46 ` [PATCH 2/2] fix file counting Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-02-17 15:43 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney This patch adds new tunables for RCU queue and finished batches. There are two types of controls - number of completed RCU updates invoked in a batch (blimit) and monitoring for high rate of incoming RCUs on a cpu (qhimark, qlowmark). By default, the per-cpu batch limit is set to a small value. If the input RCU rate exceeds the high watermark, we do two things - force quiescent state on all cpus and set the batch limit of the CPU to INTMAX. Setting batch limit to INTMAX forces all finished RCUs to be processed in one shot. If we have more than INTMAX RCUs queued up, then we have bigger problems anyway. Once the incoming queued RCUs fall below the low watermark, the batch limit is set to the default. 
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> --- include/linux/rcupdate.h | 6 +++ kernel/rcupdate.c | 76 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 19 deletions(-) diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h --- linux-2.6.16-rc3-rcu/include/linux/rcupdate.h~rcu-batch-tuning 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/include/linux/rcupdate.h 2006-02-15 16:00:02.000000000 +0530 @@ -98,13 +98,17 @@ struct rcu_data { long batch; /* Batch # for current RCU batch */ struct rcu_head *nxtlist; struct rcu_head **nxttail; - long count; /* # of queued items */ + long qlen; /* # of queued callbacks */ struct rcu_head *curlist; struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ int cpu; struct rcu_head barrier; +#ifdef CONFIG_SMP + long last_rs_qlen; /* qlen during the last resched */ +#endif }; DECLARE_PER_CPU(struct rcu_data, rcu_data); diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c --- linux-2.6.16-rc3-rcu/kernel/rcupdate.c~rcu-batch-tuning 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/kernel/rcupdate.c 2006-02-15 16:00:02.000000000 +0530 @@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int maxbatch = 10000; +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; +#ifdef CONFIG_SMP +static int rsinterval = 1000; +#endif + +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { + 
rdp->last_rs_qlen = rdp->qlen; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif /** * call_rcu - Queue an RCU callback for invocation after a grace period. @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head * rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - - if (unlikely(++rdp->count > 10000)) - set_need_resched(); - + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } local_irq_restore(flags); } -static atomic_t rcu_barrier_cpu_count; -static struct semaphore rcu_barrier_sema; -static struct completion rcu_barrier_completion; - /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_hea rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - rdp->count++; -/* - * Should we directly call rcu_do_batch() here ? 
- * if (unlikely(rdp->count > 10000)) - * rcu_do_batch(rdp); - */ + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + local_irq_restore(flags); } @@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data next = rdp->donelist = list->next; list->func(list); list = next; - rdp->count--; - if (++count >= maxbatch) + rdp->qlen--; + if (++count >= rdp->blimit) break; } + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; if (!rdp->donelist) rdp->donetail = &rdp->donelist; else @@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; + rdp->blimit = blimit; } static void __devinit rcu_online_cpu(int cpu) @@ -567,7 +602,12 @@ void synchronize_kernel(void) synchronize_rcu(); } -module_param(maxbatch, int, 0); +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); +#ifdef CONFIG_SMP +module_param(rsinterval, int, 0); +#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ _ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma @ 2006-02-17 15:46 ` Dipankar Sarma 2006-02-18 9:04 ` Andrew Morton 0 siblings, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-02-17 15:46 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney, dada1 I have benchmarked this on an x86_64 NUMA system and see no significant performance difference on kernbench. Tested on both x86_64 and powerpc. The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. With RCU based file structure, consequent batched freeing and a test program like Serge's, we just speed this up and end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only a 2000+ objects in filp cache. The following patch I fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate percpu counter, nr_files, for now and all accesses to it are through get_nr_files() api. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to user. Counting files as an when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. 
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com> --- fs/dcache.c | 2 - fs/file_table.c | 80 +++++++++++++++++++++++++++++++-------------------- include/linux/file.h | 2 - include/linux/fs.h | 2 + kernel/sysctl.c | 5 ++- net/unix/af_unix.c | 2 - 6 files changed, 57 insertions(+), 36 deletions(-) diff -puN fs/dcache.c~fix-file-counting fs/dcache.c --- linux-2.6.16-rc3-rcu/fs/dcache.c~fix-file-counting 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/fs/dcache.c 2006-02-15 16:00:02.000000000 +0530 @@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned lon SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff -puN fs/file_table.c~fix-file-counting fs/file_table.c --- linux-2.6.16-rc3-rcu/fs/file_table.c~fix-file-counting 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/fs/file_table.c 2006-02-16 21:52:35.000000000 +0530 @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include <linux/config.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> @@ -19,52 +20,67 @@ #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> - +#include <linux/sysctl.h> +#include <linux/percpu_counter.h> +#include <asm/atomic.h> + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! 
*/ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); + +static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static DEFINE_SPINLOCK(filp_count_lock); +static inline void file_free_rcu(struct rcu_head *head) +{ + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); +} -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files +static inline void file_free(struct file *f) +{ + percpu_counter_mod(&nr_files, -1L); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); +} + +/* + * Return the total number of open files in the system */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +int get_nr_files(void) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + return percpu_counter_read_positive(&nr_files); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + return files_stat.max_files; } -static inline void file_free_rcu(struct rcu_head *head) +EXPORT_SYMBOL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, 
ppos); } - -static inline void file_free(struct file *f) +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return -ENOSYS; } +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,7 +94,7 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) goto over; @@ -86,6 +102,7 @@ struct file *get_empty_filp(void) if (f == NULL) goto fail; + percpu_counter_mod(&nr_files, 1L); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -101,10 +118,10 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; @@ -276,4 +293,5 @@ void __init files_init(unsigned long mem if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + percpu_counter_init(&nr_files); } diff -puN include/linux/file.h~fix-file-counting include/linux/file.h --- linux-2.6.16-rc3-rcu/include/linux/file.h~fix-file-counting 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/include/linux/file.h 2006-02-15 16:00:02.000000000 +0530 @@ -60,8 +60,6 @@ extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); struct kmem_cache; -extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags); -extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern void 
free_fd_array(struct file **, int); diff -puN include/linux/fs.h~fix-file-counting include/linux/fs.h --- linux-2.6.16-rc3-rcu/include/linux/fs.h~fix-file-counting 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/include/linux/fs.h 2006-02-15 16:00:02.000000000 +0530 @@ -35,6 +35,8 @@ struct files_stat_struct { int max_files; /* tunable */ }; extern struct files_stat_struct files_stat; +extern int get_nr_files(void); +extern int get_max_files(void); struct inodes_stat_t { int nr_inodes; diff -puN kernel/sysctl.c~fix-file-counting kernel/sysctl.c --- linux-2.6.16-rc3-rcu/kernel/sysctl.c~fix-file-counting 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/kernel/sysctl.c 2006-02-15 16:00:02.000000000 +0530 @@ -52,6 +52,9 @@ #include <linux/nfs_fs.h> #endif +extern int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -921,7 +924,7 @@ static ctl_table fs_table[] = { .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_files, }, { .ctl_name = FS_MAXFILE, diff -puN net/unix/af_unix.c~fix-file-counting net/unix/af_unix.c --- linux-2.6.16-rc3-rcu/net/unix/af_unix.c~fix-file-counting 2006-02-15 16:00:02.000000000 +0530 +++ linux-2.6.16-rc3-rcu-dipankar/net/unix/af_unix.c 2006-02-15 16:00:02.000000000 +0530 @@ -547,7 +547,7 @@ static struct sock * unix_create1(struct struct sock *sk = NULL; struct unix_sock *u; - if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + if (atomic_read(&unix_nr_socks) >= 2*get_max_files()) goto out; sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); _ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-17 15:46 ` [PATCH 2/2] fix file counting Dipankar Sarma @ 2006-02-18 9:04 ` Andrew Morton 2006-02-18 9:25 ` Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Andrew Morton @ 2006-02-18 9:04 UTC (permalink / raw) To: dipankar; +Cc: linux-kernel, paulmck, dada1 Dipankar Sarma <dipankar@in.ibm.com> wrote: > > I have benchmarked this on an x86_64 NUMA system and see no > significant performance difference on kernbench. Tested on > both x86_64 and powerpc. > > The way we do file struct accounting is not very suitable for batched > freeing. For scalability reasons, file accounting was constructor/destructor > based. This meant that nr_files was decremented only when > the object was removed from the slab cache. This is > susceptible to slab fragmentation. With RCU based file structure, > consequent batched freeing and a test program like Serge's, > we just speed this up and end up with a very fragmented slab - > > llm22:~ # cat /proc/sys/fs/file-nr > 587730 0 758844 > > At the same time, I see only a 2000+ objects in filp cache. > The following patch I fixes this problem. > > This patch changes the file counting by removing the filp_count_lock. > Instead we use a separate percpu counter, nr_files, for now and all > accesses to it are through get_nr_files() api. In the sysctl > handler for nr_files, we populate files_stat.nr_files before returning > to user. > > Counting files as an when they are created and destroyed (as opposed > to inside slab) allows us to correctly count open files with RCU. Fair enough. What do you think of these changes? From: Andrew Morton <akpm@osdl.org> - Nuke the blank line between "}" and EXPORT_SYMBOL(). That's never seemed pointful to me. - Make the get_max_files export use _GPL - only unix.ko uses it. - Use `-1' in the arg to percpu_counter_mod() rather than `-1L'. The compiler will dtrt and we shouldn't be peering inside percpu_counter internals here anyway. 
- Scrub that - use percpu_counter_dec() and percpu_counter_inc(). - percpu_counters can be inaccurate on big SMP. Before we actually fail a get_empty_filp() attempt, use the (new in -mm) expensive percpu_counter_sum() to check whether we're really over the limit. - Make get_nr_files() static. Which is just as well - any callers might want the percpu_counter_sum() treatment. In which case we'd be better off exporting some bool are_files_over_limit(how_many_i_want); API. Cc: Dipankar Sarma <dipankar@in.ibm.com> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> --- fs/file_table.c | 21 +++++++++++++-------- include/linux/fs.h | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff -puN fs/file_table.c~fix-file-counting-fixes fs/file_table.c --- devel/fs/file_table.c~fix-file-counting-fixes 2006-02-18 01:02:43.000000000 -0800 +++ devel-akpm/fs/file_table.c 2006-02-18 01:02:43.000000000 -0800 @@ -22,6 +22,7 @@ #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> + #include <asm/atomic.h> /* sysctl tunables... */ @@ -42,14 +43,14 @@ static inline void file_free_rcu(struct static inline void file_free(struct file *f) { - percpu_counter_mod(&nr_files, -1L); + percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } /* * Return the total number of open files in the system */ -int get_nr_files(void) +static int get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -61,8 +62,7 @@ int get_max_files(void) { return files_stat.max_files; } - -EXPORT_SYMBOL(get_max_files); +EXPORT_SYMBOL_GPL(get_max_files); /* * Handle nr_files sysctl @@ -95,15 +95,20 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && - !capable(CAP_SYS_ADMIN)) - goto over; + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. 
Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) + goto over; + } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; - percpu_counter_mod(&nr_files, 1L); + percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; diff -puN include/linux/fs.h~fix-file-counting-fixes include/linux/fs.h --- devel/include/linux/fs.h~fix-file-counting-fixes 2006-02-18 01:02:43.000000000 -0800 +++ devel-akpm/include/linux/fs.h 2006-02-18 01:02:43.000000000 -0800 @@ -35,7 +35,6 @@ struct files_stat_struct { int max_files; /* tunable */ }; extern struct files_stat_struct files_stat; -extern int get_nr_files(void); extern int get_max_files(void); struct inodes_stat_t { _ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 9:04 ` Andrew Morton @ 2006-02-18 9:25 ` Dipankar Sarma 2006-02-18 9:45 ` Andrew Morton 2006-02-18 12:14 ` Christoph Hellwig 0 siblings, 2 replies; 24+ messages in thread From: Dipankar Sarma @ 2006-02-18 9:25 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, paulmck, dada1 On Sat, Feb 18, 2006 at 01:04:14AM -0800, Andrew Morton wrote: > Dipankar Sarma <dipankar@in.ibm.com> wrote: > Fair enough. > > What do you think of these changes? > > > - Nuke the blank line between "}" and EXPORT_SYMBOL(). That's never seemed > pointful to me. Sounds good. > > - Make the get_max_files export use _GPL - only unix.ko uses it. Always good. I just didn't want to be the one :) > - Use `-1' in the arg to percpu_counter_mod() rather than `-1L'. The > compiler will dtrt and we shouldn't be peering inside percpu_counter > internals here anyway. > > - Scrub that - use percpu_counter_dec() and percpu_counter_inc(). Gah! I missed those APIs. Makes perfect sense. > > - percpu_counters can be inaccurate on big SMP. Before we actually fail a > get_empty_filp() attempt, use the (new in -mm) expensive > percpu_counter_sum() to check whether we're really over the limit. > - if (get_nr_files() >= files_stat.max_files && > - !capable(CAP_SYS_ADMIN)) > - goto over; > + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { > + /* > + * percpu_counters are inaccurate. Do an expensive check before > + * we go and fail. > + */ > + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) > + goto over; > + } Slight optimization - if (get_nr_files() >= files_stat.max_files) { if (capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum(&nr_files) >= files_stat.max_files) goto over; } else goto over; } > > > - Make get_nr_files() static. Which is just as well - any callers might > want the percpu_counter_sum() treatment. 
In which case we'd be better off > exporting some > > bool are_files_over_limit(how_many_i_want); Well, as of now there is none (xfs use was removed a few months ago). So, I removed the EXPORT_SYMBOL for get_nr_files() and forgot to make it static. Your patch does the right thing. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 9:25 ` Dipankar Sarma @ 2006-02-18 9:45 ` Andrew Morton 2006-02-18 10:06 ` Dipankar Sarma 2006-02-18 12:14 ` Christoph Hellwig 1 sibling, 1 reply; 24+ messages in thread From: Andrew Morton @ 2006-02-18 9:45 UTC (permalink / raw) To: dipankar; +Cc: linux-kernel, paulmck, dada1 Dipankar Sarma <dipankar@in.ibm.com> wrote: > > > - if (get_nr_files() >= files_stat.max_files && > > - !capable(CAP_SYS_ADMIN)) > > - goto over; > > + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { > > + /* > > + * percpu_counters are inaccurate. Do an expensive check before > > + * we go and fail. > > + */ > > + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) > > + goto over; > > + } > > Slight optimization - > > if (get_nr_files() >= files_stat.max_files) { > if (capable(CAP_SYS_ADMIN)) { > /* > * percpu_counters are inaccurate. Do an expensive check before > * we go and fail. > */ > if (percpu_counter_sum(&nr_files) >= > files_stat.max_files) > goto over; > } else > goto over; > } That changes the behaviour for root. Maybe you meant !capable(), but that still changes the behaviour. I'm all confused. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 9:45 ` Andrew Morton @ 2006-02-18 10:06 ` Dipankar Sarma 2006-02-18 10:10 ` Andrew Morton 0 siblings, 1 reply; 24+ messages in thread From: Dipankar Sarma @ 2006-02-18 10:06 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, paulmck, dada1 On Sat, Feb 18, 2006 at 01:45:29AM -0800, Andrew Morton wrote: > Dipankar Sarma <dipankar@in.ibm.com> wrote: > > > > > > Slight optimization - > > > > if (get_nr_files() >= files_stat.max_files) { > > if (capable(CAP_SYS_ADMIN)) { > > /* > > * percpu_counters are inaccurate. Do an expensive check before > > * we go and fail. > > */ > > if (percpu_counter_sum(&nr_files) >= > > files_stat.max_files) > > goto over; > > } else > > goto over; > > } > > That changes the behaviour for root. Maybe you meant !capable(), but that > still changes the behaviour. I'm all confused. Hmm.. on second thoughts, there is no harm doing the expensive check for both privileged and non-privileged users. It will correctly allow non-privileged users to create the new file provided the fast-path percpu counter value returned was greater than the slow path per-cpu counter value. Just ignore my comment in the previous mail. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 10:06 ` Dipankar Sarma @ 2006-02-18 10:10 ` Andrew Morton 2006-02-18 10:44 ` Dipankar Sarma 0 siblings, 1 reply; 24+ messages in thread From: Andrew Morton @ 2006-02-18 10:10 UTC (permalink / raw) To: dipankar; +Cc: linux-kernel, paulmck, dada1 Dipankar Sarma <dipankar@in.ibm.com> wrote: > > On Sat, Feb 18, 2006 at 01:45:29AM -0800, Andrew Morton wrote: > > Dipankar Sarma <dipankar@in.ibm.com> wrote: > > > > > > > > > Slight optimization - > > > > > > if (get_nr_files() >= files_stat.max_files) { > > > if (capable(CAP_SYS_ADMIN)) { > > > /* > > > * percpu_counters are inaccurate. Do an expensive check before > > > * we go and fail. > > > */ > > > if (percpu_counter_sum(&nr_files) >= > > > files_stat.max_files) > > > goto over; > > > } else > > > goto over; > > > } > > > > That changes the behaviour for root. Maybe you meant !capable(), but that > > still changes the behaviour. I'm all confused. > > Hmm.. on second thoughts, there is no harm doing the expensive check > for both priviledged and non-priviledged user. It will correctly > allow non-priviledged users to create the new file provided > the fast-path percpu counter value returned was greater than the > slow path per-cpu counter value. Look closer ;) if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum(&nr_files) >= files_stat.max_files) goto over; } ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 10:10 ` Andrew Morton @ 2006-02-18 10:44 ` Dipankar Sarma 0 siblings, 0 replies; 24+ messages in thread From: Dipankar Sarma @ 2006-02-18 10:44 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, paulmck, dada1 On Sat, Feb 18, 2006 at 02:10:22AM -0800, Andrew Morton wrote: > Dipankar Sarma <dipankar@in.ibm.com> wrote: > > > > On Sat, Feb 18, 2006 at 01:45:29AM -0800, Andrew Morton wrote: > > Look closer ;) > > if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { > /* > * percpu_counters are inaccurate. Do an expensive check before > * we go and fail. > */ > if (percpu_counter_sum(&nr_files) >= files_stat.max_files) > goto over; > } Ah, that check is not relevant for privileged users anyway. That means I should just continue doing where my mind is at the moment - running the weekend errands :) And not look at code. Thanks Dipankar ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 9:25 ` Dipankar Sarma 2006-02-18 9:45 ` Andrew Morton @ 2006-02-18 12:14 ` Christoph Hellwig 2006-02-18 12:31 ` Arjan van de Ven 1 sibling, 1 reply; 24+ messages in thread From: Christoph Hellwig @ 2006-02-18 12:14 UTC (permalink / raw) To: Dipankar Sarma; +Cc: Andrew Morton, linux-kernel, paulmck, dada1, davem, netdev > > - Make the get_max_files export use _GPL - only unix.ko uses it. The real question is, does af_unix really need to allow being built modular? It's quite different from other network protocols and deeply tied to the kernel due to things like descriptor passing or using the filesystem namespace. I already had to export another symbol that really should be internal just for it, and if one module acquires lots of such hacks it's usually a bad sign.. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] fix file counting 2006-02-18 12:14 ` Christoph Hellwig @ 2006-02-18 12:31 ` Arjan van de Ven 0 siblings, 0 replies; 24+ messages in thread From: Arjan van de Ven @ 2006-02-18 12:31 UTC (permalink / raw) To: Christoph Hellwig Cc: Dipankar Sarma, Andrew Morton, linux-kernel, paulmck, dada1, davem, netdev On Sat, 2006-02-18 at 12:14 +0000, Christoph Hellwig wrote: > > > - Make the get_max_files export use _GPL - only unix.ko uses it. > > The real question is, does af_unix really need to allow beeing built > modular? It's quite different from other network protocol and deeply > tied to the kernel due to things like descriptor passing or using > the filesystem namespace. I already had to export another symbol that > really should be internal just for it, and if one module acquires lots > of such hacks it's usually a bad sign.. in 2.4 the answer would have been simple; modutils back then used AF_UNIX stuff before it could load modules, so modular was in practice impossible. Anyway I'd agree with making this non-modular... NOBODY will use this as a module, or if they do loading it somehow is the very first thing done. You just can't live without this, so making it a module is non-sensical. ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2006-02-18 12:31 UTC | newest] Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2006-01-26 18:40 [patch 0/2] RCU: fix various latency/oom issues Dipankar Sarma 2006-01-26 18:41 ` [patch 1/2] rcu batch tuning Dipankar Sarma 2006-01-26 18:42 ` [patch 2/2] fix file counting Dipankar Sarma 2006-01-26 20:15 ` Eric Dumazet 2006-01-27 22:54 ` Andrew Morton 2006-01-27 23:14 ` Paul E. McKenney 2006-01-27 23:28 ` Andrew Morton 2006-01-28 18:42 ` Dipankar Sarma 2006-01-28 18:51 ` Andrew Morton 2006-01-28 19:10 ` Dipankar Sarma 2006-01-30 17:00 ` Dipankar Sarma 2006-01-31 10:33 ` Eric Dumazet 2006-01-31 20:19 ` Dipankar Sarma 2006-01-26 19:33 ` [patch 1/2] rcu batch tuning Paul E. McKenney 2006-01-26 19:42 ` Dipankar Sarma 2006-02-17 15:41 [PATCH 0/2] RCU updates Dipankar Sarma 2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma 2006-02-17 15:46 ` [PATCH 2/2] fix file counting Dipankar Sarma 2006-02-18 9:04 ` Andrew Morton 2006-02-18 9:25 ` Dipankar Sarma 2006-02-18 9:45 ` Andrew Morton 2006-02-18 10:06 ` Dipankar Sarma 2006-02-18 10:10 ` Andrew Morton 2006-02-18 10:44 ` Dipankar Sarma 2006-02-18 12:14 ` Christoph Hellwig 2006-02-18 12:31 ` Arjan van de Ven
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).