* [PATCH 0/7] breaking the global file_list_lock
@ 2007-01-28 11:51 Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 1/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
                   ` (7 more replies)
  0 siblings, 8 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

This patch-set breaks up the global file_list_lock which was found to be a
severe contention point under basically any filesystem intensive workload.

It has been part of the -rt kernel for some time now and is deemed solid and
useful enough to post in its own right. This contention should also show up on
large SMP machines.

Feedback would be appreciated, especially with regard to the extra workqueue
that was added to flush the per-CPU lists. Is there an alternative approach?




^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 1/7] lockdep: lock_set_subclass - reset a held locks subclass
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
@ 2007-01-28 11:51 ` Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 3/7] barrier: a scalable synchonisation barrier Peter Zijlstra
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

[-- Attachment #1: lockdep-lock_set_subclass.patch --]
[-- Type: text/plain, Size: 3702 bytes --]

This can be used to reset a held lock's subclass, for arbitrary-depth
iterated data structures such as trees or lists that have per-node
locks.
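
For illustration only (not part of this patch): a minimal sketch of the
intended use, assuming a hypothetical singly linked list with per-node
spinlocks. The next node's lock is taken with a nested annotation while
both locks are briefly held; once the previous lock is dropped, the held
lock's subclass is folded back to 0 so the lockdep chain does not grow
with the iteration depth.

struct node {				/* hypothetical structure */
	spinlock_t	lock;
	struct node	*next;
};

static void walk_nodes(struct node *head)
{
	struct node *cur = head, *next;

	spin_lock(&cur->lock);
	while ((next = cur->next) != NULL) {
		spin_lock_nested(&next->lock, SINGLE_DEPTH_NESTING);
		spin_unlock(&cur->lock);
		/* only one lock is held again: reset its subclass */
		lock_set_subclass(&next->lock.dep_map, 0, _RET_IP_);
		cur = next;
	}
	spin_unlock(&cur->lock);
}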

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h |    4 ++
 kernel/lockdep.c        |   69 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h	2007-01-13 21:04:10.000000000 +0100
+++ linux-2.6/include/linux/lockdep.h	2007-01-27 21:05:55.000000000 +0100
@@ -240,6 +240,9 @@ extern void lock_acquire(struct lockdep_
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
+extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass,
+			      unsigned long ip);
+
 # define INIT_LOCKDEP				.lockdep_recursion = 0,
 
 #define lockdep_depth(tsk)	((tsk)->lockdep_depth)
@@ -256,6 +259,7 @@ static inline void lockdep_on(void)
 
 # define lock_acquire(l, s, t, r, c, i)		do { } while (0)
 # define lock_release(l, n, i)			do { } while (0)
+# define lock_set_subclass(l, s, i)		do { } while (0)
 # define lockdep_init()				do { } while (0)
 # define lockdep_info()				do { } while (0)
 # define lockdep_init_map(lock, name, key, sub)	do { (void)(key); } while (0)
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c	2007-01-13 21:04:11.000000000 +0100
+++ linux-2.6/kernel/lockdep.c	2007-01-27 21:05:55.000000000 +0100
@@ -2264,6 +2264,55 @@ static int check_unlock(struct task_stru
 	return 1;
 }
 
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+		    unsigned int subclass, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class *class;
+	unsigned int depth;
+	int i;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+	class = register_lock_class(lock, subclass, 0);
+	hlock->class = class;
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock->class->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->acquire_ip))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+		return 0;
+	return 1;
+}
+
 /*
  * Remove the lock to the list of currently held locks in a
  * potentially non-nested (out of order) manner. This is a
@@ -2459,6 +2508,26 @@ void lock_release(struct lockdep_map *lo
 
 EXPORT_SYMBOL_GPL(lock_release);
 
+void
+lock_set_subclass(struct lockdep_map *lock,
+		  unsigned int subclass, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
+	check_flags(flags);
+	if (__lock_set_subclass(lock, subclass, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
 /*
  * Used by the testsuite, sanitize the validator state
  * after a simulated failure:

--


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 1/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
@ 2007-01-28 11:51 ` Peter Zijlstra
  2007-01-28 14:39   ` Christoph Hellwig
  2007-01-28 11:51 ` [PATCH 4/7] fs: break the file_list_lock for sb->s_files Peter Zijlstra
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

[-- Attachment #1: barrier.patch --]
[-- Type: text/plain, Size: 2317 bytes --]

This barrier thing is constructed so that it will not write in the sync()
condition (the hot path) when there are no active lock sections; thus avoiding
cacheline bouncing. -- I'm just not sure how this will work out in relation to
PI. We might track those in the barrier scope and boost those by the max prio
of the blockers.
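
For illustration only (not part of this patch): a sketch of the intended
usage, with hypothetical names. The rare side brackets its work with
barrier_lock()/barrier_unlock(); the frequent side calls barrier_sync(),
which in the common case is a single atomic_read() and only sleeps while
a lock section is in flight.

static struct barrier demo_barrier;	/* init_barrier() at setup time */

/* rare path: a "lock section" around the modification */
static void demo_update(void)
{
	barrier_lock(&demo_barrier);
	/* ... perform the (rare) modification ... */
	barrier_unlock(&demo_barrier);	/* wakes any barrier_sync() waiters */
}

/* hot path: wait until no lock sections are active */
static void demo_sync(void)
{
	barrier_sync(&demo_barrier);
	/*
	 * lock sections that were in flight have completed;
	 * new ones may of course start immediately
	 */
}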

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/barrier.h |   65 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

Index: linux/include/linux/barrier.h
===================================================================
--- /dev/null
+++ linux/include/linux/barrier.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2006, Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Licensed under the GPLv2.
+ *
+ * simple synchronisation barrier
+ *
+ * The sync() operation will wait for completion of all lock sections if any.
+ *
+ * The lock sections are intended to be rare and the sync operation frequent.
+ * This construct is created to be scalable and does only 1 read in the fast
+ * path (sync), hence avoiding cacheline bounces.
+ *
+ * NOTE: this provides _synchronisation_ only; any serialisation requirements
+ * must be met by something external to this construct.
+ */
+#ifndef _LINUX_BARRIER_H
+#define _LINUX_BARRIER_H
+
+#ifdef __KERNEL__
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+
+struct barrier {
+	atomic_t count;
+	wait_queue_head_t wait;
+};
+
+static inline void init_barrier(struct barrier *b)
+{
+	atomic_set(&b->count, 0);
+	init_waitqueue_head(&b->wait);
+	__acquire(b);
+}
+
+static inline void barrier_lock(struct barrier *b)
+{
+	__release(b);
+	atomic_inc(&b->count);
+	smp_wmb();
+}
+
+static inline void barrier_unlock(struct barrier *b)
+{
+	smp_wmb();
+	if (atomic_dec_and_test(&b->count))
+		__wake_up(&b->wait, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 0, b);
+}
+
+static inline void barrier_sync(struct barrier *b)
+{
+	might_sleep();
+
+	if (unlikely(atomic_read(&b->count))) {
+		DEFINE_WAIT(wait);
+		prepare_to_wait(&b->wait, &wait, TASK_UNINTERRUPTIBLE);
+		while (atomic_read(&b->count))
+			schedule();
+		finish_wait(&b->wait, &wait);
+	}
+}
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_BARRIER_H */

--


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 1/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 3/7] barrier: a scalable synchonisation barrier Peter Zijlstra
@ 2007-01-28 11:51 ` Peter Zijlstra
  2007-01-28 14:43   ` Christoph Hellwig
  2007-01-28 11:51 ` [PATCH 5/7] fs: restore previous sb->s_files iteration semantics Peter Zijlstra
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

[-- Attachment #1: s_files.patch --]
[-- Type: text/plain, Size: 13785 bytes --]

Break the protection of sb->s_files out from under the global file_list_lock.

sb->s_files is converted to a lock_list. Furthermore, to prevent the
lock_list_head from getting too contended with concurrent add operations,
the add is buffered in per-CPU filevecs.

This would ordinarily require a flush before a delete operation, to ensure
the to-be-deleted entry has actually been added to the list. This is avoided
by storing a pointer to the filevec slot in the not-yet-used list_head.
That pointer can then be used to clear the filevec entry before it is
actually added.

The file_flag mess is a bit unfortunate - this can be removed by also
converting tty->tty_files to a lock_list (TODO).
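
Note: patch 2/7, which introduces the lock_list primitive used throughout
this patch, is not part of this excerpt (see Christoph's remark further
down the thread). Purely as a reading aid, a guessed-at shape that is
consistent with how it is used below -- the real definition in 2/7 may
well differ:

struct lock_list_head {
	union {
		struct list_head	head;		/* as the list head */
		struct {
			struct list_head *next, *prev;	/* as an entry */
		};
	};
	spinlock_t			lock;		/* protects the links */
};

#define INIT_LOCK_LIST_HEAD(l)				\
	do {						\
		INIT_LIST_HEAD(&(l)->head);		\
		spin_lock_init(&(l)->lock);		\
	} while (0)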

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/dquot.c                   |   10 +-
 fs/file_table.c              |  170 +++++++++++++++++++++++++++++++++++++++----
 fs/open.c                    |    2 
 fs/proc/generic.c            |    8 --
 fs/super.c                   |    7 -
 include/linux/fs.h           |   23 +++++
 mm/readahead.c               |    2 
 security/selinux/selinuxfs.c |    9 +-
 8 files changed, 194 insertions(+), 37 deletions(-)

Index: linux-2.6/fs/dquot.c
===================================================================
--- linux-2.6.orig/fs/dquot.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/dquot.c	2007-01-27 21:07:44.000000000 +0100
@@ -688,23 +688,21 @@ static int dqinit_needed(struct inode *i
 /* This routine is guarded by dqonoff_mutex mutex */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
-	struct list_head *p;
+	struct file *filp;
 
 restart:
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = filp->f_path.dentry->d_inode;
 		if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) {
 			struct dentry *dentry = dget(filp->f_path.dentry);
-			file_list_unlock();
+			lock_list_for_each_entry_stop(filp, f_u.fu_llist);
 			sb->dq_op->initialize(inode, type);
 			dput(dentry);
 			/* As we may have blocked we had better restart... */
 			goto restart;
 		}
 	}
-	file_list_unlock();
 }
 
 /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/file_table.c	2007-01-27 21:07:44.000000000 +0100
@@ -113,7 +113,7 @@ struct file *get_empty_filp(void)
 		goto fail_sec;
 
 	tsk = current;
-	INIT_LIST_HEAD(&f->f_u.fu_list);
+	INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist);
 	atomic_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_uid = tsk->fsuid;
@@ -245,32 +245,175 @@ void put_filp(struct file *file)
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+enum {
+	FILEVEC_SIZE = 15
+};
+
+struct filevec {
+	unsigned long nr;
+	struct file *files[FILEVEC_SIZE];
+};
+
+static DEFINE_PER_CPU(struct filevec, sb_fvec);
+
+static inline unsigned int filevec_size(struct filevec *fvec)
 {
-	if (!list)
-		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+	return FILEVEC_SIZE - fvec->nr;
+}
+
+static inline unsigned int filevec_count(struct filevec *fvec)
+{
+	return fvec->nr;
+}
+
+static inline void filevec_reinit(struct filevec *fvec)
+{
+	fvec->nr = 0;
+}
+
+static inline unsigned int filevec_add(struct filevec *fvec, struct file *filp)
+{
+	rcu_assign_pointer(fvec->files[fvec->nr], filp);
+
+	/*
+	 * Here we do icky stuff in order to avoid flushing the per cpu filevec
+	 * on list removal.
+	 *
+	 * We store the location on the per cpu filevec in the as of yet unused
+	 * fu_llist.next field and toggle bit 0 to indicate we done so. This
+	 * allows the removal code to set the filevec entry to NULL, thereby
+	 * avoiding the list add.
+	 *
+	 * Abuse the fu_llist.lock for protection.
+	 */
+	spin_lock(&filp->f_u.fu_llist.lock);
+	filp->f_u.fu_llist.next = (void *)&fvec->files[fvec->nr];
+	__set_bit(0, (void *)&filp->f_u.fu_llist.next);
+	spin_unlock(&filp->f_u.fu_llist.lock);
+
+	fvec->nr++;
+	return filevec_size(fvec);
+}
+
+static void __filevec_add(struct filevec *fvec)
+{
+	int i;
+
+	for (i = 0; i < filevec_count(fvec); i++) {
+		struct file *filp;
+
+		/*
+		 * see the comment in filevec_add();
+		 * need RCU because a concurrent remove might have deleted
+		 * the entry from under us.
+		 */
+		rcu_read_lock();
+		filp = rcu_dereference(fvec->files[i]);
+		/*
+		 * the simple case, its gone - NEXT!
+		 */
+		if (!filp) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		spin_lock(&filp->f_u.fu_llist.lock);
+		/*
+		 * If the entry really is still there, add it!
+		 */
+		if (rcu_dereference(fvec->files[i])) {
+			struct super_block *sb =
+				filp->f_mapping->host->i_sb;
+
+			__lock_list_add(&filp->f_u.fu_llist, &sb->s_files);
+		}
+		spin_unlock(&filp->f_u.fu_llist.lock);
+		rcu_read_unlock();
+	}
+	filevec_reinit(fvec);
+}
+
+static void filevec_add_drain(void)
+{
+	struct filevec *fvec = &get_cpu_var(sb_fvec);
+	if (filevec_count(fvec))
+		__filevec_add(fvec);
+	put_cpu_var(sb_fvec);
 }
 
+static void filevec_add_drain_per_cpu(struct work_struct *dummy)
+{
+	filevec_add_drain();
+}
+
+int filevec_add_drain_all(void)
+{
+	return schedule_on_each_cpu(filevec_add_drain_per_cpu);
+}
+EXPORT_SYMBOL_GPL(filevec_add_drain_all);
+
 void file_kill(struct file *file)
 {
-	if (!list_empty(&file->f_u.fu_list)) {
+	if (file_flag(file, F_SUPERBLOCK)) {
+		void **ptr;
+
+		file_flag_clear(file, F_SUPERBLOCK);
+
+		/*
+		 * If bit 0 of the fu_llist.next pointer is set we're still
+		 * enqueued on a per cpu filevec, in that case clear the entry
+		 * and we're done.
+		 */
+		spin_lock(&file->f_u.fu_llist.lock);
+		ptr = (void **)file->f_u.fu_llist.next;
+		if (__test_and_clear_bit(0, (void *)&ptr)) {
+			rcu_assign_pointer(*ptr, NULL);
+			INIT_LIST_HEAD(&file->f_u.fu_llist.head);
+			spin_unlock(&file->f_u.fu_llist.lock);
+			return;
+		}
+		spin_unlock(&file->f_u.fu_llist.lock);
+
+		if (!list_empty(&file->f_u.fu_list))
+			lock_list_del_init(&file->f_u.fu_llist);
+
+	} else if (!list_empty(&file->f_u.fu_list)) {
 		file_list_lock();
 		list_del_init(&file->f_u.fu_list);
 		file_list_unlock();
 	}
 }
 
+void file_move(struct file *file, struct list_head *list)
+{
+	struct super_block *sb;
+
+	if (!list)
+		return;
+
+	file_kill(file);
+
+	sb = file->f_mapping->host->i_sb;
+	if (list == &sb->s_files.head) {
+		struct filevec *fvec = &get_cpu_var(sb_fvec);
+		file_flag_set(file, F_SUPERBLOCK);
+		if (!filevec_add(fvec, file))
+			__filevec_add(fvec);
+		put_cpu_var(sb_fvec);
+	} else {
+		file_list_lock();
+		list_add(&file->f_u.fu_list, list);
+		file_list_unlock();
+	}
+}
+
 int fs_may_remount_ro(struct super_block *sb)
 {
-	struct list_head *p;
+	struct file *file;
 
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *file = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -281,10 +424,9 @@ int fs_may_remount_ro(struct super_block
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
 	}
-	file_list_unlock();
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	lock_list_for_each_entry_stop(file, f_u.fu_llist);
 	return 0;
 }
 
Index: linux-2.6/fs/open.c
===================================================================
--- linux-2.6.orig/fs/open.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/open.c	2007-01-27 21:07:44.000000000 +0100
@@ -692,7 +692,7 @@ static struct file *__dentry_open(struct
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_move(f, &inode->i_sb->s_files.head);
 
 	if (!open && f->f_op)
 		open = f->f_op->open;
Index: linux-2.6/fs/proc/generic.c
===================================================================
--- linux-2.6.orig/fs/proc/generic.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/proc/generic.c	2007-01-27 21:07:44.000000000 +0100
@@ -549,15 +549,14 @@ static int proc_register(struct proc_dir
  */
 static void proc_kill_inodes(struct proc_dir_entry *de)
 {
-	struct list_head *p;
+	struct file *filp;
 	struct super_block *sb = proc_mnt->mnt_sb;
 
 	/*
 	 * Actually it's a partial revoke().
 	 */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
 		struct inode * inode;
 		const struct file_operations *fops;
@@ -571,7 +570,6 @@ static void proc_kill_inodes(struct proc
 		filp->f_op = NULL;
 		fops_put(fops);
 	}
-	file_list_unlock();
 }
 
 static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/super.c	2007-01-27 21:07:44.000000000 +0100
@@ -67,7 +67,7 @@ static struct super_block *alloc_super(s
 		}
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
-		INIT_LIST_HEAD(&s->s_files);
+		INIT_LOCK_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -568,12 +568,11 @@ static void mark_files_ro(struct super_b
 {
 	struct file *f;
 
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	filevec_add_drain_all();
+	lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) {
 		if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
 			f->f_mode &= ~FMODE_WRITE;
 	}
-	file_list_unlock();
 }
 
 /**
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2007-01-22 22:43:28.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2007-01-27 21:07:44.000000000 +0100
@@ -275,6 +275,7 @@ extern int dir_notify_enable;
 #include <linux/cache.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
+#include <linux/lock_list.h>
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
@@ -709,9 +710,13 @@ struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
 	 * fu_rcuhead for RCU freeing
+	 * fu_llist is used for the superblock s_files list; its crucial that
+	 * the spinlock contained therein is not clobbered by other uses of
+	 * the union.
 	 */
 	union {
 		struct list_head	fu_list;
+		struct lock_list_head	fu_llist;
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
@@ -744,9 +749,25 @@ extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 #define file_list_unlock() spin_unlock(&files_lock);
 
+/*
+ * steal the upper 8 bits from the read-a-head flags
+ */
+#define F_SHIFT		24
+
+#define F_SUPERBLOCK	0
+
+#define file_flag_set(file, flag)	\
+	__set_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag_clear(file, flag)	\
+	__clear_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag(file, flag)		\
+	test_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+
 #define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
+extern int filevec_add_drain_all(void);
+
 #define	MAX_NON_LFS	((1UL<<31) - 1)
 
 /* Page cache limit. The filesystems should put that into their s_maxbytes 
@@ -928,7 +949,7 @@ struct super_block {
 	struct list_head	s_dirty;	/* dirty inodes */
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
-	struct list_head	s_files;
+	struct lock_list_head	s_files;
 
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c	2007-01-22 22:43:29.000000000 +0100
+++ linux-2.6/mm/readahead.c	2007-01-27 21:07:44.000000000 +0100
@@ -69,7 +69,7 @@ static inline void reset_ahead_window(st
 static inline void ra_off(struct file_ra_state *ra)
 {
 	ra->start = 0;
-	ra->flags = 0;
+	ra->flags &= (~0UL) << F_SHIFT;
 	ra->size = 0;
 	reset_ahead_window(ra);
 	return;
Index: linux-2.6/security/selinux/selinuxfs.c
===================================================================
--- linux-2.6.orig/security/selinux/selinuxfs.c	2007-01-13 21:04:19.000000000 +0100
+++ linux-2.6/security/selinux/selinuxfs.c	2007-01-27 21:10:48.000000000 +0100
@@ -940,7 +940,8 @@ static struct file_operations sel_commit
  * fs/proc/generic.c proc_kill_inodes */
 static void sel_remove_bools(struct dentry *de)
 {
-	struct list_head *p, *node;
+	struct list_head *node;
+	struct file *filp;
 	struct super_block *sb = de->d_sb;
 
 	spin_lock(&dcache_lock);
@@ -962,9 +963,8 @@ static void sel_remove_bools(struct dent
 
 	spin_unlock(&dcache_lock);
 
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
 
 		if (dentry->d_parent != de) {
@@ -972,7 +972,6 @@ static void sel_remove_bools(struct dent
 		}
 		filp->f_op = NULL;
 	}
-	file_list_unlock();
 }
 
 #define BOOL_DIR_NAME "booleans"

--


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 5/7] fs: restore previous sb->s_files iteration semantics
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
                   ` (2 preceding siblings ...)
  2007-01-28 11:51 ` [PATCH 4/7] fs: break the file_list_lock for sb->s_files Peter Zijlstra
@ 2007-01-28 11:51 ` Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 6/7] schedule_on_each_cpu_wq() Peter Zijlstra
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

[-- Attachment #1: s_files-barrier.patch --]
[-- Type: text/plain, Size: 4024 bytes --]

Ensure no new files will be added when we're inspecting 'all' files. Without
this, files could be added in front while we're iterating and we'd miss those.
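
For illustration only (not part of the patch), the pairing used below in
condensed form: walkers of sb->s_files hold a barrier lock section for the
duration of the walk, while file_kill() -- which file_move() also calls
before re-adding -- does barrier_sync() and therefore waits for any walk
in flight before the list is modified.

	/* walker side (fs_may_remount_ro, mark_files_ro, ...): */
	barrier_lock(&sb->s_barrier);
	filevec_add_drain_all();
	lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) {
		/* ... inspect f ... */
	}
	barrier_unlock(&sb->s_barrier);

	/* modifier side (file_kill), before touching the list: */
	barrier_sync(&file->f_mapping->host->i_sb->s_barrier);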

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/file_table.c              |    9 +++++++++
 fs/super.c                   |    3 +++
 include/linux/fs.h           |    2 ++
 security/selinux/selinuxfs.c |    7 ++++---
 4 files changed, 18 insertions(+), 3 deletions(-)

Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c	2007-01-27 21:07:44.000000000 +0100
+++ linux-2.6/fs/file_table.c	2007-01-27 21:11:12.000000000 +0100
@@ -354,6 +354,12 @@ EXPORT_SYMBOL_GPL(filevec_add_drain_all)
 
 void file_kill(struct file *file)
 {
+	if (file && file->f_mapping && file->f_mapping->host) {
+		struct super_block *sb = file->f_mapping->host->i_sb;
+		if (sb)
+			barrier_sync(&sb->s_barrier);
+	}
+
 	if (file_flag(file, F_SUPERBLOCK)) {
 		void **ptr;
 
@@ -412,6 +418,7 @@ int fs_may_remount_ro(struct super_block
 	struct file *file;
 
 	/* Check that no files are currently opened for writing. */
+	barrier_lock(&sb->s_barrier);
 	filevec_add_drain_all();
 	lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = file->f_path.dentry->d_inode;
@@ -424,9 +431,11 @@ int fs_may_remount_ro(struct super_block
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
 	}
+	barrier_unlock(&sb->s_barrier);
 	return 1; /* Tis' cool bro. */
 too_bad:
 	lock_list_for_each_entry_stop(file, f_u.fu_llist);
+	barrier_unlock(&sb->s_barrier);
 	return 0;
 }
 
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c	2007-01-27 21:07:44.000000000 +0100
+++ linux-2.6/fs/super.c	2007-01-27 21:11:12.000000000 +0100
@@ -68,6 +68,7 @@ static struct super_block *alloc_super(s
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
 		INIT_LOCK_LIST_HEAD(&s->s_files);
+		init_barrier(&s->s_barrier);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -568,11 +569,13 @@ static void mark_files_ro(struct super_b
 {
 	struct file *f;
 
+	barrier_lock(&sb->s_barrier);
 	filevec_add_drain_all();
 	lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) {
 		if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
 			f->f_mode &= ~FMODE_WRITE;
 	}
+	barrier_unlock(&sb->s_barrier);
 }
 
 /**
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2007-01-27 21:07:44.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2007-01-27 21:11:12.000000000 +0100
@@ -281,6 +281,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/pid.h>
 #include <linux/mutex.h>
+#include <linux/barrier.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -950,6 +951,7 @@ struct super_block {
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct lock_list_head	s_files;
+	struct barrier		s_barrier;
 
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
Index: linux-2.6/security/selinux/selinuxfs.c
===================================================================
--- linux-2.6.orig/security/selinux/selinuxfs.c	2007-01-27 21:10:48.000000000 +0100
+++ linux-2.6/security/selinux/selinuxfs.c	2007-01-27 21:11:40.000000000 +0100
@@ -963,6 +963,7 @@ static void sel_remove_bools(struct dent
 
 	spin_unlock(&dcache_lock);
 
+	barrier_lock(&sb->s_barrier);
 	filevec_add_drain_all();
 	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
@@ -972,6 +973,7 @@ static void sel_remove_bools(struct dent
 		}
 		filp->f_op = NULL;
 	}
+	barrier_unlock(&sb->s_barrier);
 }
 
 #define BOOL_DIR_NAME "booleans"

--


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 6/7] schedule_on_each_cpu_wq()
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
                   ` (3 preceding siblings ...)
  2007-01-28 11:51 ` [PATCH 5/7] fs: restore previous sb->s_files iteration semantics Peter Zijlstra
@ 2007-01-28 11:51 ` Peter Zijlstra
  2007-01-28 11:51 ` [PATCH 7/7] fs: fixup filevec_add_drain_all Peter Zijlstra
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

[-- Attachment #1: s_files-schedule_on_each_cpu_wq.patch --]
[-- Type: text/plain, Size: 3618 bytes --]

From: Ingo Molnar <mingo@elte.hu>

Provide a schedule_on_each_cpu_wq() function that uses a caller-supplied
workqueue to do all the work. This avoids the calling process having to
schedule.
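
For illustration only (not part of this patch), with hypothetical names:
the caller supplies its own multi-threaded workqueue and a handler that
will be run once on every online CPU.

static struct workqueue_struct *demo_wq;  /* from create_workqueue("demod") */

static void demo_per_cpu(struct work_struct *unused)
{
	/* runs once on each online CPU, in workqueue context */
}

static int demo_run_everywhere(void)
{
	return schedule_on_each_cpu_wq(demo_wq, demo_per_cpu);
}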

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/workqueue.h |    1 
 kernel/workqueue.c        |   66 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 11 deletions(-)

Index: linux-2.6/include/linux/workqueue.h
===================================================================
--- linux-2.6.orig/include/linux/workqueue.h	2007-01-13 21:04:11.000000000 +0100
+++ linux-2.6/include/linux/workqueue.h	2007-01-27 21:12:16.000000000 +0100
@@ -179,6 +179,7 @@ extern int FASTCALL(schedule_delayed_wor
 
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
 extern int schedule_on_each_cpu(work_func_t func);
+extern int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
Index: linux-2.6/kernel/workqueue.c
===================================================================
--- linux-2.6.orig/kernel/workqueue.c	2007-01-13 21:04:12.000000000 +0100
+++ linux-2.6/kernel/workqueue.c	2007-01-27 21:12:16.000000000 +0100
@@ -296,6 +296,20 @@ int queue_delayed_work_on(int cpu, struc
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+static void leak_check(void *func)
+{
+	if (!in_atomic() && lockdep_depth(current) <= 0)
+		return;
+	printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+				"%s/0x%08x/%d\n",
+				current->comm, preempt_count(),
+			       	current->pid);
+	printk(KERN_ERR "    last function: ");
+	print_symbol("%s\n", (unsigned long)func);
+	debug_show_held_locks(current);
+	dump_stack();
+}
+
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	unsigned long flags;
@@ -323,18 +337,10 @@ static void run_workqueue(struct cpu_wor
 		BUG_ON(get_wq_data(work) != cwq);
 		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
 			work_release(work);
-		f(work);
 
-		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
-			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
-					"%s/0x%08x/%d\n",
-					current->comm, preempt_count(),
-				       	current->pid);
-			printk(KERN_ERR "    last function: ");
-			print_symbol("%s\n", (unsigned long)f);
-			debug_show_held_locks(current);
-			dump_stack();
-		}
+		leak_check(NULL);
+		f(work);
+		leak_check(f);
 
 		spin_lock_irqsave(&cwq->lock, flags);
 		cwq->remove_sequence++;
@@ -649,6 +655,44 @@ int schedule_on_each_cpu(work_func_t fun
 	return 0;
 }
 
+/**
+ * schedule_on_each_cpu_wq - call a function on each online CPU on a per-CPU wq
+ * @func: the function to call
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * Appears to be racy against CPU hotplug.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func)
+{
+	int cpu;
+	struct work_struct *works;
+
+	if (is_single_threaded(wq)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+	works = alloc_percpu(struct work_struct);
+	if (!works)
+		return -ENOMEM;
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		INIT_WORK(work, func);
+		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
+		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+	}
+	flush_workqueue(wq);
+	free_percpu(works);
+
+	return 0;
+}
+
+
 void flush_scheduled_work(void)
 {
 	flush_workqueue(keventd_wq);

--


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 7/7] fs: fixup filevec_add_drain_all
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
                   ` (4 preceding siblings ...)
  2007-01-28 11:51 ` [PATCH 6/7] schedule_on_each_cpu_wq() Peter Zijlstra
@ 2007-01-28 11:51 ` Peter Zijlstra
  2007-01-28 12:16 ` [PATCH 8/7] fs: free_write_pipe() fix Peter Zijlstra
  2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
  7 siblings, 0 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 11:51 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Peter Zijlstra

[-- Attachment #1: s_files-per_cpu-flush-fix.patch --]
[-- Type: text/plain, Size: 1641 bytes --]

From: Ingo Molnar <mingo@elte.hu>

filevec_add_drain_all() is called under locks and schedule_on_each_cpu() can
schedule. Solve this by using the new schedule_on_each_cpu_wq().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 fs/file_table.c |   21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c	2007-01-27 23:09:23.000000000 +0100
+++ linux-2.6/fs/file_table.c	2007-01-27 23:10:02.000000000 +0100
@@ -333,6 +333,22 @@ static void __filevec_add(struct filevec
 	filevec_reinit(fvec);
 }
 
+/*
+ * Flush files per-CPU workqueue:
+ */
+static struct workqueue_struct *flush_files_workqueue;
+
+int __init flush_files_init(void)
+{
+        flush_files_workqueue = create_workqueue("flush_filesd");
+        if (!flush_files_workqueue)
+                panic("Failed to create flush_filesd\n");
+
+	return 0;
+}
+
+__initcall(flush_files_init);
+
 static void filevec_add_drain(void)
 {
 	struct filevec *fvec = &get_cpu_var(sb_fvec);
@@ -341,14 +357,15 @@ static void filevec_add_drain(void)
 	put_cpu_var(sb_fvec);
 }
 
-static void filevec_add_drain_per_cpu(struct work_struct *dummy)
+static void filevec_add_drain_per_cpu(struct work_struct *none)
 {
 	filevec_add_drain();
 }
 
 int filevec_add_drain_all(void)
 {
-	return schedule_on_each_cpu(filevec_add_drain_per_cpu);
+	return schedule_on_each_cpu_wq(flush_files_workqueue,
+				       filevec_add_drain_per_cpu);
 }
 EXPORT_SYMBOL_GPL(filevec_add_drain_all);
 

--


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 8/7] fs: free_write_pipe() fix
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
                   ` (5 preceding siblings ...)
  2007-01-28 11:51 ` [PATCH 7/7] fs: fixup filevec_add_drain_all Peter Zijlstra
@ 2007-01-28 12:16 ` Peter Zijlstra
  2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
  7 siblings, 0 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 12:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Ingo Molnar


From: Ingo Molnar <mingo@elte.hu>

file_kill() has to look at the file's inode (for the barrier logic),
hence make sure we free the inode before the file.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/pipe.c |   15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

Index: linux/fs/pipe.c
===================================================================
--- linux.orig/fs/pipe.c
+++ linux/fs/pipe.c
@@ -945,12 +945,17 @@ struct file *create_write_pipe(void)
 	return ERR_PTR(err);
 }
 
-void free_write_pipe(struct file *f)
+void free_write_pipe(struct file *file)
 {
-	free_pipe_info(f->f_dentry->d_inode);
-	dput(f->f_path.dentry);
-	mntput(f->f_path.mnt);
-	put_filp(f);
+	struct dentry *dentry = file->f_path.dentry;
+	struct vfsmount *mnt = file->f_path.mnt;
+
+	free_pipe_info(file->f_dentry->d_inode);
+	file->f_path.dentry = NULL;
+	file->f_path.mnt = NULL;
+	put_filp(file);
+	dput(dentry);
+	mntput(mnt);
 }
 
 struct file *create_read_pipe(struct file *wrf)



^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-28 11:51 ` [PATCH 3/7] barrier: a scalable synchonisation barrier Peter Zijlstra
@ 2007-01-28 14:39   ` Christoph Hellwig
  2007-01-28 15:24     ` Ingo Molnar
  0 siblings, 1 reply; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 14:39 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel, Ingo Molnar

On Sun, Jan 28, 2007 at 12:51:21PM +0100, Peter Zijlstra wrote:
> This barrier thing is constructed so that it will not write in the sync()
> condition (the hot path) when there are no active lock sections; thus avoiding
> cacheline bouncing. -- I'm just not sure how this will work out in relation to
> PI. We might track those in the barrier scope and boost those by the max prio
> of the blockers.

Is this really needed?  We seem to grow new funky locking algorithms
exponentially, while people already have a hard time understanding
the existing ones.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
                   ` (6 preceding siblings ...)
  2007-01-28 12:16 ` [PATCH 8/7] fs: free_write_pipe() fix Peter Zijlstra
@ 2007-01-28 14:43 ` Christoph Hellwig
  2007-01-28 15:11   ` Christoph Hellwig
                     ` (2 more replies)
  7 siblings, 3 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 14:43 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel, Ingo Molnar

On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
> This patch-set breaks up the global file_list_lock which was found to be a
> severe contention point under basically any filesystem intensive workload.

Benchmarks, please.  Where exactly do you see contention for this?


filesystem intensive workload apparently means namespace operation heavy
workload, right?  The biggest bottleneck I've seen with those is dcache lock.

Even if this is becoming a real problem there must be simpler ways to fix
this than introducing various new locking primitives and all kinds of
complexity.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 11:51 ` [PATCH 4/7] fs: break the file_list_lock for sb->s_files Peter Zijlstra
@ 2007-01-28 14:43   ` Christoph Hellwig
  2007-01-28 15:21     ` Ingo Molnar
  0 siblings, 1 reply; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 14:43 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel, Ingo Molnar

On Sun, Jan 28, 2007 at 12:51:22PM +0100, Peter Zijlstra wrote:
> Break the protection of sb->s_files out from under the global file_list_lock.
> 
> sb->s_files is converted to a lock_list. furthermore to prevent the 
> lock_list_head of getting too contended with concurrent add operations
> the add is buffered in per cpu filevecs.

NACK.  Please don't start using lockdep internals in core code.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
@ 2007-01-28 15:11   ` Christoph Hellwig
  2007-01-28 15:29     ` Peter Zijlstra
  2007-01-29 13:32     ` Stephen Smalley
  2007-01-28 15:24   ` Ingo Molnar
  2007-01-28 18:41   ` [PATCH 0/7] breaking the global file_list_lock Ingo Molnar
  2 siblings, 2 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:11 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel,
	Ingo Molnar

On Sun, Jan 28, 2007 at 02:43:25PM +0000, Christoph Hellwig wrote:
> On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
> > This patch-set breaks up the global file_list_lock which was found to be a
> > severe contention point under basically any filesystem intensive workload.
> 
> Benchmarks, please.  Where exactly do you see contention for this?
> 
> 
> filesystem intensive workload apparently means namespace operation heavy
> workload, right?  The biggest bottleneck I've seen with those is dcache lock.
> 
> Even if this is becoming a real problem there must be simpler ways to fix
> this than introducing various new locking primitives and all kinds of
> complexity.

One good way to fix scalability without all this braindamage is
to get rid of sb->s_files.  Current uses are:

 - fs/dquot.c:add_dquot_ref()

	This performs its actual operation on inodes.  We should
	be able to check inode->i_writecount to see which inodes
	need quota initialization (rough sketch below).

 - fs/file_table.c:fs_may_remount_ro()

	This one is gone in Dave Hansens per-mountpoint r/o patchkit

 - fs/proc/generic.c:proc_kill_inodes()

	This can be done with a list inside procfs.

 - fs/super.c:mark_files_ro()

	This one is only used for do_emergency_remount(), which is
	an utter hack.  It might be much better to just deny any
	kind of write access through a superblock flag here.

 - fs/selinuxfs.c:sel_remove_bools()

	Utter madness.  I have no idea how this ever got merged.
	Maybe the selinux folks can explain what crack they were
	on when writing this.  The problem would go away with
	a generic revoke infrastructure.

Once sb->s_files is gone we can also kill off fu_list entirely,
replace it with a list head in the tty code, and make the lock
for it per-tty.
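
For the add_dquot_ref() item above, a rough sketch of that direction
(illustrative only; it would live in fs/dquot.c next to the current code
and the locking details would need real review): walk sb->s_inodes under
inode_lock instead of sb->s_files, and use i_writecount to find inodes
that are open for writing.

static void add_dquot_ref_by_inode(struct super_block *sb, int type)
{
	struct inode *inode;

restart:
	spin_lock(&inode_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		if (inode->i_state & (I_FREEING | I_WILL_FREE))
			continue;
		if (atomic_read(&inode->i_writecount) <= 0)
			continue;
		if (!dqinit_needed(inode, type))
			continue;
		__iget(inode);
		spin_unlock(&inode_lock);

		sb->dq_op->initialize(inode, type);
		iput(inode);
		/* we may have blocked, restart the scan */
		goto restart;
	}
	spin_unlock(&inode_lock);
}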

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 14:43   ` Christoph Hellwig
@ 2007-01-28 15:21     ` Ingo Molnar
  2007-01-28 15:30       ` Christoph Hellwig
  0 siblings, 1 reply; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 15:21 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 12:51:22PM +0100, Peter Zijlstra wrote:
> > Break the protection of sb->s_files out from under the global file_list_lock.
> > 
> > sb->s_files is converted to a lock_list. furthermore to prevent the 
> > lock_list_head of getting too contended with concurrent add 
> > operations the add is buffered in per cpu filevecs.
> 
> NACK.  Please don't start using lockdep internals in core code.

what do you mean by that?

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
  2007-01-28 15:11   ` Christoph Hellwig
@ 2007-01-28 15:24   ` Ingo Molnar
  2007-01-28 16:52     ` Martin J. Bligh
  2007-01-28 18:41   ` [PATCH 0/7] breaking the global file_list_lock Ingo Molnar
  2 siblings, 1 reply; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 15:24 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
> > This patch-set breaks up the global file_list_lock which was found 
> > to be a severe contention point under basically any filesystem 
> > intensive workload.
> 
> Benchmarks, please.  Where exactly do you see contention for this?

it's the most contended spinlock we have during a parallel kernel 
compile on an 8-way system. But it's pretty common-sense as well: 
without doing any measurements, it's basically the only global lock left 
in just about every VFS workload that doesn't involve massive amounts of 
dentries being created/removed (which is still dominated by the dcache_lock).

> filesystem intensive workload apparently means namespace operation 
> heavy workload, right?  The biggest bottleneck I've seen with those is 
> dcache lock.

the dcache lock is not a problem during kernel compiles. (its 
rcu-ification works nicely in that workload)

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-28 14:39   ` Christoph Hellwig
@ 2007-01-28 15:24     ` Ingo Molnar
  2007-01-28 15:34       ` Christoph Hellwig
  2007-01-31 19:12       ` Paul E. McKenney
  0 siblings, 2 replies; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 15:24 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 12:51:21PM +0100, Peter Zijlstra wrote:
> > This barrier thing is constructed so that it will not write in the 
> > sync() condition (the hot path) when there are no active lock 
> > sections; thus avoiding cacheline bouncing. -- I'm just not sure how 
> > this will work out in relation to PI. We might track those in the 
> > barrier scope and boost those by the max prio of the blockers.
> 
> Is this really needed?  We seem to grow new funky locking algorithms 
> exponentially, while people already have a hard time understanding the 
> existing ones.

yes, it's needed.

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 15:11   ` Christoph Hellwig
@ 2007-01-28 15:29     ` Peter Zijlstra
  2007-01-28 15:33       ` Christoph Hellwig
  2007-01-29 13:32     ` Stephen Smalley
  1 sibling, 1 reply; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 15:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Andrew Morton, linux-kernel, Ingo Molnar

On Sun, 2007-01-28 at 15:11 +0000, Christoph Hellwig wrote:

> > Even if this is becoming a real problem there must be simpler ways to fix
> > this than introducing various new locking primitives and all kinds of
> > complexity.
> 
> One good way to fix scalability without all this braindamage is
> to get rid of sb->s_files.  Current uses are:
> 
>  - fs/dquot.c:add_dquot_ref()
> 
> 	This performs it's actual operation on inodes.  We should
> 	be able to check inode->i_writecount to see which inodes
> 	need quota initialization.
> 
>  - fs/file_table.c:fs_may_remount_ro()
> 
> 	This one is gone in Dave Hansens per-mountpoint r/o patchkit
> 
>  - fs/proc/generic.c:proc_kill_inodes()
> 
> 	This can be done with a list inside procfs.
> 
>  - fs/super.c:mark_files_ro()
> 
> 	This one is only used for do_emergency_remount(), which is
> 	and utter hack.  It might be much better to just deny any
> 	kind of write access through a superblock flag here.
> 
>  - fs/selinuxfs.c:sel_remove_bools()
> 
> 	Utter madness.  I have no idea how this ever got merged.
> 	Maybe the selinux folks can explain what crack they were
> 	on when writing this.  The problem would go away with
> 	a generic rewoke infrastructure.
> 
> Once sb->s_files is gone we can also kill of fu_list entirely and
> replace it by a list head entirely in the tty code and make the lock
> for it per-tty.

I shall pursue this direction. Thanks for the information.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 15:21     ` Ingo Molnar
@ 2007-01-28 15:30       ` Christoph Hellwig
  2007-01-28 15:32         ` Peter Zijlstra
                           ` (2 more replies)
  0 siblings, 3 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel

On Sun, Jan 28, 2007 at 04:21:06PM +0100, Ingo Molnar wrote:
> > > sb->s_files is converted to a lock_list. furthermore to prevent the 
> > > lock_list_head of getting too contended with concurrent add 
> > > operations the add is buffered in per cpu filevecs.
> > 
> > NACK.  Please don't start using lockdep internals in core code.
> 
> what do you mean by that?

struct lock_list is a lockdep implementation detail and should not
leak out and be used anywhere else.   If we want something similar it
should be given a proper name and a header of its own, but I don't
think it's a valuable abstraction for the core kernel.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 15:30       ` Christoph Hellwig
@ 2007-01-28 15:32         ` Peter Zijlstra
  2007-01-28 15:36           ` Christoph Hellwig
  2007-01-28 15:44         ` Ingo Molnar
  2007-01-28 16:25         ` Bill Huey
  2 siblings, 1 reply; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-28 15:32 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ingo Molnar, Andrew Morton, linux-kernel

On Sun, 2007-01-28 at 15:30 +0000, Christoph Hellwig wrote:
> On Sun, Jan 28, 2007 at 04:21:06PM +0100, Ingo Molnar wrote:
> > > > sb->s_files is converted to a lock_list. furthermore to prevent the 
> > > > lock_list_head of getting too contended with concurrent add 
> > > > operations the add is buffered in per cpu filevecs.
> > > 
> > > NACK.  Please don't start using lockdep internals in core code.
> > 
> > what do you mean by that?
> 
> struct lock_list is an lockdep implementation detail and should not
> leak out and be used anywhere else.   If we want something similar it
> should be given a proper name and a header of it's own, but I don't
> think it's a valueable abstraction for the core kernel.

please see patch 2/7, its unrelated to lockdep internals.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 15:29     ` Peter Zijlstra
@ 2007-01-28 15:33       ` Christoph Hellwig
  0 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Hellwig, Andrew Morton, linux-kernel, Ingo Molnar

On Sun, Jan 28, 2007 at 04:29:04PM +0100, Peter Zijlstra wrote:
> I shall pursue this direction. Thanks for the information.

Btw, if you need this short-term there might be an even simpler
solution:

  - all files go on sb->s_list and stay there until the file is freed
  - ttys grow their private list instead of being mixed up with the
    per-sb list

This already allows nice splitting of the locks into per-sb and per-tty
(although all tty_files accesses might be protected by tty_mutex anyway).
Especially for the superblock an RCU conversion might be quite easy
if necessary, but all traversals are in what I'd consider more or less
slow paths anyway.

> 
---end quoted text---

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-28 15:24     ` Ingo Molnar
@ 2007-01-28 15:34       ` Christoph Hellwig
  2007-01-31 19:12       ` Paul E. McKenney
  1 sibling, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel

On Sun, Jan 28, 2007 at 04:24:35PM +0100, Ingo Molnar wrote:
> > > This barrier thing is constructed so that it will not write in the 
> > > sync() condition (the hot path) when there are no active lock 
> > > sections; thus avoiding cacheline bouncing. -- I'm just not sure how 
> > > this will work out in relation to PI. We might track those in the 
> > > barrier scope and boost those by the max prio of the blockers.
> > 
> > Is this really needed?  We seem to grow new funky locking algorithms 
> > exponentially, while people already have a hard time understanding the 
> > existing ones.
> 
> yes, it's needed.

Thanks for the wonderful and in-depth explanation </sarcasm>

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 15:32         ` Peter Zijlstra
@ 2007-01-28 15:36           ` Christoph Hellwig
  0 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 15:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Hellwig, Ingo Molnar, Andrew Morton, linux-kernel

On Sun, Jan 28, 2007 at 04:32:43PM +0100, Peter Zijlstra wrote:
> please see patch 2/7, its unrelated to lockdep internals.

I can't see any 2/7 on lkml yet..


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 15:30       ` Christoph Hellwig
  2007-01-28 15:32         ` Peter Zijlstra
@ 2007-01-28 15:44         ` Ingo Molnar
  2007-01-28 16:25         ` Bill Huey
  2 siblings, 0 replies; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 15:44 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel


* Christoph Hellwig <hch@infradead.org> wrote:

> > > NACK.  Please don't start using lockdep internals in core code.
> > 
> > what do you mean by that?
> 
> struct lock_list is an lockdep implementation detail and should not 
> leak out and be used anywhere else.  [...]

no, it is not. It is a new data type.

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
  2007-01-28 15:30       ` Christoph Hellwig
  2007-01-28 15:32         ` Peter Zijlstra
  2007-01-28 15:44         ` Ingo Molnar
@ 2007-01-28 16:25         ` Bill Huey
  2 siblings, 0 replies; 54+ messages in thread
From: Bill Huey @ 2007-01-28 16:25 UTC (permalink / raw)
  To: Christoph Hellwig, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	linux-kernel
  Cc: Bill Huey (hui)

On Sun, Jan 28, 2007 at 03:30:06PM +0000, Christoph Hellwig wrote:
> On Sun, Jan 28, 2007 at 04:21:06PM +0100, Ingo Molnar wrote:
> > > > sb->s_files is converted to a lock_list. furthermore to prevent the 
> > > > lock_list_head of getting too contended with concurrent add 
> > > > operations the add is buffered in per cpu filevecs.
> > > 
> > > NACK.  Please don't start using lockdep internals in core code.
> > 
> > what do you mean by that?
> 
> struct lock_list is an lockdep implementation detail and should not
> leak out and be used anywhere else.   If we want something similar it
> should be given a proper name and a header of it's own, but I don't
> think it's a valueable abstraction for the core kernel.

Christoph,

"lock list" has nothing to do with lockdep. It's a relatively new
data type used to construct concurrent linked lists using a spinlock
per entry.

bill


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 15:24   ` Ingo Molnar
@ 2007-01-28 16:52     ` Martin J. Bligh
  2007-01-28 17:04       ` lockmeter Christoph Hellwig
  0 siblings, 1 reply; 54+ messages in thread
From: Martin J. Bligh @ 2007-01-28 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel

Ingo Molnar wrote:
> * Christoph Hellwig <hch@infradead.org> wrote:
> 
>> On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
>>> This patch-set breaks up the global file_list_lock which was found 
>>> to be a severe contention point under basically any filesystem 
>>> intensive workload.
>> Benchmarks, please.  Where exactly do you see contention for this?
> 
> it's the most contended spinlock we have during a parallel kernel 
> compile on an 8-way system. But it's pretty common-sense as well, 
> without doing any measurements, it's basically the only global lock left 
> in just about every VFS workload that doesnt involve massive amount of 
> dentries created/removed (which is still dominated by the dcache_lock).
> 
>> filesystem intensive workload apparently means namespace operation 
>> heavy workload, right?  The biggest bottleneck I've seen with those is 
>> dcache lock.
> 
> the dcache lock is not a problem during kernel compiles. (its 
> rcu-ification works nicely in that workload)

Mmm, not wholly convinced that's true. Whilst I don't have lockmeter
stats to hand, the heavy time in __d_lookup seems to me to indicate we
may still have a problem. I guess we could move the spinlocks out
of line again to test this fairly easily (or get lockmeter upstream).

114076 total                                      0.0545
  57766 default_idle                             916.9206
  11869 prep_new_page                             49.4542
   3830 find_trylock_page                         67.1930
   2637 zap_pte_range                              3.9125
   2486 strnlen_user                              54.0435
   2018 do_page_fault                              1.1941
   1940 do_wp_page                                 1.6973
   1869 __d_lookup                                 7.7231
   1331 page_remove_rmap                           5.2196
   1287 do_no_page                                 1.6108
   1272 buffered_rmqueue                           4.6423
   1160 __copy_to_user_ll                         14.6835
   1027 _atomic_dec_and_lock                      11.1630
    655 release_pages                              1.9670
    644 do_path_lookup                             1.6304
    630 schedule                                   0.4046
    617 kunmap_atomic                              7.7125
    573 __handle_mm_fault                          0.7365
    548 free_hot_page                             78.2857
    500 __copy_user_intel                          3.3784
    483 copy_pte_range                             0.5941
    482 page_address                               2.9571
    478 file_move                                  9.1923
    441 do_anonymous_page                          0.7424
    429 filemap_nopage                             0.4450
    401 anon_vma_unlink                            4.8902

^ permalink raw reply	[flat|nested] 54+ messages in thread

* lockmeter
  2007-01-28 16:52     ` Martin J. Bligh
@ 2007-01-28 17:04       ` Christoph Hellwig
  2007-01-28 17:38         ` lockmeter Martin J. Bligh
  2007-01-29  1:08         ` lockmeter Arjan van de Ven
  0 siblings, 2 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 17:04 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Ingo Molnar, Christoph Hellwig, Peter Zijlstra, Andrew Morton,
	linux-kernel

On Sun, Jan 28, 2007 at 08:52:25AM -0800, Martin J. Bligh wrote:
> Mmm. not wholly convinced that's true. Whilst i don't have lockmeter
> stats to hand, the heavy time in __d_lookup seems to indicate we may
> still have a problem to me. I guess we could move the spinlocks out
> of line again to test this fairly easily (or get lockmeter upstream).

We definitely should get lockmeter in.  Does anyone volunteer to do
the cleanup and get it merged?


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-28 17:04       ` lockmeter Christoph Hellwig
@ 2007-01-28 17:38         ` Martin J. Bligh
  2007-01-28 18:01           ` lockmeter Bill Huey
  2007-01-29  1:08         ` lockmeter Arjan van de Ven
  1 sibling, 1 reply; 54+ messages in thread
From: Martin J. Bligh @ 2007-01-28 17:38 UTC (permalink / raw)
  To: Christoph Hellwig, Martin J. Bligh, Ingo Molnar, Peter Zijlstra,
	Andrew Morton, linux-kernel, Dipankar Sarma

Christoph Hellwig wrote:
> On Sun, Jan 28, 2007 at 08:52:25AM -0800, Martin J. Bligh wrote:
>> Mmm. not wholly convinced that's true. Whilst i don't have lockmeter
>> stats to hand, the heavy time in __d_lookup seems to indicate we may
>> still have a problem to me. I guess we could move the spinlocks out
>> of line again to test this fairly easily (or get lockmeter upstream).
> 
> We definitly should get lockmeter in.  Does anyone volunteer for doing
> the cleanup and merged?

On second thoughts .. I don't think it'd actually work for this since
the locks aren't global. Not that it shouldn't be done anyway, but ...

ISTR we still thought dcache scalability was a significant problem last
time anyone looked at it seriously - just never got fixed. Dipankar?

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-28 17:38         ` lockmeter Martin J. Bligh
@ 2007-01-28 18:01           ` Bill Huey
  2007-01-28 19:26             ` lockmeter Ingo Molnar
  2007-01-28 21:17             ` lockmeter Ingo Molnar
  0 siblings, 2 replies; 54+ messages in thread
From: Bill Huey @ 2007-01-28 18:01 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Christoph Hellwig, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	linux-kernel, Dipankar Sarma, Bill Huey (hui)

On Sun, Jan 28, 2007 at 09:38:16AM -0800, Martin J. Bligh wrote:
> Christoph Hellwig wrote:
> >On Sun, Jan 28, 2007 at 08:52:25AM -0800, Martin J. Bligh wrote:
> >>Mmm. not wholly convinced that's true. Whilst i don't have lockmeter
> >>stats to hand, the heavy time in __d_lookup seems to indicate we may
> >>still have a problem to me. I guess we could move the spinlocks out
> >>of line again to test this fairly easily (or get lockmeter upstream).
> >
> >We definitly should get lockmeter in.  Does anyone volunteer for doing
> >the cleanup and merged?
> 
> On second thoughts .. I don't think it'd actually work for this since
> the locks aren't global. Not that it shouldn't be done anyway, but ...
> 
> ISTR we still thought dcache scalability was a significant problem last
> time anyone looked at it seriously - just never got fixed. Dipankar?

My lock stat stuff shows dcache to be a problem under -rt as well. It
is keyed off the same mechanism as lockdep. It's pretty heavily hit
even under normal loads, relative to other kinds of lock overhead, even
for casual file operations on a 2x system. I can't imagine how lousy
it's going to be under real load on an 8x or higher machine.

However, this patch is -rt only, and spinlock times are meaningless
under it because of preemptibility.

bill


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
  2007-01-28 15:11   ` Christoph Hellwig
  2007-01-28 15:24   ` Ingo Molnar
@ 2007-01-28 18:41   ` Ingo Molnar
  2007-01-28 20:38     ` Christoph Hellwig
  2 siblings, 1 reply; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 18:41 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	linux-kernel


* Christoph Hellwig <hch@infradead.org> wrote:

> On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
> > This patch-set breaks up the global file_list_lock which was found to be a
> > severe contention point under basically any filesystem intensive workload.
> 
> Benchmarks, please.  Where exactly do you see contention for this?

the following very simple workload:

   http://redhat.com/~mingo/file-ops-test/file-ops-test.c

starts one process per CPU and open()s/close()s a file over and over 
again, simulating an open/close-intense workload. This pattern is quite 
typical of several key Linux applications.
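
(In rough shape the test does something like the following - this is a
reconstruction from the description above, not the actual
file-ops-test.c; CPU binding and the timing/reporting code are left
out:)

#include <fcntl.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define LOOPS 1000000

int main(void)
{
	long cpus = sysconf(_SC_NPROCESSORS_ONLN), n, i;

	for (n = 0; n < cpus; n++) {
		if (fork() == 0) {
			/* each worker just open()s/close()s repeatedly */
			for (i = 0; i < LOOPS; i++) {
				int fd = open("/tmp/testfile",
					      O_RDONLY | O_CREAT, 0600);
				if (fd < 0)
					exit(1);
				close(fd);
			}
			exit(0);
		}
	}
	for (n = 0; n < cpus; n++)
		wait(NULL);
	return 0;
}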

Using Peter's s_files patchset the following scalability improvement can 
be measured (lower numbers are better):

 ----------------------------------------------------------------------
                  v2.6.20-rc6     |   v2.6.20-rc6+Peter's s_files queue
 ----------------------------------------------------------------------
 dual-core:       2.11 usecs/op   |   1.51 usecs/op     (  +39.7% win )
 8-socket:        6.30 usecs/op   |   2.70 usecs/op     ( +133.3% win )

[ I'd expect a 64-way box to improve its open/close scalability 
  dramatically, a factor of 10x or 20x. On a 1024-CPU (or 4096-CPU) 
  system the effects are very likely even more dramatic. ]

Why does this patch make such a difference? Not because the critical 
section is in any way contended - it isn't, we only do a simple list 
operation there. But this lock is touched in every sys_open() and 
sys_close() system call, so it is a very frequently accessed cacheline. 
The cost is there due to the global cacheline ping-pong of files_lock. 
Furthermore, in this very important VFS codepath this is the 'last' 
global cacheline that got eliminated, hence all the scalability benefits 
(of previous improvements) get reaped all at once.

Now could you please tell me why I had to waste 3.5 hours on measuring 
and profiling this /again/, while a tiny little bit of goodwill from 
your side could have avoided this? I told you that we lock-profiled this 
under -rt, and that it's an accurate measurement of such things - as the 
numbers above prove. Would it have been so hard to say something 
like: "Cool Peter! That lock had been in our way of good open()/close() 
scalability for such a long time and it's an obviously good idea to 
eliminate it. Now here's a couple of suggestions of how to do it even 
simpler: [...]." Why did you have to, in essence, piss on his patchset? 
Any rational explanation?

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-28 18:01           ` lockmeter Bill Huey
@ 2007-01-28 19:26             ` Ingo Molnar
  2007-01-28 21:17             ` lockmeter Ingo Molnar
  1 sibling, 0 replies; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 19:26 UTC (permalink / raw)
  To: Bill Huey
  Cc: Martin J. Bligh, Christoph Hellwig, Peter Zijlstra,
	Andrew Morton, linux-kernel, Dipankar Sarma


* Bill Huey <billh@gnuppy.monkey.org> wrote:

> > ISTR we still thought dcache scalability was a significant problem 
> > last time anyone looked at it seriously - just never got fixed. 
> > Dipankar?
> 
> My lock stat stuff shows dcache to a be a problem under -rt as well. 
> [...]

yeah, it shows up frequently, but it's hard to fix.

for a well-cached kernel rebuild on a system with lots of RAM and lots 
of CPUs, files_lock is the leading lock (because the dcache is mostly 
populated already). files_lock is even more of a problem for workloads 
that do lots of open()/close() [Apache comes to mind].

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 18:41   ` [PATCH 0/7] breaking the global file_list_lock Ingo Molnar
@ 2007-01-28 20:38     ` Christoph Hellwig
  2007-01-28 21:05       ` Ingo Molnar
  0 siblings, 1 reply; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-28 20:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	linux-kernel

On Sun, Jan 28, 2007 at 07:41:16PM +0100, Ingo Molnar wrote:
> starts one process per CPU and open()s/close()s a file all over again, 
> simulating an open/close-intense workload. This pattern is quite typical 
> of several key Linux applications.
> 
> Using Peter's s_files patchset the following scalability improvement can 
> be measured (lower numbers are better):
> 
>  ----------------------------------------------------------------------
>                   v2.6.20-rc6     |   v2.6.20-rc6+Peter's s_files queue
>  ----------------------------------------------------------------------
>  dual-core:       2.11 usecs/op   |   1.51 usecs/op     (  +39.7% win )
>  8-socket:        6.30 usecs/op   |   2.70 usecs/op     ( +133.3% win )

Thanks for having some numbers to start with.

> Now could you please tell me why i had to waste 3.5 hours on measuring 
> and profiling this /again/, while a tiny little bit of goodwill from 
> your side could have avoided this? I told you that we lock-profiled this 
> under -rt, and that it's an accurate measurement of such things - as the 
> numbers above prove it too. Would it have been so hard to say something 
> like: "Cool Peter! That lock had been in our way of good open()/close() 
> scalability for such a long time and it's an obviously good idea to 
> eliminate it. Now here's a couple of suggestions of how to do it even 
> simpler: [...]." Why did you have to in essence piss on his patchset? 
> Any rational explanation?

Can we please stop this stupid pissing contest.  I'm totally fine to admit
yours is bigger than mine in public, so let's get back to the facts.


The patchkit we're discussing here introduces a lot of complexity:

 - a new type of implicitly locked linked lists
 - a new synchronization primitive
 - a new locking scheme that utilizes the previous two items, as well
   as RCU.

I think we definitely want some numbers (which you finally provided)
to justify this.

Then, going on to the implementation, I don't like trying to "fix" a
problem with this big-hammer approach.  I've outlined some alternative
ways that actually simplify both the underlying data structures and the
locking, and that should help with this problem instead of making the
code more complex and really hard to understand.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 20:38     ` Christoph Hellwig
@ 2007-01-28 21:05       ` Ingo Molnar
  0 siblings, 0 replies; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 21:05 UTC (permalink / raw)
  To: Christoph Hellwig, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	linux-kernel


* Christoph Hellwig <hch@infradead.org> wrote:

> Thanks for having some numbers to start with.

you are welcome.

> > Any rational explanation?
> 
> Can we please stop this stupid pissing contest.  I'm totally fine to 
> admit yours is bigger than mine in public, so let's get back to the 
> facts.

Sure. Could we begin with you not making derogatory, belittling comments 
like:

 <hch> so -rt folks also restart the lock brigade crap :P

and:

 * hch hands Peter Zijlstra the mister braindead locking madness award

but instead showing a little bit of... respect and goodwill towards the 
work of others? Those comments might sound funny to you, but they are 
offensive to me.

I'll readily admit that the patches are complex and even ugly, but the 
problem is complex and ugly to begin with. /Finally/ someone comes along 
and tries to sort out the mess that YOU (amongst others, like me) have 
not fixed for years, and you welcome his (hard!) efforts with ridicule?

You are also NAK-ing the same patches on lkml for clearly bogus reasons: 
for example, your 'this is lockdep internals' comment was off the mark 
by far. So this isn't just colorful comments; it is also actual 
prejudice that is irrational and harmful.

Linux will take 100 years to break into the mainstream if this attitude 
prevails. And I think this is a far bigger problem than pretty much 
/any/ technological problem we have at the moment, so I will continue to 
be upset about it, no matter how uncomfortable it is to finally face 
this and no matter how ugly such flames might look externally.

Or, if you really cannot talk about hard problems without insulting 
others, then /just don't do it/. Nobody is forcing you to do this if you 
cannot find any fun in it, really.

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-28 18:01           ` lockmeter Bill Huey
  2007-01-28 19:26             ` lockmeter Ingo Molnar
@ 2007-01-28 21:17             ` Ingo Molnar
  2007-01-29  5:27               ` lockmeter Bill Huey
  1 sibling, 1 reply; 54+ messages in thread
From: Ingo Molnar @ 2007-01-28 21:17 UTC (permalink / raw)
  To: Bill Huey
  Cc: Martin J. Bligh, Christoph Hellwig, Peter Zijlstra,
	Andrew Morton, linux-kernel, Dipankar Sarma


* Bill Huey <billh@gnuppy.monkey.org> wrote:

> My lock stat stuff shows dcache to a be a problem under -rt as well. 
> It is keyed off the same mechanism as lockdep. [...]

btw., while my plan is to prototype your lock-stat patch in -rt 
initially, it should be doable to extend it to be usable with the 
upstream kernel as well.

We can gather lock contention events when there is spinlock debugging 
enabled, from lib/spinlock_debug.c. For example __spin_lock_debug() does 
this:

static void __spin_lock_debug(spinlock_t *lock)
{
...
                for (i = 0; i < loops; i++) {
                        if (__raw_spin_trylock(&lock->raw_lock))
                                return;
                        __delay(1);
                }

it is where the __delay(1) call is done that we notice contention - and 
there you could drive the lock-stat code. Similarly, rwlocks have such 
natural points where you could insert a lock-stat callback without 
impacting performance (or the code) all that much. Mutexes and rwsems 
have natural contention points too (kernel/mutex.c:__mutex_lock_slowpath() 
and kernel/rwsem.c:rwsem_down_failed_common()), even with mutex debugging 
off.
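
(i.e. a contention hook could sit right next to that __delay(1), without 
touching the uncontended fastpath at all. Rough sketch - 
lock_stat_note_contention() is a made-up name standing in for whatever 
the lock-stat code records per event:)

static void __spin_lock_debug(spinlock_t *lock)
{
	u64 i;
	u64 loops = loops_per_jiffy * HZ;

	for (;;) {
		for (i = 0; i < loops; i++) {
			if (__raw_spin_trylock(&lock->raw_lock))
				return;
			/* we only get here when the lock is contended */
			lock_stat_note_contention(lock, _RET_IP_);
			__delay(1);
		}
		/* (the lockup warning of the original code goes here) */
	}
}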

for -rt the natural point to gather contention events is in 
kernel/rtmutex.c, as you are doing it currently.

finally, you can enable lockdep's initialization/etc. wrappers so that 
infrastructure between lockdep and lock-stat is shared, but you don't 
have to call into the lock-tracking code of lockdep.c if LOCK_STAT is 
enabled and PROVE_LOCKING is disabled. That should give you the lockdep 
infrastructure for LOCK_STAT, without the lockdep overhead.

all in all, one motivation behind my interest in your patch for -rt is 
that I think it's useful for upstream too, and that it can be merged 
with lockdep to a fair degree.

	Ingo

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-28 17:04       ` lockmeter Christoph Hellwig
  2007-01-28 17:38         ` lockmeter Martin J. Bligh
@ 2007-01-29  1:08         ` Arjan van de Ven
  2007-01-29  1:12           ` lockmeter Martin J. Bligh
  1 sibling, 1 reply; 54+ messages in thread
From: Arjan van de Ven @ 2007-01-29  1:08 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Martin J. Bligh, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	linux-kernel

On Sun, 2007-01-28 at 17:04 +0000, Christoph Hellwig wrote:
> On Sun, Jan 28, 2007 at 08:52:25AM -0800, Martin J. Bligh wrote:
> > Mmm. not wholly convinced that's true. Whilst i don't have lockmeter
> > stats to hand, the heavy time in __d_lookup seems to indicate we may
> > still have a problem to me. I guess we could move the spinlocks out
> > of line again to test this fairly easily (or get lockmeter upstream).
> 
> We definitly should get lockmeter in.  Does anyone volunteer for doing
> the cleanup and merged?

specifically: implementing it on top of lockdep should be very lean and
simple...



^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-29  1:08         ` lockmeter Arjan van de Ven
@ 2007-01-29  1:12           ` Martin J. Bligh
  0 siblings, 0 replies; 54+ messages in thread
From: Martin J. Bligh @ 2007-01-29  1:12 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Christoph Hellwig, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	linux-kernel, John Hawkes

Arjan van de Ven wrote:
> On Sun, 2007-01-28 at 17:04 +0000, Christoph Hellwig wrote:
>> On Sun, Jan 28, 2007 at 08:52:25AM -0800, Martin J. Bligh wrote:
>>> Mmm. not wholly convinced that's true. Whilst i don't have lockmeter
>>> stats to hand, the heavy time in __d_lookup seems to indicate we may
>>> still have a problem to me. I guess we could move the spinlocks out
>>> of line again to test this fairly easily (or get lockmeter upstream).
>> We definitly should get lockmeter in.  Does anyone volunteer for doing
>> the cleanup and merged?
> 
> specifically; implementing it on top of lockdep should be very lean and
> simple...

cc: John Hawkes, if we're going to discuss this ;-)

M.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-28 21:17             ` lockmeter Ingo Molnar
@ 2007-01-29  5:27               ` Bill Huey
  2007-01-29 10:26                 ` lockmeter Bill Huey
  0 siblings, 1 reply; 54+ messages in thread
From: Bill Huey @ 2007-01-29  5:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Martin J. Bligh, Christoph Hellwig, Peter Zijlstra,
	Andrew Morton, linux-kernel, Dipankar Sarma, Bill Huey (hui)

On Sun, Jan 28, 2007 at 10:17:05PM +0100, Ingo Molnar wrote:
> btw., while my plan is to prototype your lock-stat patch in -rt 
> initially, it should be doable to extend it to be usable with the 
> upstream kernel as well.
> 
> We can gather lock contention events when there is spinlock debugging 
> enabled, from lib/spinlock_debug.c. For example __spin_lock_debug() does 
> this:
> 
> static void __spin_lock_debug(spinlock_t *lock)
> {
> ...
>                 for (i = 0; i < loops; i++) {
>                         if (__raw_spin_trylock(&lock->raw_lock))
>                                 return;
>                         __delay(1);
>                 }
> 
> where the __delay(1) call is done do we notice contention - and there 
> you could drive the lock-stat code. Similarly, rwlocks have such natural 
> points too where you could insert a lock-stat callback without impacting 
> performance (or the code) all that much. mutexes and rwsems have natural 
> contention points too (kernel/mutex.c:__mutex_lock_slowpath() and 
> kernel/rwsem.c:rwsem_down_failed_common()), even with mutex debugging is 
> off.
> 
> for -rt the natural point to gather contention events is in 
> kernel/rtmutex.c, as you are doing it currently.
> 
> finally, you can enable lockdep's initialization/etc. wrappers so that 
> infrastructure between lockdep and lock-stat is shared, but you dont 
> have to call into the lock-tracking code of lockdep.c if LOCK_STAT is 
> enabled and PROVE_LOCKING is disabled. That should give you the lockdep 
> infrastructure for LOCK_STAT, without the lockdep overhead.
> 
> all in one, one motivation behind my interest in your patch for -rt is 
> that i think it's useful for upstream too, and that it can merge with 
> lockdep to a fair degree.

Fantastic. I'm going to try and finish up your suggested changes tonight
and get it to work with CONFIG_LOCK_STAT off. It's been challenging to
find time to do Linux these days, so I don't mind handing it off to you
after this point so that you can tweak it to your heart's content.

Yeah, one of the major motivations behind it was to see whether
Solaris-style locks are useful - to either validate or invalidate their
usefulness. Because of this patch, we have an idea of what's going on
with regard to adaptive spinning and such. The sensible conclusion is
that it's not the sophistication of the lock, but parallelizing the code
to prevent the contention in the first place, that's the key
philosophical drive.

I forward-merged it into rc6-rt2 and you can expect a drop tonight of
what I have, regardless of whether it's perfect or not.

bill


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: lockmeter
  2007-01-29  5:27               ` lockmeter Bill Huey
@ 2007-01-29 10:26                 ` Bill Huey
  0 siblings, 0 replies; 54+ messages in thread
From: Bill Huey @ 2007-01-29 10:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Martin J. Bligh, Christoph Hellwig, Peter Zijlstra,
	Andrew Morton, linux-kernel, Dipankar Sarma, Bill Huey (hui)

On Sun, Jan 28, 2007 at 09:27:45PM -0800, Bill Huey wrote:
> On Sun, Jan 28, 2007 at 10:17:05PM +0100, Ingo Molnar wrote:
> > btw., while my plan is to prototype your lock-stat patch in -rt 
> > initially, it should be doable to extend it to be usable with the 
> > upstream kernel as well.
...
> Fantastic. I'm going to try and finish up your suggested changes tonight
> and get it to work with CONFIG_LOCK_STAT off. It's been challenging to
> find time to do Linux these days, so I don't mind handing it off to you
> after this point so that you and tweek it to your heart's content.

Ingo,

Got it.
	http://mmlinux.sourceforge.net/public/patch-2.6.20-rc6-rt2.1.lock_stat.patch

This compiles and runs with the CONFIG_LOCK_STAT option turned off now,
and I believe it addresses all of the concerns you mentioned. I could
have missed a detail here and there, but I think it's in pretty good
shape now.

bill


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-28 15:11   ` Christoph Hellwig
  2007-01-28 15:29     ` Peter Zijlstra
@ 2007-01-29 13:32     ` Stephen Smalley
  2007-01-29 18:02       ` Christoph Hellwig
  1 sibling, 1 reply; 54+ messages in thread
From: Stephen Smalley @ 2007-01-29 13:32 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Peter Zijlstra, Andrew Morton, linux-kernel, Ingo Molnar,
	James Morris, Eric Paris

On Sun, 2007-01-28 at 15:11 +0000, Christoph Hellwig wrote:
> On Sun, Jan 28, 2007 at 02:43:25PM +0000, Christoph Hellwig wrote:
> > On Sun, Jan 28, 2007 at 12:51:18PM +0100, Peter Zijlstra wrote:
> > > This patch-set breaks up the global file_list_lock which was found to be a
> > > severe contention point under basically any filesystem intensive workload.
> > 
> > Benchmarks, please.  Where exactly do you see contention for this?
> > 
> > 
> > filesystem intensive workload apparently means namespace operation heavy
> > workload, right?  The biggest bottleneck I've seen with those is dcache lock.
> > 
> > Even if this is becoming a real problem there must be simpler ways to fix
> > this than introducing various new locking primitives and all kinds of
> > complexity.
> 
> One good way to fix scalability without all this braindamage is
> to get rid of sb->s_files.  Current uses are:
> 
>  - fs/dquot.c:add_dquot_ref()
> 
> 	This performs it's actual operation on inodes.  We should
> 	be able to check inode->i_writecount to see which inodes
> 	need quota initialization.
> 
>  - fs/file_table.c:fs_may_remount_ro()
> 
> 	This one is gone in Dave Hansens per-mountpoint r/o patchkit
> 
>  - fs/proc/generic.c:proc_kill_inodes()
> 
> 	This can be done with a list inside procfs.
> 
>  - fs/super.c:mark_files_ro()
> 
> 	This one is only used for do_emergency_remount(), which is
> 	and utter hack.  It might be much better to just deny any
> 	kind of write access through a superblock flag here.
> 
>  - fs/selinuxfs.c:sel_remove_bools()
> 
> 	Utter madness.  I have no idea how this ever got merged.
> 	Maybe the selinux folks can explain what crack they were
> 	on when writing this.  The problem would go away with
> 	a generic rewoke infrastructure.

It was modeled after proc_kill_inodes(), as noted in the comments.  So
if you have a suitable replacement for proc_kill_inodes(), you can apply
the same approach to sel_remove_bools().

> 
> Once sb->s_files is gone we can also kill of fu_list entirely and
> replace it by a list head entirely in the tty code and make the lock
> for it per-tty.

-- 
Stephen Smalley
National Security Agency


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/7] breaking the global file_list_lock
  2007-01-29 13:32     ` Stephen Smalley
@ 2007-01-29 18:02       ` Christoph Hellwig
  0 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2007-01-29 18:02 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel,
	Ingo Molnar, James Morris, Eric Paris

On Mon, Jan 29, 2007 at 08:32:53AM -0500, Stephen Smalley wrote:
> >  - fs/selinuxfs.c:sel_remove_bools()
> > 
> > 	Utter madness.  I have no idea how this ever got merged.
> > 	Maybe the selinux folks can explain what crack they were
> > 	on when writing this.  The problem would go away with
> > 	a generic rewoke infrastructure.
> 
> It was modeled after proc_kill_inodes(), as noted in the comments.  So
> if you have a suitable replacement for proc_kill_inodes(), you can apply
> the same approach to sel_remove_bools().

Looking at it again, sel_remove_bools() actually only operates on files
backed by selinuxfs, so yes, we could use the same approach.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-28 15:24     ` Ingo Molnar
  2007-01-28 15:34       ` Christoph Hellwig
@ 2007-01-31 19:12       ` Paul E. McKenney
  2007-01-31 21:13         ` Oleg Nesterov
  1 sibling, 1 reply; 54+ messages in thread
From: Paul E. McKenney @ 2007-01-31 19:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Peter Zijlstra, Andrew Morton, linux-kernel, oleg

On Sun, Jan 28, 2007 at 04:24:35PM +0100, Ingo Molnar wrote:
> 
> * Christoph Hellwig <hch@infradead.org> wrote:
> 
> > On Sun, Jan 28, 2007 at 12:51:21PM +0100, Peter Zijlstra wrote:
> > > This barrier thing is constructed so that it will not write in the 
> > > sync() condition (the hot path) when there are no active lock 
> > > sections; thus avoiding cacheline bouncing. -- I'm just not sure how 
> > > this will work out in relation to PI. We might track those in the 
> > > barrier scope and boost those by the max prio of the blockers.
> > 
> > Is this really needed?  We seem to grow new funky locking algorithms 
> > exponentially, while people already have a hard time understanding the 
> > existing ones.
> 
> yes, it's needed.

Would it be possible to come up with something common between this primitive
and the one that Oleg Nesterov put together for Jens Axboe?

	http://lkml.org/lkml/2006/11/29/330

Oleg's approach acquires a lock on the update side, which Peter would
not want in the uncontended case -- but perhaps there is some way to
make Oleg's approach be able to safely test both counters so as to
avoid acquiring the lock if there are no readers.

Oleg, any chance of this working?  I believe it does, but have not
thought it through fully.

If it does turn out that we cannot converge these, I believe that
Peter's implementation needs an smp_mb() at both the beginning
and the end of barrier_sync().  Without the first smp_mb(), the
test in barrier_sync() might precede the prior change, and without
the second smp_mb() the barrier_sync() might slide after the following
cleanup code.  (But I could easily be misunderstanding the code
using barrier_sync().)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-31 19:12       ` Paul E. McKenney
@ 2007-01-31 21:13         ` Oleg Nesterov
  2007-01-31 21:30           ` Oleg Nesterov
  2007-01-31 21:48           ` Peter Zijlstra
  0 siblings, 2 replies; 54+ messages in thread
From: Oleg Nesterov @ 2007-01-31 21:13 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Ingo Molnar, Christoph Hellwig, Peter Zijlstra, Andrew Morton,
	linux-kernel

On 01/31, Paul E. McKenney wrote:
>
> On Sun, Jan 28, 2007 at 04:24:35PM +0100, Ingo Molnar wrote:
> > 
> > * Christoph Hellwig <hch@infradead.org> wrote:
> > 
> > > On Sun, Jan 28, 2007 at 12:51:21PM +0100, Peter Zijlstra wrote:
> > > > This barrier thing is constructed so that it will not write in the 
> > > > sync() condition (the hot path) when there are no active lock 
> > > > sections; thus avoiding cacheline bouncing. -- I'm just not sure how 
> > > > this will work out in relation to PI. We might track those in the 
> > > > barrier scope and boost those by the max prio of the blockers.
> > > 
> > > Is this really needed?  We seem to grow new funky locking algorithms 
> > > exponentially, while people already have a hard time understanding the 
> > > existing ones.
> > 
> > yes, it's needed.
> 
> Would it be possible to come up with something common between this primitive
> and the one that Oleg Nesterov put together for Jens Axboe?
> 
> 	http://lkml.org/lkml/2006/11/29/330
> 
> Oleg's approach acquires a lock on the update side, which Peter would
> not want in the uncontended case -- but perhaps there is some way to
> make Oleg's approach be able to safely test both counters so as to
> avoid acquiring the lock if there are no readers.
> 
> Oleg, any chance of this working?  I believe it does, but have not
> thought it through fully.

I think not. From a quick reading, barrier_sync() and qrcu/srcu are
quite different. Consider:

barrier_lock()

				barrier_sync();

barrier_unlock();
				... wake up ...
							barrier_lock();

				schedule again

The last "schedule again" would be a BUG for qrcu/srcu, but probably
it is ok for barrier_sync(). It looks like barrier_sync() is more a
rw semaphore biased to readers.

A couple of minor off-topic notes,

+static inline void barrier_unlock(struct barrier *b)
+{
+       smp_wmb();
+       if (atomic_dec_and_test(&b->count))
+               __wake_up(&b->wait, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 0, b);

This is wake_up_all(&b->wait), yes? I don't understand why key == b; it could be NULL.
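
That is, assuming struct barrier is as in the posted patch, simply
(sketch):

static inline void barrier_unlock(struct barrier *b)
{
	smp_wmb();
	if (atomic_dec_and_test(&b->count))
		wake_up_all(&b->wait);
}

wake_up_all() already wakes both TASK_INTERRUPTIBLE and
TASK_UNINTERRUPTIBLE waiters and passes a NULL key.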

+static inline void barrier_sync(struct barrier *b)
+{
+       might_sleep();
+
+       if (unlikely(atomic_read(&b->count))) {
+               DEFINE_WAIT(wait);
+               prepare_to_wait(&b->wait, &wait, TASK_UNINTERRUPTIBLE);
+               while (atomic_read(&b->count))
+                       schedule();
+               finish_wait(&b->wait, &wait);
+       }
+}

This is meant to be an open-coded wait_event(), but it is wrong! With the
scenario above it can hang forever, because the first wake_up removes the
task from b->wait.

Oleg.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-31 21:13         ` Oleg Nesterov
@ 2007-01-31 21:30           ` Oleg Nesterov
  2007-01-31 21:48           ` Peter Zijlstra
  1 sibling, 0 replies; 54+ messages in thread
From: Oleg Nesterov @ 2007-01-31 21:30 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Ingo Molnar, Christoph Hellwig, Peter Zijlstra, Andrew Morton,
	linux-kernel

On 02/01, Oleg Nesterov wrote:
> 
> +static inline void barrier_sync(struct barrier *b)
> +{
> +       might_sleep();
> +
> +       if (unlikely(atomic_read(&b->count))) {
> +               DEFINE_WAIT(wait);
> +               prepare_to_wait(&b->wait, &wait, TASK_UNINTERRUPTIBLE);
> +               while (atomic_read(&b->count))
> +                       schedule();
> +               finish_wait(&b->wait, &wait);
> +       }
> +}
> 
> This should be open-coded wait_event(), but wrong! With the scenario above this
> can hang forever! because the first wake_up removes the task from the &b->wait.

I am afraid I was not clear (as usual :). prepare_to_wait means autoremove_wake_function.
So, if barrier_unlock() wakes up the caller of barrier_sync(), it will be removed
from b->wait. If it goes to schedule() because another barrier_lock() incremented
b->count, we can't wake it via __wake_up(&b->wait, ...).
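
A sketch of what I mean - just use wait_event(), which re-adds the task
to b->wait before every re-check of the condition, so the wakeup cannot
be lost (untested, against the struct barrier from the patch):

static inline void barrier_sync(struct barrier *b)
{
	might_sleep();
	wait_event(b->wait, !atomic_read(&b->count));
}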

Oleg.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-31 21:13         ` Oleg Nesterov
  2007-01-31 21:30           ` Oleg Nesterov
@ 2007-01-31 21:48           ` Peter Zijlstra
  2007-01-31 23:32             ` Paul E. McKenney
  1 sibling, 1 reply; 54+ messages in thread
From: Peter Zijlstra @ 2007-01-31 21:48 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Paul E. McKenney, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Thu, 2007-02-01 at 00:13 +0300, Oleg Nesterov wrote:
> On 01/31, Paul E. McKenney wrote:
> >
> > On Sun, Jan 28, 2007 at 04:24:35PM +0100, Ingo Molnar wrote:
> > > 
> > > * Christoph Hellwig <hch@infradead.org> wrote:
> > > 
> > > > On Sun, Jan 28, 2007 at 12:51:21PM +0100, Peter Zijlstra wrote:
> > > > > This barrier thing is constructed so that it will not write in the 
> > > > > sync() condition (the hot path) when there are no active lock 
> > > > > sections; thus avoiding cacheline bouncing. -- I'm just not sure how 
> > > > > this will work out in relation to PI. We might track those in the 
> > > > > barrier scope and boost those by the max prio of the blockers.
> > > > 
> > > > Is this really needed?  We seem to grow new funky locking algorithms 
> > > > exponentially, while people already have a hard time understanding the 
> > > > existing ones.
> > > 
> > > yes, it's needed.
> > 
> > Would it be possible to come up with something common between this primitive
> > and the one that Oleg Nesterov put together for Jens Axboe?
> > 
> > 	http://lkml.org/lkml/2006/11/29/330
> > 
> > Oleg's approach acquires a lock on the update side, which Peter would
> > not want in the uncontended case -- but perhaps there is some way to
> > make Oleg's approach be able to safely test both counters so as to
> > avoid acquiring the lock if there are no readers.
> > 
> > Oleg, any chance of this working?  I believe it does, but have not
> > thought it through fully.
> 
> I think no. From the quick reading, barrier_sync() and qrcu/srcu are
> quite different. Consider:
> 
> barrier_lock()
> 
> 				barrier_sync();
> 
> barrier_unlock();
> 				... wake up ...
> 							barrier_lock();
> 
> 				schedule again
> 
> The last "schedule again" would be a BUG for qrcu/srcu, but probably
> it is ok for barrier_sync().

Yes, that would be ok.

>  It looks like barrier_sync() is more a
> rw semaphore biased to readers.

Indeed, the locked sections are designed to be the rare case.

> A couple of minor off-topic notes,
> 
> +static inline void barrier_unlock(struct barrier *b)
> +{
> +       smp_wmb();
> +       if (atomic_dec_and_test(&b->count))
> +               __wake_up(&b->wait, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 0, b);
> 
> This is wake_up_all(&b->wait), yes? I don't undestans why key == b, it could be NULL.
> 
> +static inline void barrier_sync(struct barrier *b)
> +{
> +       might_sleep();
> +
> +       if (unlikely(atomic_read(&b->count))) {
> +               DEFINE_WAIT(wait);
> +               prepare_to_wait(&b->wait, &wait, TASK_UNINTERRUPTIBLE);
> +               while (atomic_read(&b->count))
> +                       schedule();
> +               finish_wait(&b->wait, &wait);
> +       }
> +}
> 
> This should be open-coded wait_event(), but wrong! With the scenario above this
> can hang forever! because the first wake_up removes the task from the &b->wait.

This would be me struggling with the waitqueue API; it's all a tad
confusing at first look.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-31 21:48           ` Peter Zijlstra
@ 2007-01-31 23:32             ` Paul E. McKenney
  2007-02-01  0:03               ` Peter Zijlstra
  0 siblings, 1 reply; 54+ messages in thread
From: Paul E. McKenney @ 2007-01-31 23:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Wed, Jan 31, 2007 at 10:48:21PM +0100, Peter Zijlstra wrote:
> On Thu, 2007-02-01 at 00:13 +0300, Oleg Nesterov wrote:
> > On 01/31, Paul E. McKenney wrote:
> > > On Sun, Jan 28, 2007 at 04:24:35PM +0100, Ingo Molnar wrote:
> > > > * Christoph Hellwig <hch@infradead.org> wrote:
> > > > > On Sun, Jan 28, 2007 at 12:51:21PM +0100, Peter Zijlstra wrote:
> > > > > > This barrier thing is constructed so that it will not write in the 
> > > > > > sync() condition (the hot path) when there are no active lock 
> > > > > > sections; thus avoiding cacheline bouncing. -- I'm just not sure how 
> > > > > > this will work out in relation to PI. We might track those in the 
> > > > > > barrier scope and boost those by the max prio of the blockers.
> > > > > 
> > > > > Is this really needed?  We seem to grow new funky locking algorithms 
> > > > > exponentially, while people already have a hard time understanding the 
> > > > > existing ones.
> > > > 
> > > > yes, it's needed.
> > > 
> > > Would it be possible to come up with something common between this primitive
> > > and the one that Oleg Nesterov put together for Jens Axboe?
> > > 
> > > 	http://lkml.org/lkml/2006/11/29/330
> > > 
> > > Oleg's approach acquires a lock on the update side, which Peter would
> > > not want in the uncontended case -- but perhaps there is some way to
> > > make Oleg's approach be able to safely test both counters so as to
> > > avoid acquiring the lock if there are no readers.
> > > 
> > > Oleg, any chance of this working?  I believe it does, but have not
> > > thought it through fully.
> > 
> > I think no. From the quick reading, barrier_sync() and qrcu/srcu are
> > quite different. Consider:
> > 
> > barrier_lock()
> > 
> > 				barrier_sync();
> > 
> > barrier_unlock();
> > 				... wake up ...
> > 							barrier_lock();
> > 
> > 				schedule again
> > 
> > The last "schedule again" would be a BUG for qrcu/srcu, but probably
> > it is ok for barrier_sync().
> 
> Yes, that would be ok.

The wakeup in barrier_sync() would mean that the counter was zero
at some point in the past.  The counter would then be rechecked, and
if it were still zero, barrier_sync() would invoke finish_wait() and
then return -- but the counter might well become non-zero in the
meantime, right?

So given that barrier_sync() is permitted to return after the counter
becomes non-zero, why can't it just rely on the fact that barrier_unlock()
saw it as zero not long in the past?

> >  It looks like barrier_sync() is more a
> > rw semaphore biased to readers.
> 
> Indeed, the locked sections are designed to be the rare case.

OK -- but barrier_sync() just waits for readers, it doesn't exclude them.

If all barrier_sync() needs to do is wait for all pre-existing
barrier_lock()/barrier_unlock() pairs to complete, it seems to me to
be compatible with qrcu's semantics.

So what am I missing here?

							Thanx, Paul

> > A couple of minor off-topic notes,
> > 
> > +static inline void barrier_unlock(struct barrier *b)
> > +{
> > +       smp_wmb();
> > +       if (atomic_dec_and_test(&b->count))
> > +               __wake_up(&b->wait, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 0, b);
> > 
> > This is wake_up_all(&b->wait), yes? I don't undestans why key == b, it could be NULL.
> > 
> > +static inline void barrier_sync(struct barrier *b)
> > +{
> > +       might_sleep();
> > +
> > +       if (unlikely(atomic_read(&b->count))) {
> > +               DEFINE_WAIT(wait);
> > +               prepare_to_wait(&b->wait, &wait, TASK_UNINTERRUPTIBLE);
> > +               while (atomic_read(&b->count))
> > +                       schedule();
> > +               finish_wait(&b->wait, &wait);
> > +       }
> > +}
> > 
> > This should be open-coded wait_event(), but wrong! With the scenario above this
> > can hang forever! because the first wake_up removes the task from the &b->wait.
> 
> This would be me struggling with the waitqueue API, its all a tad
> confusing at first look.
> 

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-01-31 23:32             ` Paul E. McKenney
@ 2007-02-01  0:03               ` Peter Zijlstra
  2007-02-01  0:48                 ` Paul E. McKenney
  0 siblings, 1 reply; 54+ messages in thread
From: Peter Zijlstra @ 2007-02-01  0:03 UTC (permalink / raw)
  To: paulmck
  Cc: Oleg Nesterov, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Wed, 2007-01-31 at 15:32 -0800, Paul E. McKenney wrote:

> The wakeup in barrier_sync() would mean that the counter was zero
> at some point in the past.  The counter would then be rechecked, and
> if it were still zero, barrier_sync() would invoke finish_wait() and
> then return -- but the counter might well become non-zero in the
> meantime, right?
> 
> So given that barrier_sync() is permitted to return after the counter
> becomes non-zero, why can't it just rely on the fact that barrier_unlock()
> saw it as zero not long in the past?
> 
> > >  It looks like barrier_sync() is more a
> > > rw semaphore biased to readers.
> > 
> > Indeed, the locked sections are designed to be the rare case.
> 
> OK -- but barrier_sync() just waits for readers, it doesn't exclude them.
> 
> If all barrier_sync() needs to do is to wait until all pre-existing
> barrier_lock()/barrier_unlock() pairs to complete, it seems to me to
> be compatible with qrcu's semantics.
> 
> So what am I missing here?

I might be the one missing stuff, I'll have a hard look at qrcu.

The intent was that barrier_sync() would not write to memory when there
are no active locked sections, so that the cacheline can stay shared,
thus keeping it fast.

If qrcu does exactly this, then yes we have a match.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-01  0:03               ` Peter Zijlstra
@ 2007-02-01  0:48                 ` Paul E. McKenney
  2007-02-01 16:00                   ` Oleg Nesterov
  2007-02-03 16:38                   ` Oleg Nesterov
  0 siblings, 2 replies; 54+ messages in thread
From: Paul E. McKenney @ 2007-02-01  0:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Thu, Feb 01, 2007 at 01:03:09AM +0100, Peter Zijlstra wrote:
> On Wed, 2007-01-31 at 15:32 -0800, Paul E. McKenney wrote:
> 
> > The wakeup in barrier_sync() would mean that the counter was zero
> > at some point in the past.  The counter would then be rechecked, and
> > if it were still zero, barrier_sync() would invoke finish_wait() and
> > then return -- but the counter might well become non-zero in the
> > meantime, right?
> > 
> > So given that barrier_sync() is permitted to return after the counter
> > becomes non-zero, why can't it just rely on the fact that barrier_unlock()
> > saw it as zero not long in the past?
> > 
> > > >  It looks like barrier_sync() is more a
> > > > rw semaphore biased to readers.
> > > 
> > > Indeed, the locked sections are designed to be the rare case.
> > 
> > OK -- but barrier_sync() just waits for readers, it doesn't exclude them.
> > 
> > If all barrier_sync() needs to do is to wait until all pre-existing
> > barrier_lock()/barrier_unlock() pairs to complete, it seems to me to
> > be compatible with qrcu's semantics.
> > 
> > So what am I missing here?
> 
> I might be the one missing stuff, I'll have a hard look at qrcu.
> 
> The intent was that barrier_sync() would not write to memory when there
> are no active locked sections, so that the cacheline can stay shared,
> thus keeping is fast.
> 
> If qrcu does exactly this, then yes we have a match.

QRCU as currently written (http://lkml.org/lkml/2006/11/29/330) doesn't
do what you want, as it acquires the lock unconditionally.  I am proposing
that synchronize_qrcu() change to something like the following:

	void synchronize_qrcu(struct qrcu_struct *qp)
	{
		int idx;
	
		smp_mb();
	
		if (atomic_read(qp->ctr[0]) + atomic_read(qp->ctr[1]) <= 1) {
			smp_rmb();
			if (atomic_read(qp->ctr[0]) +
			    atomic_read(qp->ctr[1]) <= 1)
				goto out;
		}
	
		mutex_lock(&qp->mutex);
		idx = qp->completed & 0x1;
		atomic_inc(qp->ctr + (idx ^ 0x1));
		/* Reduce the likelihood that qrcu_read_lock() will loop */
		smp_mb__after_atomic_inc();
		qp->completed++;
	
		atomic_dec(qp->ctr + idx);
		__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
		mutex_unlock(&qp->mutex);
	out:
		smp_mb();
	}

For the first "if" to give a false positive, a concurrent switch had
to have happened.  For example, qp->ctr[0] was zero and qp->ctr[1]
was two at the time of the first atomic_read(), but then qp->completed
switched so that both qp->ctr[0] and qp->ctr[1] were one at the time
of the second atomic_read.  The only way the second "if" can give us a
false positive is if there was another change to qp->completed in the
meantime -- but that means that all of the pre-existing qrcu_read_lock()
holders must have gotten done, otherwise the second switch could not
have happened.  Yes, you do incur three memory barriers on the fast
path, but the best you could hope for with your approach was two of them
(unless I am confused about how you were using barrier_sync()).

Oleg, does this look safe?

Ugly at best, I know, but I do very much sympathize with Christoph's
desire to keep the number of synchronization primitives down to a
dull roar.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-01  0:48                 ` Paul E. McKenney
@ 2007-02-01 16:00                   ` Oleg Nesterov
  2007-02-01 21:38                     ` Paul E. McKenney
  2007-02-03 16:38                   ` Oleg Nesterov
  1 sibling, 1 reply; 54+ messages in thread
From: Oleg Nesterov @ 2007-02-01 16:00 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On 01/31, Paul E. McKenney wrote:
> 
> QRCU as currently written (http://lkml.org/lkml/2006/11/29/330) doesn't
> do what you want, as it acquires the lock unconditionally.  I am proposing
> that synchronize_qrcu() change to something like the following:
> 
> 	void synchronize_qrcu(struct qrcu_struct *qp)
> 	{
> 		int idx;
> 	
> 		smp_mb();
> 	
> 		if (atomic_read(qp->ctr[0]) + atomic_read(qp->ctr[1]) <= 1) {
> 			smp_rmb();
> 			if (atomic_read(qp->ctr[0]) +
> 			    atomic_read(qp->ctr[1]) <= 1)
> 				goto out;
> 		}
> 	
> 		mutex_lock(&qp->mutex);
> 		idx = qp->completed & 0x1;
> 		atomic_inc(qp->ctr + (idx ^ 0x1));
> 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> 		smp_mb__after_atomic_inc();
> 		qp->completed++;
> 	
> 		atomic_dec(qp->ctr + idx);
> 		__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
> 		mutex_unlock(&qp->mutex);
> 	out:
> 		smp_mb();
> 	}
> 
> For the first "if" to give a false positive, a concurrent switch had
> to have happened.  For example, qp->ctr[0] was zero and qp->ctr[1]
> was two at the time of the first atomic_read(), but then qp->completed
> switched so that both qp->ctr[0] and qp->ctr[1] were one at the time
> of the second atomic_read.  The only way the second "if" can give us a
> false positive is if there was another change to qp->completed in the
> meantime -- but that means that all of the pre-existing qrcu_read_lock()
> holders must have gotten done, otherwise the second switch could not
> have happened.  Yes, you do incur three memory barriers on the fast
> path, but the best you could hope for with your approach was two of them
> (unless I am confused about how you were using barrier_sync()).

While doing qrcu, somehow I convinced myself we can't optimize out taking
qp->mutex. Now I think I was wrong. Good!

Q: you deleted the "if (atomic_read(qp->ctr + idx) == 1)" fastpath under
->mutex; was this needed for this optimization to work? I am asking
because I can't see how it makes any difference.

> Oleg, does this look safe?

Yes. But let me think more about this later, I've got a fever, and can't
think properly today :)

Oleg.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-01 16:00                   ` Oleg Nesterov
@ 2007-02-01 21:38                     ` Paul E. McKenney
  2007-02-02 11:56                       ` Oleg Nesterov
  0 siblings, 1 reply; 54+ messages in thread
From: Paul E. McKenney @ 2007-02-01 21:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Thu, Feb 01, 2007 at 07:00:10PM +0300, Oleg Nesterov wrote:
> On 01/31, Paul E. McKenney wrote:
> > 
> > QRCU as currently written (http://lkml.org/lkml/2006/11/29/330) doesn't
> > do what you want, as it acquires the lock unconditionally.  I am proposing
> > that synchronize_qrcu() change to something like the following:
> > 
> > 	void synchronize_qrcu(struct qrcu_struct *qp)
> > 	{
> > 		int idx;
> > 	
> > 		smp_mb();
> > 	
> > 		if (atomic_read(qp->ctr[0]) + atomic_read(qp->ctr[1]) <= 1) {
> > 			smp_rmb();
> > 			if (atomic_read(qp->ctr[0]) +
> > 			    atomic_read(qp->ctr[1]) <= 1)
> > 				goto out;
> > 		}
> > 	
> > 		mutex_lock(&qp->mutex);
> > 		idx = qp->completed & 0x1;
> > 		atomic_inc(qp->ctr + (idx ^ 0x1));
> > 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> > 		smp_mb__after_atomic_inc();
> > 		qp->completed++;
> > 	
> > 		atomic_dec(qp->ctr + idx);
> > 		__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
> > 		mutex_unlock(&qp->mutex);
> > 	out:
> > 		smp_mb();
> > 	}
> > 
> > For the first "if" to give a false positive, a concurrent switch had
> > to have happened.  For example, qp->ctr[0] was zero and qp->ctr[1]
> > was two at the time of the first atomic_read(), but then qp->completed
> > switched so that both qp->ctr[0] and qp->ctr[1] were one at the time
> > of the second atomic_read.  The only way the second "if" can give us a
> > false positive is if there was another change to qp->completed in the
> > meantime -- but that means that all of the pre-existing qrcu_read_lock()
> > holders must have gotten done, otherwise the second switch could not
> > have happened.  Yes, you do incur three memory barriers on the fast
> > path, but the best you could hope for with your approach was two of them
> > (unless I am confused about how you were using barrier_sync()).
> 
> While doing qrcu, somehow I convinced myself we can't optimize out taking
> qp->mutex. Now I think I was wrong. Good!

Me, I didn't want to worry about it unless someone needed it.  Which
it now appears they do.  ;-)

> Q: you deleted "if (atomic_read(qp->ctr + idx) == 1)" fastpath under ->mutex,
> was this needed for this optimization to work? I am asking because I can't
> understand how it can make any difference.

Before, we held the lock, so we could just check the single current
element.  Now we don't hold the lock, so we need to check both elements.
So I replaced the "if (atomic_read(qp->ctr + idx) == 1)" with the
nested "if" statements that test both elements.

> > Oleg, does this look safe?
> 
> Yes. But let me think more about this later, I've got a fever, and can't
> think properly today :)

Get well!!!  ;-)

And yes, the concurrency on the fastpath is nontrivial...

							Thanx, Paul

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-01 21:38                     ` Paul E. McKenney
@ 2007-02-02 11:56                       ` Oleg Nesterov
  2007-02-02 12:01                         ` Peter Zijlstra
  2007-02-02 17:13                         ` Paul E. McKenney
  0 siblings, 2 replies; 54+ messages in thread
From: Oleg Nesterov @ 2007-02-02 11:56 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On 02/01, Paul E. McKenney wrote:
> > > 
> > > 	void synchronize_qrcu(struct qrcu_struct *qp)
> > > 	{
> > > 		int idx;
> > > 	
> > > 		smp_mb();
> > > 	
> > > 		if (atomic_read(qp->ctr[0]) + atomic_read(qp->ctr[1]) <= 1) {
> > > 			smp_rmb();
> > > 			if (atomic_read(qp->ctr[0]) +
> > > 			    atomic_read(qp->ctr[1]) <= 1)
> > > 				goto out;
> > > 		}
> > > 	
> > > 		mutex_lock(&qp->mutex);
> > > 		idx = qp->completed & 0x1;
> > > 		atomic_inc(qp->ctr + (idx ^ 0x1));
> > > 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> > > 		smp_mb__after_atomic_inc();
> > > 		qp->completed++;
> > > 	
> > > 		atomic_dec(qp->ctr + idx);
> > > 		__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
> > > 		mutex_unlock(&qp->mutex);
> > > 	out:
> > > 		smp_mb();
> > > 	}
> > > 
> > > For the first "if" to give a false positive, a concurrent switch had
> > > to have happened.  For example, qp->ctr[0] was zero and qp->ctr[1]
> > > was two at the time of the first atomic_read(), but then qp->completed
> > > switched so that both qp->ctr[0] and qp->ctr[1] were one at the time
> > > of the second atomic_read.  The only way the second "if" can give us a
> > > false positive is if there was another change to qp->completed in the
> > > meantime -- but that means that all of the pre-existing qrcu_read_lock()
> > > holders must have gotten done, otherwise the second switch could not
> > > have happened.  Yes, you do incur three memory barriers on the fast
> > > path, but the best you could hope for with your approach was two of them
> > > (unless I am confused about how you were using barrier_sync()).

Yes. Without a synchronize_qrcu() in between, one of the counters should
be == 0 and the other >= 1; == 1 means we have no active readers. So a
false positive really means a concurrent switch. And we can check twice -
excellent idea!

> > While doing qrcu, somehow I convinced myself we can't optimize out taking
> > qp->mutex. Now I think I was wrong. Good!
> 
> Me, I didn't want to worry about it unless someone needed it.  Which
> it now appears they do.  ;-)

No. I do remember I tried hard to optimize out taking qp->mutex, but failed.
So I decided it is not possible. And now you show that I just don't have enough
brains! (of course, I hate you :)

> > Q: you deleted "if (atomic_read(qp->ctr + idx) == 1)" fastpath under ->mutex,
> > was this needed for this optimization to work? I am asking because I can't
> > understand how it can make any difference.
> 
> Before, we held the lock, so we could just check the single current
> element.  Now we don't hold the lock, so we need to check both elements.
> So I replaced the "if (atomic_read(qp->ctr + idx) == 1)" with the
> nested "if" statements that test both elements.

Ah, my question was different. The current version of qrcu does

	mutex_lock(&qp->mutex);

	idx = qp->completed & 0x1;
	if (atomic_read(qp->ctr + idx) == 1)	// fast path
		return;

	...

and it seems to me that we can retain this fastpath even with your optimization,
no? Surely, it is not so important, but it is nearly free.
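
Spelled out, what I have in mind is roughly this (an untested sketch
merging your lockless double-check with the existing under-mutex
fastpath; field names follow the code quoted above):

void synchronize_qrcu(struct qrcu_struct *qp)
{
	int idx;

	smp_mb();

	/* lockless fastpath: no readers on either counter */
	if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
		smp_rmb();
		if (atomic_read(&qp->ctr[0]) +
		    atomic_read(&qp->ctr[1]) <= 1)
			goto out;
	}

	mutex_lock(&qp->mutex);
	idx = qp->completed & 0x1;
	if (atomic_read(&qp->ctr[idx]) == 1)	/* fastpath under ->mutex */
		goto out_unlock;

	atomic_inc(&qp->ctr[idx ^ 0x1]);
	/* reduce the likelihood that qrcu_read_lock() will loop */
	smp_mb__after_atomic_inc();
	qp->completed++;

	atomic_dec(&qp->ctr[idx]);
	__wait_event(qp->wq, !atomic_read(&qp->ctr[idx]));
out_unlock:
	mutex_unlock(&qp->mutex);
out:
	smp_mb();
}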

Paul, could you make a patch? (I'll do rcutorture test tomorrow, but I only have
P-4 ht).

Peter, do you think you can use qrcu?

Oleg.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-02 11:56                       ` Oleg Nesterov
@ 2007-02-02 12:01                         ` Peter Zijlstra
  2007-02-02 17:13                         ` Paul E. McKenney
  1 sibling, 0 replies; 54+ messages in thread
From: Peter Zijlstra @ 2007-02-02 12:01 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Paul E. McKenney, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Fri, 2007-02-02 at 14:56 +0300, Oleg Nesterov wrote:
> On 02/01, Paul E. McKenney wrote:
> > > > 
> > > > 	void synchronize_qrcu(struct qrcu_struct *qp)
> > > > 	{
> > > > 		int idx;
> > > > 	
> > > > 		smp_mb();
> > > > 	
> > > > 		if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
> > > > 			smp_rmb();
> > > > 			if (atomic_read(&qp->ctr[0]) +
> > > > 			    atomic_read(&qp->ctr[1]) <= 1)
> > > > 				goto out;
> > > > 		}
> > > > 	
> > > > 		mutex_lock(&qp->mutex);
> > > > 		idx = qp->completed & 0x1;
> > > > 		atomic_inc(qp->ctr + (idx ^ 0x1));
> > > > 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> > > > 		smp_mb__after_atomic_inc();
> > > > 		qp->completed++;
> > > > 	
> > > > 		atomic_dec(qp->ctr + idx);
> > > > 		__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
> > > > 		mutex_unlock(&qp->mutex);
> > > > 	out:
> > > > 		smp_mb();
> > > > 	}
> > > > 
> > > > For the first "if" to give a false positive, a concurrent switch had
> > > > to have happened.  For example, qp->ctr[0] was zero and qp->ctr[1]
> > > > was two at the time of the first atomic_read(), but then qp->completed
> > > > switched so that both qp->ctr[0] and qp->ctr[1] were one at the time
> > > > of the second atomic_read.  The only way the second "if" can give us a
> > > > false positive is if there was another change to qp->completed in the
> > > > meantime -- but that means that all of the pre-existing qrcu_read_lock()
> > > > holders must have gotten done, otherwise the second switch could not
> > > > have happened.  Yes, you do incur three memory barriers on the fast
> > > > path, but the best you could hope for with your approach was two of them
> > > > (unless I am confused about how you were using barrier_sync()).
> 
> Yes. Without a synchronize_qrcu() in between, one of the counters should be == 0
> and the other >= 1; if the nonzero one is == 1, we have no active readers. So a
> false positive really does mean a concurrent switch. And we can check twice -
> excellent idea!
> 
> > > While doing qrcu, somehow I convinced myself we can't optimize out taking
> > > qp->mutex. Now I think I was wrong. Good!
> > 
> > Me, I didn't want to worry about it unless someone needed it.  Which
> > it now appears they do.  ;-)
> 
> No. I do remember I tried hard to optimize out taking qp->mutex, but failed.
> So I decided it is not possible. And now you show that I just don't have enough
> brains! (of course, I hate you :)
> 
> > > Q: you deleted "if (atomic_read(qp->ctr + idx) == 1)" fastpath under ->mutex,
> > > was this needed for this optimization to work? I am asking because I can't
> > > understand how it can make any difference.
> > 
> > Before, we held the lock, so we could just check the single current
> > element.  Now we don't hold the lock, so we need to check both elements.
> > So I replaced the "if (atomic_read(qp->ctr + idx) == 1)" with the
> > nested "if" statements that test both elements.
> 
> Ah, my question was different. The current version of qrcu does
> 
> 	mutex_lock(&qp->mutex);
> 
> 	idx = qp->completed & 0x1;
> 	if (atomic_read(qp->ctr + idx) == 1)	// fast path
> 		return;
> 
> 	...
> 
> and it seems to me that we can retain this fastpath even with your optimization,
> no? Surely, it is not so important, but it is nearly free.
> 
> Paul, could you make a patch? (I'll do rcutorture test tomorrow, but I only have
> P-4 ht).
> 
> Peter, do you think you can use qrcu?

Yes, this looks very much like what I need. Awesome work! 


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-02 11:56                       ` Oleg Nesterov
  2007-02-02 12:01                         ` Peter Zijlstra
@ 2007-02-02 17:13                         ` Paul E. McKenney
  1 sibling, 0 replies; 54+ messages in thread
From: Paul E. McKenney @ 2007-02-02 17:13 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel

On Fri, Feb 02, 2007 at 02:56:12PM +0300, Oleg Nesterov wrote:
> On 02/01, Paul E. McKenney wrote:
> > > > 
> > > > 	void synchronize_qrcu(struct qrcu_struct *qp)
> > > > 	{
> > > > 		int idx;
> > > > 	
> > > > 		smp_mb();
> > > > 	
> > > > 		if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
> > > > 			smp_rmb();
> > > > 			if (atomic_read(&qp->ctr[0]) +
> > > > 			    atomic_read(&qp->ctr[1]) <= 1)
> > > > 				goto out;
> > > > 		}
> > > > 	
> > > > 		mutex_lock(&qp->mutex);
> > > > 		idx = qp->completed & 0x1;
> > > > 		atomic_inc(qp->ctr + (idx ^ 0x1));
> > > > 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> > > > 		smp_mb__after_atomic_inc();
> > > > 		qp->completed++;
> > > > 	
> > > > 		atomic_dec(qp->ctr + idx);
> > > > 		__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
> > > > 		mutex_unlock(&qp->mutex);
> > > > 	out:
> > > > 		smp_mb();
> > > > 	}
> > > > 
> > > > For the first "if" to give a false positive, a concurrent switch had
> > > > to have happened.  For example, qp->ctr[0] was zero and qp->ctr[1]
> > > > was two at the time of the first atomic_read(), but then qp->completed
> > > > switched so that both qp->ctr[0] and qp->ctr[1] were one at the time
> > > > of the second atomic_read.  The only way the second "if" can give us a
> > > > false positive is if there was another change to qp->completed in the
> > > > meantime -- but that means that all of the pre-existing qrcu_read_lock()
> > > > holders must have gotten done, otherwise the second switch could not
> > > > have happened.  Yes, you do incur three memory barriers on the fast
> > > > path, but the best you could hope for with your approach was two of them
> > > > (unless I am confused about how you were using barrier_sync()).
> 
> Yes. Without a synchronize_qrcu() in between, one of the counters should be == 0
> and the other >= 1; if the nonzero one is == 1, we have no active readers. So a
> false positive really does mean a concurrent switch. And we can check twice -
> excellent idea!

Well, if it ends up really working.  ;-)

This one needs more than just testing -- I will put together a Promela
model that does a full state-space search for races.

I do have one fall-back, namely putting both counters into a single
aligned long.  But I would like to avoid this, since there might be
architectures out there that cannot cleanly store into half-longs.
Such architectures would have to use atomic ops.  :-/
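
For illustration, the fall-back Paul describes might look something like the
hypothetical layout below (shown with 16-bit halves in a 32-bit word for
concreteness; the union and helper names are invented here, this is not
proposed code):

	/* Both counters share one naturally aligned word, so the fastpath
	 * can sample the pair with a single load instead of two reads
	 * bracketing an smp_rmb(). */
	union qrcu_ctr_pair {
		struct {
			u16 c0;
			u16 c1;
		} half;
		u32 whole;
	};

	static int qrcu_counters_idle(union qrcu_ctr_pair *pair)
	{
		union qrcu_ctr_pair snap;

		snap.whole = *(volatile u32 *)&pair->whole;	/* one load, consistent pair */
		return snap.half.c0 + snap.half.c1 <= 1;
	}

The catch is on the update side: readers and the grace-period code would now
be storing into 16-bit halves, and an architecture without safe sub-word
stores would have to use full atomic ops (e.g. a cmpxchg on the whole word)
for those updates - the cost Paul flags above.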

> > > While doing qrcu, somehow I convinced myself we can't optimize out taking
> > > qp->mutex. Now I think I was wrong. Good!
> > 
> > Me, I didn't want to worry about it unless someone needed it.  Which
> > it now appears they do.  ;-)
> 
> No. I do remember I tried hard to optimize out taking qp->mutex, but failed.
> So I decided it is not possible. And now you show that I just don't have enough
> brains! (of course, I hate you :)

Coming from you, that is high praise indeed!!!  ;-)

Now if it really does work...

> > > Q: you deleted "if (atomic_read(qp->ctr + idx) == 1)" fastpath under ->mutex,
> > > was this needed for this optimization to work? I am asking because I can't
> > > understand how it can make any difference.
> > 
> > Before, we held the lock, so we could just check the single current
> > element.  Now we don't hold the lock, so we need to check both elements.
> > So I replaced the "if (atomic_read(qp->ctr + idx) == 1)" with the
> > nested "if" statements that test both elements.
> 
> Ah, my question was different. The current version of qrcu does
> 
> 	mutex_lock(&qp->mutex);
> 
> 	idx = qp->completed & 0x1;
> 	if (atomic_read(qp->ctr + idx) == 1)	// fast path
> 		return;
> 
> 	...
> 
> and it seems to me that we can retain this fastpath even with your optimization,
> no? Surely, it is not so important, but it is nearly free.

Ah!  This does make sense, excellent point!!!

> Paul, could you make a patch? (I'll do rcutorture test tomorrow, but I only have
> P-4 ht).

I will do the Promela model, and if that works, I will submit a patch.

						Thanx, Paul

> Peter, do you think you can use qrcu?
> 
> Oleg.
> 

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-01  0:48                 ` Paul E. McKenney
  2007-02-01 16:00                   ` Oleg Nesterov
@ 2007-02-03 16:38                   ` Oleg Nesterov
  2007-02-04  0:23                     ` Paul E. McKenney
  1 sibling, 1 reply; 54+ messages in thread
From: Oleg Nesterov @ 2007-02-03 16:38 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel, Alan Stern

On 01/31, Paul E. McKenney wrote:
>
> QRCU as currently written (http://lkml.org/lkml/2006/11/29/330) doesn't
> do what you want, as it acquires the lock unconditionally.  I am proposing
> that synchronize_qrcu() change to something like the following:
> 
> 	void synchronize_qrcu(struct qrcu_struct *qp)
> 	{
> 		int idx;
> 	
> 		smp_mb();
> 	
> 		if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
> 			smp_rmb();
> 			if (atomic_read(&qp->ctr[0]) +
> 			    atomic_read(&qp->ctr[1]) <= 1)
> 				goto out;
> 		}
> 	
> 		mutex_lock(&qp->mutex);
> 		idx = qp->completed & 0x1;
> 		atomic_inc(qp->ctr + (idx ^ 0x1));
> 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> 		smp_mb__after_atomic_inc();

I almost forgot. Currently this smp_mb__after_atomic_inc() is not strictly
needed, and the comment is correct. However, it becomes mandatory with your
optimization. Without this barrier, it is possible that both checks above
mutex_lock() will see the result of atomic_dec(), but not the atomic_inc().

So, may I ask you to also update this comment?

	/*
	 * Reduce the likelihood that qrcu_read_lock() will loop
	 *	AND
	 * make sure the second re-check above will see the result
	 * of atomic_inc() if it sees the result of atomic_dec()
	 */

Something like this, I hope you will make it better.
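
Folding that wording into the slowpath, the fragment around the barrier would
read roughly as follows (a sketch only; the code is the proposal quoted above
with the comment expanded as suggested):

	atomic_inc(&qp->ctr[idx ^ 0x1]);
	/*
	 * Reduce the likelihood that qrcu_read_lock() will loop,
	 *	AND
	 * make sure the lockless re-check in synchronize_qrcu() sees the
	 * result of this atomic_inc() if it sees the result of the
	 * atomic_dec() below.
	 */
	smp_mb__after_atomic_inc();
	qp->completed++;

	atomic_dec(&qp->ctr[idx]);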

And another note: this all assumes that STORE-MB-LOAD works "correctly", yes?
We have other code which relies on that, should not be a problem.
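
For concreteness, the STORE-MB-LOAD assumption is the classic store-buffering
pattern.  A minimal sketch of the guarantee being relied on, with illustrative
variable names (not code from any patch):

	int x, y;
	int r0, r1;

	void cpu0(void)
	{
		x = 1;		/* STORE */
		smp_mb();	/* MB    */
		r0 = y;		/* LOAD  */
	}

	void cpu1(void)
	{
		y = 1;		/* STORE */
		smp_mb();	/* MB    */
		r1 = x;		/* LOAD  */
	}

	/* Assumed guarantee: after both run on different CPUs, the outcome
	 * r0 == 0 && r1 == 0 is forbidden - at least one CPU must observe
	 * the other's store.  This is what the lockless double-check in
	 * synchronize_qrcu() leans on, as noted above. */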

(Alan Stern cc'ed).

Oleg.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-03 16:38                   ` Oleg Nesterov
@ 2007-02-04  0:23                     ` Paul E. McKenney
  2007-02-04  3:24                       ` Alan Stern
  0 siblings, 1 reply; 54+ messages in thread
From: Paul E. McKenney @ 2007-02-04  0:23 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Ingo Molnar, Christoph Hellwig, Andrew Morton,
	linux-kernel, Alan Stern

[-- Attachment #1: Type: text/plain, Size: 3316 bytes --]

On Sat, Feb 03, 2007 at 07:38:50PM +0300, Oleg Nesterov wrote:
> On 01/31, Paul E. McKenney wrote:
> >
> > QRCU as currently written (http://lkml.org/lkml/2006/11/29/330) doesn't
> > do what you want, as it acquires the lock unconditionally.  I am proposing
> > that synchronize_qrcu() change to something like the following:
> > 
> > 	void synchronize_qrcu(struct qrcu_struct *qp)
> > 	{
> > 		int idx;
> > 	
> > 		smp_mb();
> > 	
> > 		if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
> > 			smp_rmb();
> > 			if (atomic_read(&qp->ctr[0]) +
> > 			    atomic_read(&qp->ctr[1]) <= 1)
> > 				goto out;
> > 		}
> > 	
> > 		mutex_lock(&qp->mutex);
> > 		idx = qp->completed & 0x1;
> > 		atomic_inc(qp->ctr + (idx ^ 0x1));
> > 		/* Reduce the likelihood that qrcu_read_lock() will loop */
> > 		smp_mb__after_atomic_inc();
> 
> I almost forgot. Currently this smp_mb__after_atomic_inc() is not strictly
> needed, and the comment is correct. However, it becomes mandatory with your
> optimization. Without this barrier, it is possible that both checks above
> mutex_lock() will see the result of atomic_dec(), but not the atomic_inc().
> 
> So, may I ask you to also update this comment?
> 
> 	/*
> 	 * Reduce the likelihood that qrcu_read_lock() will loop
> 	 *	AND
> 	 * make sure the second re-check above will see the result
> 	 * of atomic_inc() if it sees the result of atomic_dec()
> 	 */
> 
> Something like this, I hope you will make it better.

Good catch!!!  I will make sure that this is reflected.

Also, validation is proceeding nicely, if extremely hoggishly.
The validation program and output thus far are attached in case someone
is interested.  The three-readers/three-updaters case has worked its
way up to 16% of the memory on a 48GB machine.  ;-)

If it overflows, I will see if I can get someone to convert it to
VHDL and run hardware validation tools on it.  This turned out to
be necessary for the -rt implementation of RCU...

> And another note: this all assumes that STORE-MB-LOAD works "correctly", yes?
> We have other code which relies on that, should not be a problem.

We have been working with Doug Lea of SUNY Oswego, Sebastian Burckhardt of
University of Pennsylvania, and Vijay Saraswat of IBM Research towards
a "universal memory model" that accommodates all machines.  Currently,
it does in fact handle store-mb-load the way we all want, thankfully!
Actually, the other guys are doing most of the formalism, my main role
has been programming a very stupid checker based on their formalisms
and yelling at them when something bad happens.

See the cccc directory in the x10 project on SourceForge if you want more
info, but be warned that the checker's UI really sucks.  Its input and
output formats are abominations that could only have been dreamed up by
someone who started out on punchcards and early-70s BASIC.  Not pretty,
but it does a good job of letting people know what I think the formalism
is saying!

There are some people working on a Prolog program called jmmsolve that
has a much nicer input format, but I need to prototype memory barriers
before they will incorporate them (currently, each statement acts as
if it had smp_mb() before and after it).  Also, their output is as yet
incomprehensible to me.

						Thanx, Paul

> (Alan Stern cc'ed).
> 
> Oleg.
> 

[-- Attachment #2: qrcuval.2007.02.03a.tgz --]
[-- Type: application/x-compressed-tar, Size: 7246 bytes --]

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-04  0:23                     ` Paul E. McKenney
@ 2007-02-04  3:24                       ` Alan Stern
  2007-02-04  5:46                         ` Paul E. McKenney
  0 siblings, 1 reply; 54+ messages in thread
From: Alan Stern @ 2007-02-04  3:24 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Christoph Hellwig,
	Andrew Morton, linux-kernel

On Sat, 3 Feb 2007, Paul E. McKenney wrote:

> > And another note: this all assumes that STORE-MB-LOAD works "correctly", yes?
> > We have other code which relies on that, should not be a problem.
> 
> We have been working with Doug Lea of SUNY Oswego, Sebastian Burckhardt of
> University of Pennsylvania, and Vijay Saraswat of IBM Research towards
> a "universal memory model" that accommodates all machines.  Currently,
> it does in fact handle store-mb-load the way we all want, thankfully!

We should add that many places in the kernel do depend on proper behavior 
for this data access pattern.  So whatever "universal memory model" we end 
up with, it had better handle the pattern correctly if Linux is to support 
it.

It's interesting to note, however, that this does exclude simple MESI.

Alan Stern


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] barrier: a scalable synchonisation barrier
  2007-02-04  3:24                       ` Alan Stern
@ 2007-02-04  5:46                         ` Paul E. McKenney
  0 siblings, 0 replies; 54+ messages in thread
From: Paul E. McKenney @ 2007-02-04  5:46 UTC (permalink / raw)
  To: Alan Stern
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Christoph Hellwig,
	Andrew Morton, linux-kernel

On Sat, Feb 03, 2007 at 10:24:04PM -0500, Alan Stern wrote:
> On Sat, 3 Feb 2007, Paul E. McKenney wrote:
> 
> > > And another note: this all assumes that STORE-MB-LOAD works "correctly", yes?
> > > We have other code which relies on that, should not be a problem.
> > 
> > We have been working with Doug Lea of SUNY Oswego, Sebastian Burckhardt of
> > University of Pennsylvania, and Vijay Saraswat of IBM Research towards
> > a "universal memory model" that accommodates all machines.  Currently,
> > it does in fact handle store-mb-load the way we all want, thankfully!
> 
> We should add that many places in the kernel do depend on proper behavior 
> for this data access pattern.  So whatever "universal memory model" we end 
> up with, it had better handle the pattern correctly if Linux is to support 
> it.

Agreed!

> It's interesting to note, however, that this does exclude simple MESI.

Yep!  And also a number of compiler optimizations, as it turns out.  ;-)

There is a tension between nice-to-software memory-barrier properties
on the one hand and easily understood code on the other.  But I guess
that this is true of pretty much any software tool.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 54+ messages in thread

end of thread, other threads:[~2007-02-04  5:46 UTC | newest]

Thread overview: 54+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
2007-01-28 11:51 ` [PATCH 1/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
2007-01-28 11:51 ` [PATCH 3/7] barrier: a scalable synchonisation barrier Peter Zijlstra
2007-01-28 14:39   ` Christoph Hellwig
2007-01-28 15:24     ` Ingo Molnar
2007-01-28 15:34       ` Christoph Hellwig
2007-01-31 19:12       ` Paul E. McKenney
2007-01-31 21:13         ` Oleg Nesterov
2007-01-31 21:30           ` Oleg Nesterov
2007-01-31 21:48           ` Peter Zijlstra
2007-01-31 23:32             ` Paul E. McKenney
2007-02-01  0:03               ` Peter Zijlstra
2007-02-01  0:48                 ` Paul E. McKenney
2007-02-01 16:00                   ` Oleg Nesterov
2007-02-01 21:38                     ` Paul E. McKenney
2007-02-02 11:56                       ` Oleg Nesterov
2007-02-02 12:01                         ` Peter Zijlstra
2007-02-02 17:13                         ` Paul E. McKenney
2007-02-03 16:38                   ` Oleg Nesterov
2007-02-04  0:23                     ` Paul E. McKenney
2007-02-04  3:24                       ` Alan Stern
2007-02-04  5:46                         ` Paul E. McKenney
2007-01-28 11:51 ` [PATCH 4/7] fs: break the file_list_lock for sb->s_files Peter Zijlstra
2007-01-28 14:43   ` Christoph Hellwig
2007-01-28 15:21     ` Ingo Molnar
2007-01-28 15:30       ` Christoph Hellwig
2007-01-28 15:32         ` Peter Zijlstra
2007-01-28 15:36           ` Christoph Hellwig
2007-01-28 15:44         ` Ingo Molnar
2007-01-28 16:25         ` Bill Huey
2007-01-28 11:51 ` [PATCH 5/7] fs: restore previous sb->s_files iteration semantics Peter Zijlstra
2007-01-28 11:51 ` [PATCH 6/7] schedule_on_each_cpu_wq() Peter Zijlstra
2007-01-28 11:51 ` [PATCH 7/7] fs: fixup filevec_add_drain_all Peter Zijlstra
2007-01-28 12:16 ` [PATCH 8/7] fs: free_write_pipe() fix Peter Zijlstra
2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
2007-01-28 15:11   ` Christoph Hellwig
2007-01-28 15:29     ` Peter Zijlstra
2007-01-28 15:33       ` Christoph Hellwig
2007-01-29 13:32     ` Stephen Smalley
2007-01-29 18:02       ` Christoph Hellwig
2007-01-28 15:24   ` Ingo Molnar
2007-01-28 16:52     ` Martin J. Bligh
2007-01-28 17:04       ` lockmeter Christoph Hellwig
2007-01-28 17:38         ` lockmeter Martin J. Bligh
2007-01-28 18:01           ` lockmeter Bill Huey
2007-01-28 19:26             ` lockmeter Ingo Molnar
2007-01-28 21:17             ` lockmeter Ingo Molnar
2007-01-29  5:27               ` lockmeter Bill Huey
2007-01-29 10:26                 ` lockmeter Bill Huey
2007-01-29  1:08         ` lockmeter Arjan van de Ven
2007-01-29  1:12           ` lockmeter Martin J. Bligh
2007-01-28 18:41   ` [PATCH 0/7] breaking the global file_list_lock Ingo Molnar
2007-01-28 20:38     ` Christoph Hellwig
2007-01-28 21:05       ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).