* [PATCH] cpumask: fix lg_lock/br_lock.
@ 2012-02-27 23:22 Rusty Russell
  2012-02-27 23:53   ` Andrew Morton
  0 siblings, 1 reply; 45+ messages in thread
From: Rusty Russell @ 2012-02-27 23:22 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Alexander Viro, Andrew Morton, Andi Kleen,
	Srivatsa S. Bhat, linux-fsdevel

Use a cpumask_var_t instead of cpumask_t.  We're doing plenty of
allocations here anyway, so it's not really an issue, and it sets a
good example.

(cpumask_t is obsolescent, as are the cpus_* functions).
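
A minimal sketch of the conversion pattern the patch applies, for readers
unfamiliar with the two APIs (the names example_cpus/example_init are
hypothetical, not part of the patch).  The key point is that with
CONFIG_CPUMASK_OFFSTACK=y a cpumask_var_t is a pointer that has to be
allocated, so initialisation gains a gfp_t argument and can now fail:

	#include <linux/cpumask.h>
	#include <linux/cpu.h>

	static cpumask_var_t example_cpus;	/* was: cpumask_t example_cpus; */

	static int example_init(gfp_t gfp)
	{
		int cpu;

		/* a pointer when CONFIG_CPUMASK_OFFSTACK=y, hence may fail */
		if (!alloc_cpumask_var(&example_cpus, gfp))
			return -ENOMEM;

		get_online_cpus();
		for_each_online_cpu(cpu)
			cpumask_set_cpu(cpu, example_cpus);	/* was: cpu_set() */
		put_online_cpus();

		return 0;
	}

	static void example_exit(void)
	{
		free_cpumask_var(example_cpus);	/* cpumask_t needed no free */
	}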

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/file_table.c        |    2 +-
 fs/namespace.c         |    4 +++-
 include/linux/lglock.h |   23 +++++++++++++----------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -526,6 +526,6 @@ void __init files_init(unsigned long mem
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
-	lg_lock_init(files_lglock);
+	lg_lock_init(files_lglock, GFP_KERNEL);
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2576,7 +2576,9 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
-	br_lock_init(vfsmount_lock);
+	err = br_lock_init(vfsmount_lock, GFP_KERNEL);
+	if (err)
+		panic("Failed to init vfsmount_lock: %d\n", err);
 
 	err = sysfs_init();
 	if (err)
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -25,7 +25,7 @@
 #include <linux/cpu.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)	name##_lock_init()
+#define br_lock_init(name, gfp)	name##_lock_init(gfp)
 #define br_read_lock(name)	name##_local_lock()
 #define br_read_unlock(name)	name##_local_unlock()
 #define br_write_lock(name)	name##_global_lock_online()
@@ -35,7 +35,7 @@
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
 
 
-#define lg_lock_init(name)	name##_lock_init()
+#define lg_lock_init(name, gfp)	name##_lock_init(gfp)
 #define lg_local_lock(name)	name##_local_lock()
 #define lg_local_unlock(name)	name##_local_unlock()
 #define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
@@ -61,7 +61,7 @@
 
 
 #define DECLARE_LGLOCK(name)						\
- extern void name##_lock_init(void);					\
+ extern int name##_lock_init(gfp_t gfp);				\
  extern void name##_local_lock(void);					\
  extern void name##_local_unlock(void);					\
  extern void name##_local_lock_cpu(int cpu);				\
@@ -74,7 +74,7 @@
 #define DEFINE_LGLOCK(name)						\
 									\
  DEFINE_SPINLOCK(name##_cpu_lock);					\
- cpumask_t name##_cpus __read_mostly;					\
+ cpumask_var_t name##_cpus __read_mostly;				\
  DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
  DEFINE_LGLOCK_LOCKDEP(name);						\
 									\
@@ -85,12 +85,12 @@
 	switch (action & ~CPU_TASKS_FROZEN) {				\
 	case CPU_UP_PREPARE:						\
 		spin_lock(&name##_cpu_lock);				\
-		cpu_set((unsigned long)hcpu, name##_cpus);		\
+		cpumask_set_cpu((unsigned long)hcpu, name##_cpus);		\
 		spin_unlock(&name##_cpu_lock);				\
 		break;							\
 	case CPU_UP_CANCELED: case CPU_DEAD:				\
 		spin_lock(&name##_cpu_lock);				\
-		cpu_clear((unsigned long)hcpu, name##_cpus);		\
+		cpumask_clear_cpu((unsigned long)hcpu, name##_cpus);		\
 		spin_unlock(&name##_cpu_lock);				\
 	}								\
 	return NOTIFY_OK;						\
@@ -98,7 +98,7 @@
  static struct notifier_block name##_lg_cpu_notifier = {		\
 	.notifier_call = name##_lg_cpu_callback,			\
  };									\
- void name##_lock_init(void) {						\
+ int name##_lock_init(gfp_t gfp) {						\
 	int i;								\
 	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
 	for_each_possible_cpu(i) {					\
@@ -107,10 +107,13 @@
 		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
 	}								\
 	register_hotcpu_notifier(&name##_lg_cpu_notifier);		\
+	if (!alloc_cpumask_var(&name##_cpus, gfp))			\
+		return -ENOMEM;						\
 	get_online_cpus();						\
 	for_each_online_cpu(i)						\
-		cpu_set(i, name##_cpus);				\
+		cpumask_set_cpu(i, name##_cpus);			\
 	put_online_cpus();						\
+	return 0;							\
  }									\
  EXPORT_SYMBOL(name##_lock_init);					\
 									\
@@ -154,7 +157,7 @@
 	int i;								\
 	spin_lock(&name##_cpu_lock);					\
 	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
+	for_each_cpu(i, name##_cpus) {					\
 		arch_spinlock_t *lock;					\
 		lock = &per_cpu(name##_lock, i);			\
 		arch_spin_lock(lock);					\
@@ -165,7 +168,7 @@
  void name##_global_unlock_online(void) {				\
 	int i;								\
 	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
+	for_each_cpu(i, name##_cpus) {					\
 		arch_spinlock_t *lock;					\
 		lock = &per_cpu(name##_lock, i);			\
 		arch_spin_unlock(lock);					\


* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-27 23:22 [PATCH] cpumask: fix lg_lock/br_lock Rusty Russell
@ 2012-02-27 23:53   ` Andrew Morton
  0 siblings, 0 replies; 45+ messages in thread
From: Andrew Morton @ 2012-02-27 23:53 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Nick Piggin, linux-kernel, Alexander Viro, Andi Kleen,
	Srivatsa S. Bhat, linux-fsdevel

On Tue, 28 Feb 2012 09:52:30 +1030
Rusty Russell <rusty@rustcorp.com.au> wrote:

> Use a cpumask_var_t instead of cpumask_t.  We're doing plenty of
> allocations here anyway, so it's not really an issue, and it sets a
> good example.
> 
> (cpumask_t is obsolescent, as are the cpus_* functions).

Congratulations to yourself and Andi:

patching file fs/file_table.c
Hunk #1 FAILED at 526.
1 out of 1 hunk FAILED -- saving rejects to file fs/file_table.c.rej
patching file fs/namespace.c
Hunk #1 FAILED at 2576.
1 out of 1 hunk FAILED -- saving rejects to file fs/namespace.c.rej
patching file include/linux/lglock.h
Hunk #1 FAILED at 25.
Hunk #2 FAILED at 35.
Hunk #3 FAILED at 61.
Hunk #4 FAILED at 74.
Hunk #5 FAILED at 85.
Hunk #6 FAILED at 98.
Hunk #7 FAILED at 107.
Hunk #8 FAILED at 154.
Hunk #9 FAILED at 165.
9 out of 9 hunks FAILED -- saving rejects to file include/linux/lglock.h.rej
Failed to apply cpumask-fix-lg_lock-br_lock

a clean sweep!

It's due to brlocks-lglocks-cleanups.patch, which I didn't think to cc
you on.  Thoughts?


From: Andi Kleen <ak@linux.intel.com>
Subject: brlocks/lglocks: cleanups

lglocks and brlocks are currently generated with some complicated macros
in lglock.h.  But there's no reason not to just use common utility
functions and put all the data into a common data structure.

Since there are at least two users it makes sense to share this code in a
library.  This is also easier to maintain than a macro forest.

This will also make it possible later to dynamically allocate lglocks and
to use them in modules (both would still need some additional, but now
straightforward, code).

In general the users now look more like normal function calls with
pointers, not magic macros.
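
As a rough illustration of what a user looks like after the conversion
(example_lglock and the example_* functions are hypothetical names, not
taken from the patch), lg_lock_init(), lg_local_lock() and friends now
take a struct lglock pointer instead of pasting the lock name into
generated functions:

	#include <linux/lglock.h>

	DEFINE_LGLOCK(example_lglock);

	static int example_init(void)
	{
		/* the name string is used for lockdep */
		lg_lock_init(&example_lglock, "example_lglock");
		return 0;
	}

	static void example_read_side(void)
	{
		lg_local_lock(&example_lglock);		/* this CPU's spinlock only */
		/* ... read-side critical section ... */
		lg_local_unlock(&example_lglock);
	}

	static void example_write_side(void)
	{
		lg_global_lock_online(&example_lglock);	/* every online CPU's spinlock */
		/* ... write-side critical section ... */
		lg_global_unlock_online(&example_lglock);
	}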

The patch is rather large because I move over all users in one go to keep
it bisectable.  This impacts the VFS somewhat in terms of lines changed,
but there is no actual behaviour change.

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 fs/dcache.c            |    4 
 fs/file_table.c        |   17 +--
 fs/internal.h          |    3 
 fs/namei.c             |   24 ++--
 fs/namespace.c         |  140 ++++++++++++++--------------
 fs/pnode.c             |    4 
 fs/proc_namespace.c    |    4 
 include/linux/lglock.h |  189 +++++++--------------------------------
 kernel/Makefile        |    2 
 kernel/lglock.c        |  136 ++++++++++++++++++++++++++++
 10 files changed, 269 insertions(+), 254 deletions(-)

diff -puN fs/dcache.c~brlocks-lglocks-cleanups fs/dcache.c
--- a/fs/dcache.c~brlocks-lglocks-cleanups
+++ a/fs/dcache.c
@@ -2442,7 +2442,7 @@ static int prepend_path(const struct pat
 	bool slash = false;
 	int error = 0;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -2473,7 +2473,7 @@ static int prepend_path(const struct pat
 		error = prepend(buffer, buflen, "/", 1);
 
 out:
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return error;
 
 global_root:
diff -puN fs/file_table.c~brlocks-lglocks-cleanups fs/file_table.c
--- a/fs/file_table.c~brlocks-lglocks-cleanups
+++ a/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-DECLARE_LGLOCK(files_lglock);
 DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
@@ -421,9 +420,9 @@ static inline void __file_sb_list_add(st
  */
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-	lg_local_lock(files_lglock);
+	lg_local_lock(&files_lglock);
 	__file_sb_list_add(file, sb);
-	lg_local_unlock(files_lglock);
+	lg_local_unlock(&files_lglock);
 }
 
 /**
@@ -436,9 +435,9 @@ void file_sb_list_add(struct file *file,
 void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+		lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
 		list_del_init(&file->f_u.fu_list);
-		lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+		lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
 	}
 }
 
@@ -485,7 +484,7 @@ void mark_files_ro(struct super_block *s
 	struct file *f;
 
 retry:
-	lg_global_lock(files_lglock);
+	lg_global_lock(&files_lglock);
 	do_file_list_for_each_entry(sb, f) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -502,12 +501,12 @@ retry:
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
 		/* This can sleep, so we can't hold the spinlock. */
-		lg_global_unlock(files_lglock);
+		lg_global_unlock(&files_lglock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
 	} while_file_list_for_each_entry;
-	lg_global_unlock(files_lglock);
+	lg_global_unlock(&files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
@@ -525,6 +524,6 @@ void __init files_init(unsigned long mem
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
-	lg_lock_init(files_lglock);
+	lg_lock_init(&files_lglock, "files_lglock");
 	percpu_counter_init(&nr_files, 0);
 } 
diff -puN fs/internal.h~brlocks-lglocks-cleanups fs/internal.h
--- a/fs/internal.h~brlocks-lglocks-cleanups
+++ a/fs/internal.h
@@ -56,8 +56,7 @@ extern int sb_prepare_remount_readonly(s
 
 extern void __init mnt_init(void);
 
-DECLARE_BRLOCK(vfsmount_lock);
-
+extern struct lglock vfsmount_lock;
 
 /*
  * fs_struct.c
diff -puN fs/namei.c~brlocks-lglocks-cleanups fs/namei.c
--- a/fs/namei.c~brlocks-lglocks-cleanups
+++ a/fs/namei.c
@@ -462,7 +462,7 @@ static int unlazy_walk(struct nameidata 
 	mntget(nd->path.mnt);
 
 	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
@@ -520,14 +520,14 @@ static int complete_walk(struct nameidat
 		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
 			spin_unlock(&dentry->d_lock);
 			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
+			br_read_unlock(&vfsmount_lock);
 			return -ECHILD;
 		}
 		BUG_ON(nd->inode != dentry->d_inode);
 		spin_unlock(&dentry->d_lock);
 		mntget(nd->path.mnt);
 		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -694,15 +694,15 @@ int follow_up(struct path *path)
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	parent = mnt->mnt_parent;
 	if (&parent->mnt == path->mnt) {
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -960,7 +960,7 @@ failed:
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
 	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return -ECHILD;
 }
 
@@ -1256,7 +1256,7 @@ static void terminate_walk(struct nameid
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 }
 
@@ -1490,7 +1490,7 @@ static int path_init(int dfd, const char
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 		} else {
@@ -1503,7 +1503,7 @@ static int path_init(int dfd, const char
 
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 			set_root_rcu(nd);
 		} else {
@@ -1516,7 +1516,7 @@ static int path_init(int dfd, const char
 			struct fs_struct *fs = current->fs;
 			unsigned seq;
 
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 
 			do {
@@ -1552,7 +1552,7 @@ static int path_init(int dfd, const char
 			if (fput_needed)
 				*fp = file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 		} else {
 			path_get(&file->f_path);
diff -puN fs/namespace.c~brlocks-lglocks-cleanups fs/namespace.c
--- a/fs/namespace.c~brlocks-lglocks-cleanups
+++ a/fs/namespace.c
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct moun
 {
 	int ret = 0;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct moun
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct mount *mnt)
 {
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct s
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct s
 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	return err;
 }
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path 
 {
 	struct mount *child_mnt;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
 	if (child_mnt) {
 		mnt_add_count(child_mnt, 1);
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return &child_mnt->mnt;
 	} else {
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return NULL;
 	}
 }
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *
 	mnt->mnt.mnt_sb = root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mo
 		mnt->mnt.mnt_root = dget(root);
 		mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 		mnt->mnt_parent = mnt;
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 
 		if (flag & CL_SLAVE) {
 			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct moun
 {
 put_again:
 #ifdef CONFIG_SMP
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (likely(atomic_read(&mnt->mnt_longterm))) {
 		mnt_add_count(mnt, -1);
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		return;
 	}
 #else
 	mnt_add_count(mnt, -1);
 	if (likely(mnt_get_count(mnt)))
 		return;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 #endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+
 	list_del(&mnt->mnt_instance);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	mntfree(mnt);
 }
 
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	real_mount(mnt)->mnt_pinned++;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	if (mnt->mnt_pinned) {
 		mnt_add_count(mnt, 1);
 		mnt->mnt_pinned--;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
 	BUG_ON(!m);
 
 	/* write lock needed for mnt_get_count */
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += mnt_get_count(p);
 		minimum_refs += 2;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	if (propagate_mount_busy(real_mount(mnt), 2))
 		ret = 0;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *he
 			struct dentry *dentry;
 			struct mount *m;
 
-			br_write_lock(vfsmount_lock);
+			br_write_lock(&vfsmount_lock);
 			dentry = mnt->mnt_mountpoint;
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 			mnt->mnt_parent = mnt;
 			m->mnt_ghosts--;
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 			dput(dentry);
 			mntput(&m->mnt);
 		}
@@ -1112,12 +1113,12 @@ static int do_umount(struct mount *mnt, 
 		 * probably don't strictly need the lock here if we examined
 		 * all race cases, but it's a slowpath.
 		 */
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		if (mnt_get_count(mnt) != 2) {
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 			return -EBUSY;
 		}
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 
 		if (!xchg(&mnt->mnt_expiry_mark, 1))
 			return -EAGAIN;
@@ -1159,7 +1160,7 @@ static int do_umount(struct mount *mnt, 
 	}
 
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1171,7 +1172,7 @@ static int do_umount(struct mount *mnt, 
 			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1286,19 +1287,19 @@ struct mount *copy_tree(struct mount *mn
 			q = clone_mnt(p, p->mnt.mnt_root, flag);
 			if (!q)
 				goto Enomem;
-			br_write_lock(vfsmount_lock);
+			br_write_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &path);
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		umount_tree(res, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 	return NULL;
@@ -1318,9 +1319,9 @@ void drop_collected_mounts(struct vfsmou
 {
 	LIST_HEAD(umount_list);
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(real_mount(mnt), 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
@@ -1448,7 +1449,7 @@ static int attach_recursive_mnt(struct m
 	if (err)
 		goto out_cleanup_ids;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1468,7 @@ static int attach_recursive_mnt(struct m
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	return 0;
 
@@ -1565,10 +1566,10 @@ static int do_change_type(struct path *p
 			goto out_unlock;
 	}
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
  out_unlock:
 	up_write(&namespace_sem);
@@ -1617,9 +1618,9 @@ static int do_loopback(struct path *path
 
 	err = graft_tree(mnt, path);
 	if (err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 out2:
 	unlock_mount(path);
@@ -1677,16 +1678,16 @@ static int do_remount(struct path *path,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		touch_mnt_namespace(mnt->mnt_ns);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 	return err;
 }
@@ -1893,9 +1894,9 @@ fail:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		down_write(&namespace_sem);
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		list_del_init(&mnt->mnt_expire);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		up_write(&namespace_sem);
 	}
 	mntput(m);
@@ -1911,11 +1912,11 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 }
 EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1936,7 @@ void mark_mounts_for_expiry(struct list_
 		return;
 
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1954,7 +1955,7 @@ void mark_mounts_for_expiry(struct list_
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1, &umounts);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 
 	release_mounts(&umounts);
@@ -2218,9 +2219,9 @@ void mnt_make_shortterm(struct vfsmount 
 	struct mount *mnt = real_mount(m);
 	if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
 		return;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	atomic_dec(&mnt->mnt_longterm);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 #endif
 }
 
@@ -2249,10 +2250,9 @@ static struct mnt_namespace *dup_mnt_ns(
 		kfree(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
-	new_ns->root = new;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new->mnt_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2416,9 @@ bool is_path_reachable(struct mount *mnt
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2505,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* make sure we can reach put_old from new_root */
 	if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
 		goto out4;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	/* mount old root on put_old */
@@ -2513,7 +2513,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* mount new_root on / */
 	attach_mnt(new_mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 out4:
@@ -2576,7 +2576,7 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
-	br_lock_init(vfsmount_lock);
+	br_lock_init(&vfsmount_lock);
 
 	err = sysfs_init();
 	if (err)
@@ -2596,9 +2596,9 @@ void put_mnt_ns(struct mnt_namespace *ns
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
diff -puN fs/pnode.c~brlocks-lglocks-cleanups fs/pnode.c
--- a/fs/pnode.c~brlocks-lglocks-cleanups
+++ a/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt
 		prev_src_mnt  = child;
 	}
 out:
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct mount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
diff -puN fs/proc_namespace.c~brlocks-lglocks-cleanups fs/proc_namespace.c
--- a/fs/proc_namespace.c~brlocks-lglocks-cleanups
+++ a/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file 
 
 	poll_wait(file, &p->ns->poll, wait);
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (p->m.poll_event != ns->event) {
 		p->m.poll_event = ns->event;
 		res |= POLLERR | POLLPRI;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
 	return res;
 }
diff -puN include/linux/lglock.h~brlocks-lglocks-cleanups include/linux/lglock.h
--- a/include/linux/lglock.h~brlocks-lglocks-cleanups
+++ a/include/linux/lglock.h
@@ -23,28 +23,17 @@
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)	name##_lock_init()
-#define br_read_lock(name)	name##_local_lock()
-#define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock_online()
-#define br_write_unlock(name)	name##_global_unlock_online()
+#define br_lock_init(name)	lg_lock_init(name, #name)
+#define br_read_lock(name)	lg_local_lock(name)
+#define br_read_unlock(name)	lg_local_unlock(name)
+#define br_write_lock(name)	lg_global_lock_online(name)
+#define br_write_unlock(name)	lg_global_unlock_online(name)
 
-#define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)	name##_lock_init()
-#define lg_local_lock(name)	name##_local_lock()
-#define lg_local_unlock(name)	name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)	name##_global_lock()
-#define lg_global_unlock(name)	name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
@@ -59,142 +48,34 @@
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
+struct lglock {
+	arch_spinlock_t __percpu *lock;
+	cpumask_t cpus;	/* XXX need to put on separate cacheline? */
+	spinlock_t cpu_lock;
+	struct notifier_block cpu_notifier;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key lock_key;
+	struct lockdep_map    lock_dep_map;
+#endif
+};
+
+#define DEFINE_LGLOCK(name) \
+	DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) = __ARCH_SPIN_LOCK_UNLOCKED; \
+	struct lglock name = { .lock = &name ## _lock, 				\
+			       .cpu_lock = __SPIN_LOCK_UNLOCKED(cpu_lock), 	\
+			       .cpu_notifier.notifier_call = lg_cpu_callback }
+
+/* Only valid for statics */
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock_online(struct lglock *lg);
+void lg_global_unlock_online(struct lglock *lg);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
+int lg_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu);
 
-#define DECLARE_LGLOCK(name)						\
- extern void name##_lock_init(void);					\
- extern void name##_local_lock(void);					\
- extern void name##_local_unlock(void);					\
- extern void name##_local_lock_cpu(int cpu);				\
- extern void name##_local_unlock_cpu(int cpu);				\
- extern void name##_global_lock(void);					\
- extern void name##_global_unlock(void);				\
- extern void name##_global_lock_online(void);				\
- extern void name##_global_unlock_online(void);				\
-
-#define DEFINE_LGLOCK(name)						\
-									\
- DEFINE_SPINLOCK(name##_cpu_lock);					\
- cpumask_t name##_cpus __read_mostly;					\
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
- DEFINE_LGLOCK_LOCKDEP(name);						\
-									\
- static int								\
- name##_lg_cpu_callback(struct notifier_block *nb,			\
-				unsigned long action, void *hcpu)	\
- {									\
-	switch (action & ~CPU_TASKS_FROZEN) {				\
-	case CPU_UP_PREPARE:						\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_set((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-		break;							\
-	case CPU_UP_CANCELED: case CPU_DEAD:				\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_clear((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-	}								\
-	return NOTIFY_OK;						\
- }									\
- static struct notifier_block name##_lg_cpu_notifier = {		\
-	.notifier_call = name##_lg_cpu_callback,			\
- };									\
- void name##_lock_init(void) {						\
-	int i;								\
-	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
-	}								\
-	register_hotcpu_notifier(&name##_lg_cpu_notifier);		\
-	get_online_cpus();						\
-	for_each_online_cpu(i)						\
-		cpu_set(i, name##_cpus);				\
-	put_online_cpus();						\
- }									\
- EXPORT_SYMBOL(name##_lock_init);					\
-									\
- void name##_local_lock(void) {						\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock);					\
-									\
- void name##_local_unlock(void) {					\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock);					\
-									\
- void name##_local_lock_cpu(int cpu) {					\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock_cpu);					\
-									\
- void name##_local_unlock_cpu(int cpu) {				\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock_cpu);				\
-									\
- void name##_global_lock_online(void) {					\
-	int i;								\
-	spin_lock(&name##_cpu_lock);					\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock_online);				\
-									\
- void name##_global_unlock_online(void) {				\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	spin_unlock(&name##_cpu_lock);					\
- }									\
- EXPORT_SYMBOL(name##_global_unlock_online);				\
-									\
- void name##_global_lock(void) {					\
-	int i;								\
-	preempt_disable();						\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock);					\
-									\
- void name##_global_unlock(void) {					\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_global_unlock);
 #endif
diff -puN kernel/Makefile~brlocks-lglocks-cleanups kernel/Makefile
--- a/kernel/Makefile~brlocks-lglocks-cleanups
+++ a/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o
+	    async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff -puN /dev/null kernel/lglock.c
--- /dev/null
+++ a/kernel/lglock.c
@@ -0,0 +1,136 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/* Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+	int i;
+
+	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+
+	register_hotcpu_notifier(&lg->cpu_notifier);
+	get_online_cpus();
+	for_each_online_cpu (i)
+		cpu_set(i, lg->cpus);
+	put_online_cpus();
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock_online(struct lglock *lg)
+{
+	int i;
+	spin_lock(&lg->cpu_lock);
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_cpu(i, &lg->cpus) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock_online);
+
+void lg_global_unlock_online(struct lglock *lg)
+{
+	int i;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_cpu(i, &lg->cpus) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	spin_unlock(&lg->cpu_lock);
+}
+EXPORT_SYMBOL(lg_global_unlock_online);
+
+void lg_global_lock(struct lglock *lg)
+{
+	int i;
+	preempt_disable();
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+	int i;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
+
+int lg_cpu_callback(struct notifier_block *nb,
+                              unsigned long action, void *hcpu)
+{
+	struct lglock *lglock = container_of(nb, struct lglock, cpu_notifier);
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		spin_lock(&lglock->cpu_lock);
+		cpu_set((unsigned long)hcpu, lglock->cpus);
+		spin_unlock(&lglock->cpu_lock);
+		break;
+	case CPU_UP_CANCELED: case CPU_DEAD:
+		spin_lock(&lglock->cpu_lock);
+		cpu_clear((unsigned long)hcpu, lglock->cpus);
+		spin_unlock(&lglock->cpu_lock);
+	}
+	return NOTIFY_OK;
+}
+EXPORT_SYMBOL(lg_cpu_callback);
+
_



 	/* make sure we can reach put_old from new_root */
 	if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
 		goto out4;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	/* mount old root on put_old */
@@ -2513,7 +2513,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* mount new_root on / */
 	attach_mnt(new_mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 out4:
@@ -2576,7 +2576,7 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
-	br_lock_init(vfsmount_lock);
+	br_lock_init(&vfsmount_lock);
 
 	err = sysfs_init();
 	if (err)
@@ -2596,9 +2596,9 @@ void put_mnt_ns(struct mnt_namespace *ns
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
diff -puN fs/pnode.c~brlocks-lglocks-cleanups fs/pnode.c
--- a/fs/pnode.c~brlocks-lglocks-cleanups
+++ a/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt
 		prev_src_mnt  = child;
 	}
 out:
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct mount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
diff -puN fs/proc_namespace.c~brlocks-lglocks-cleanups fs/proc_namespace.c
--- a/fs/proc_namespace.c~brlocks-lglocks-cleanups
+++ a/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file 
 
 	poll_wait(file, &p->ns->poll, wait);
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (p->m.poll_event != ns->event) {
 		p->m.poll_event = ns->event;
 		res |= POLLERR | POLLPRI;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
 	return res;
 }
diff -puN include/linux/lglock.h~brlocks-lglocks-cleanups include/linux/lglock.h
--- a/include/linux/lglock.h~brlocks-lglocks-cleanups
+++ a/include/linux/lglock.h
@@ -23,28 +23,17 @@
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)	name##_lock_init()
-#define br_read_lock(name)	name##_local_lock()
-#define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock_online()
-#define br_write_unlock(name)	name##_global_unlock_online()
+#define br_lock_init(name)	lg_lock_init(name, #name)
+#define br_read_lock(name)	lg_local_lock(name)
+#define br_read_unlock(name)	lg_local_unlock(name)
+#define br_write_lock(name)	lg_global_lock_online(name)
+#define br_write_unlock(name)	lg_global_unlock_online(name)
 
-#define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)	name##_lock_init()
-#define lg_local_lock(name)	name##_local_lock()
-#define lg_local_unlock(name)	name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)	name##_global_lock()
-#define lg_global_unlock(name)	name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
@@ -59,142 +48,34 @@
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
+struct lglock {
+	arch_spinlock_t __percpu *lock;
+	cpumask_t cpus;	/* XXX need to put on separate cacheline? */
+	spinlock_t cpu_lock;
+	struct notifier_block cpu_notifier;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key lock_key;
+	struct lockdep_map    lock_dep_map;
+#endif
+};
+
+#define DEFINE_LGLOCK(name) \
+	DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) = __ARCH_SPIN_LOCK_UNLOCKED; \
+	struct lglock name = { .lock = &name ## _lock, 				\
+			       .cpu_lock = __SPIN_LOCK_UNLOCKED(cpu_lock), 	\
+			       .cpu_notifier.notifier_call = lg_cpu_callback }
+
+/* Only valid for statics */
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock_online(struct lglock *lg);
+void lg_global_unlock_online(struct lglock *lg);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
+int lg_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu);
 
-#define DECLARE_LGLOCK(name)						\
- extern void name##_lock_init(void);					\
- extern void name##_local_lock(void);					\
- extern void name##_local_unlock(void);					\
- extern void name##_local_lock_cpu(int cpu);				\
- extern void name##_local_unlock_cpu(int cpu);				\
- extern void name##_global_lock(void);					\
- extern void name##_global_unlock(void);				\
- extern void name##_global_lock_online(void);				\
- extern void name##_global_unlock_online(void);				\
-
-#define DEFINE_LGLOCK(name)						\
-									\
- DEFINE_SPINLOCK(name##_cpu_lock);					\
- cpumask_t name##_cpus __read_mostly;					\
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
- DEFINE_LGLOCK_LOCKDEP(name);						\
-									\
- static int								\
- name##_lg_cpu_callback(struct notifier_block *nb,			\
-				unsigned long action, void *hcpu)	\
- {									\
-	switch (action & ~CPU_TASKS_FROZEN) {				\
-	case CPU_UP_PREPARE:						\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_set((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-		break;							\
-	case CPU_UP_CANCELED: case CPU_DEAD:				\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_clear((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-	}								\
-	return NOTIFY_OK;						\
- }									\
- static struct notifier_block name##_lg_cpu_notifier = {		\
-	.notifier_call = name##_lg_cpu_callback,			\
- };									\
- void name##_lock_init(void) {						\
-	int i;								\
-	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
-	}								\
-	register_hotcpu_notifier(&name##_lg_cpu_notifier);		\
-	get_online_cpus();						\
-	for_each_online_cpu(i)						\
-		cpu_set(i, name##_cpus);				\
-	put_online_cpus();						\
- }									\
- EXPORT_SYMBOL(name##_lock_init);					\
-									\
- void name##_local_lock(void) {						\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock);					\
-									\
- void name##_local_unlock(void) {					\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock);					\
-									\
- void name##_local_lock_cpu(int cpu) {					\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock_cpu);					\
-									\
- void name##_local_unlock_cpu(int cpu) {				\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock_cpu);				\
-									\
- void name##_global_lock_online(void) {					\
-	int i;								\
-	spin_lock(&name##_cpu_lock);					\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock_online);				\
-									\
- void name##_global_unlock_online(void) {				\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	spin_unlock(&name##_cpu_lock);					\
- }									\
- EXPORT_SYMBOL(name##_global_unlock_online);				\
-									\
- void name##_global_lock(void) {					\
-	int i;								\
-	preempt_disable();						\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock);					\
-									\
- void name##_global_unlock(void) {					\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_global_unlock);
 #endif
diff -puN kernel/Makefile~brlocks-lglocks-cleanups kernel/Makefile
--- a/kernel/Makefile~brlocks-lglocks-cleanups
+++ a/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o
+	    async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff -puN /dev/null kernel/lglock.c
--- /dev/null
+++ a/kernel/lglock.c
@@ -0,0 +1,136 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/* Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+	int i;
+
+	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+
+	register_hotcpu_notifier(&lg->cpu_notifier);
+	get_online_cpus();
+	for_each_online_cpu (i)
+		cpu_set(i, lg->cpus);
+	put_online_cpus();
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock_online(struct lglock *lg)
+{
+	int i;
+	spin_lock(&lg->cpu_lock);
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_cpu(i, &lg->cpus) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock_online);
+
+void lg_global_unlock_online(struct lglock *lg)
+{
+	int i;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_cpu(i, &lg->cpus) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	spin_unlock(&lg->cpu_lock);
+}
+EXPORT_SYMBOL(lg_global_unlock_online);
+
+void lg_global_lock(struct lglock *lg)
+{
+	int i;
+	preempt_disable();
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+	int i;
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
+
+int lg_cpu_callback(struct notifier_block *nb,
+                              unsigned long action, void *hcpu)
+{
+	struct lglock *lglock = container_of(nb, struct lglock, cpu_notifier);
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		spin_lock(&lglock->cpu_lock);
+		cpu_set((unsigned long)hcpu, lglock->cpus);
+		spin_unlock(&lglock->cpu_lock);
+		break;
+	case CPU_UP_CANCELED: case CPU_DEAD:
+		spin_lock(&lglock->cpu_lock);
+		cpu_clear((unsigned long)hcpu, lglock->cpus);
+		spin_unlock(&lglock->cpu_lock);
+	}
+	return NOTIFY_OK;
+}
+EXPORT_SYMBOL(lg_cpu_callback);
+
_

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-27 23:53   ` Andrew Morton
  (?)
@ 2012-02-28  8:43   ` Ingo Molnar
  2012-02-28 11:25     ` Andi Kleen
  2012-02-28 21:27     ` Andrew Morton
  -1 siblings, 2 replies; 45+ messages in thread
From: Ingo Molnar @ 2012-02-28  8:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, Nick Piggin, linux-kernel, Alexander Viro,
	Andi Kleen, Srivatsa S. Bhat, linux-fsdevel


* Andrew Morton <akpm@linux-foundation.org> wrote:

> From: Andi Kleen <ak@linux.intel.com>
> Subject: brlocks/lglocks: cleanups

This patch, while I agree with the goal of de-CPP-ing the 
lglocks code, violates my sense of taste and logic in numerous 
ways. If we touch this code it should be done right.

This patch should also probably go upstream through the 
locking/lockdep tree? Mind sending it us once you think it's 
ready?

> 
>  fs/dcache.c            |    4 
>  fs/file_table.c        |   17 +--
>  fs/internal.h          |    3 
>  fs/namei.c             |   24 ++--
>  fs/namespace.c         |  140 ++++++++++++++--------------
>  fs/pnode.c             |    4 
>  fs/proc_namespace.c    |    4 
>  include/linux/lglock.h |  189 +++++++--------------------------------
>  kernel/Makefile        |    2 
>  kernel/lglock.c        |  136 ++++++++++++++++++++++++++++
>  10 files changed, 269 insertions(+), 254 deletions(-)

It's absolutely crazy to do such a large patch that both 
uninlines the code and changes the API.

There is absolutely no good reason to do it like that - and the 
resulting merge pain on Andrew is a direct result of that: had 
it been kept separate it would be much easier and safer to 
update just the API change patch ...

Do those things in at least two patches, making the bigger API 
change patch plain and *OBVIOUS*.

> +struct lglock {
> +	arch_spinlock_t __percpu *lock;
> +	cpumask_t cpus;	/* XXX need to put on separate cacheline? */
> +	spinlock_t cpu_lock;
> +	struct notifier_block cpu_notifier;
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	struct lock_class_key lock_key;
> +	struct lockdep_map    lock_dep_map;
> +#endif
> +};

This is not a readable structure definition. Plenty of readable 
ones exist in the tree, they should be inspected for clues.

> +/* Only valid for statics */

what does this comment mean?

> +void lg_lock_init(struct lglock *lg, char *name);
> +void lg_local_lock(struct lglock *lg);
> +void lg_local_unlock(struct lglock *lg);
> +void lg_local_lock_cpu(struct lglock *lg, int cpu);
> +void lg_local_unlock_cpu(struct lglock *lg, int cpu);
> +void lg_global_lock_online(struct lglock *lg);
> +void lg_global_unlock_online(struct lglock *lg);
> +void lg_global_lock(struct lglock *lg);
> +void lg_global_unlock(struct lglock *lg);
> +
> +int lg_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu);

These APIs lack documentation.

> +/* Note there is no uninit, so lglocks cannot be defined in
> + * modules (but it's fine to use them from there)
> + * Could be added though, just undo lg_lock_init
> + */

I'm sure Andi knows what the standard shape of multi-line 
comments is in the kernel? Why has he still not learned?

> +void lg_local_lock(struct lglock *lg)
> +{
> +	arch_spinlock_t *lock;
> +	preempt_disable();

I'm sure Andi knows what's wrong here, he has been reviewed 
dozens of times for similar mistakes in the x86 trees.

> +void lg_local_unlock(struct lglock *lg)
> +{
> +	arch_spinlock_t *lock;
> +	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);

ditto.

> +void lg_local_lock_cpu(struct lglock *lg, int cpu)
> +{
> +	arch_spinlock_t *lock;
> +	preempt_disable();

ditto.

> +	arch_spinlock_t *lock;
> +	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);

ditto.

> +{
> +	int i;
> +	spin_lock(&lg->cpu_lock);

ditto.

> +{
> +	int i;
> +	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);

ditto.

> +{
> +	int i;
> +	preempt_disable();

ditto.

> +{
> +	int i;
> +	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);

ditto.

> +int lg_cpu_callback(struct notifier_block *nb,
> +                              unsigned long action, void *hcpu)

ugly linebreak, we can do better.


> +{
> +	struct lglock *lglock = container_of(nb, struct lglock, cpu_notifier);
> +	switch (action & ~CPU_TASKS_FROZEN) {

sigh.

Until these defects (and probably more, haven't done a full 
check) are fixed:

Nacked-by: Ingo Molnar <mingo@elte.hu>

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-27 23:53   ` Andrew Morton
  (?)
  (?)
@ 2012-02-28 11:24   ` Andi Kleen
  2012-03-05  7:02     ` Rusty Russell
                       ` (3 more replies)
  -1 siblings, 4 replies; 45+ messages in thread
From: Andi Kleen @ 2012-02-28 11:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, Nick Piggin, linux-kernel, Alexander Viro,
	Srivatsa S. Bhat, linux-fsdevel

On Mon, Feb 27, 2012 at 03:53:38PM -0800, Andrew Morton wrote:
> On Tue, 28 Feb 2012 09:52:30 +1030
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> 
> > Use a cpumask_var_t instead of cpumask_t.  We're doing plenty of
> > allocations here anyway, so it's not really an issue, and it sets a
> > good example.
> > 
> > (cpumask_t is obsolescent, as are the cpus_* functions).
> 
> Congratulations to yourself and Andi:

Well the only way to avoid that problem is to merge it ASAP.

The patch -- like most code movement patches -- is totally unsuitable for 
keeping around for months in trees.

Usually I've found the best way to do merges with code movement patches
is to edit the patches themselves if possible.

Maybe some day we'll get patch-like tools that can deal with this stuff
better.

Rusty: you should probably have cocci rules for this stuff, not manual
patches.

-Andi

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-28  8:43   ` Ingo Molnar
@ 2012-02-28 11:25     ` Andi Kleen
  2012-02-28 12:51       ` Ingo Molnar
  2012-02-28 21:27     ` Andrew Morton
  1 sibling, 1 reply; 45+ messages in thread
From: Andi Kleen @ 2012-02-28 11:25 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Srivatsa S. Bhat, linux-fsdevel


As far as I can tell from skimming that lengthy rant it's only about some 
white space.

So the patch must be perfect in substance!

-Andi

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-28 11:25     ` Andi Kleen
@ 2012-02-28 12:51       ` Ingo Molnar
  0 siblings, 0 replies; 45+ messages in thread
From: Ingo Molnar @ 2012-02-28 12:51 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Srivatsa S. Bhat, linux-fsdevel


* Andi Kleen <ak@linux.intel.com> wrote:

> As far as I can tell from skimming that lengthy rant it's only 
> about some white space.

Read it again.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-28  8:43   ` Ingo Molnar
  2012-02-28 11:25     ` Andi Kleen
@ 2012-02-28 21:27     ` Andrew Morton
  2012-02-29  5:44       ` Srivatsa S. Bhat
  2012-02-29  8:29       ` [PATCH] cpumask: fix lg_lock/br_lock Ingo Molnar
  1 sibling, 2 replies; 45+ messages in thread
From: Andrew Morton @ 2012-02-28 21:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Rusty Russell, Nick Piggin, linux-kernel, Alexander Viro,
	Andi Kleen, Srivatsa S. Bhat, linux-fsdevel

On Tue, 28 Feb 2012 09:43:59 +0100
Ingo Molnar <mingo@elte.hu> wrote:

> This patch should also probably go upstream through the 
> locking/lockdep tree? Mind sending it us once you think it's 
> ready?

Oh goody, that means you own
http://marc.info/?l=linux-kernel&m=131419353511653&w=2.

I do think the brlock code is sick, but Nick has turned into an ex-Nick
and nobody works on it.


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-28 21:27     ` Andrew Morton
@ 2012-02-29  5:44       ` Srivatsa S. Bhat
  2012-02-29  9:17         ` Ingo Molnar
  2012-02-29  8:29       ` [PATCH] cpumask: fix lg_lock/br_lock Ingo Molnar
  1 sibling, 1 reply; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-02-29  5:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel

Hi Andrew,

On 02/29/2012 02:57 AM, Andrew Morton wrote:

> On Tue, 28 Feb 2012 09:43:59 +0100
> Ingo Molnar <mingo@elte.hu> wrote:
> 
>> This patch should also probably go upstream through the 
>> locking/lockdep tree? Mind sending it us once you think it's 
>> ready?
> 
> Oh goody, that means you own
> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
>


That bug got fixed sometime around Dec 2011. See commit e30e2fdf
(VFS: Fix race between CPU hotplug and lglocks)

> I do think the brlock code is sick, but Nick has turned into an ex-Nick
> and nobody works on it.
> 

 
Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-28 21:27     ` Andrew Morton
  2012-02-29  5:44       ` Srivatsa S. Bhat
@ 2012-02-29  8:29       ` Ingo Molnar
  2012-02-29  8:58         ` Peter Zijlstra
  1 sibling, 1 reply; 45+ messages in thread
From: Ingo Molnar @ 2012-02-29  8:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, Nick Piggin, linux-kernel, Alexander Viro,
	Andi Kleen, Srivatsa S. Bhat, linux-fsdevel, Peter Zijlstra


* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Tue, 28 Feb 2012 09:43:59 +0100
> Ingo Molnar <mingo@elte.hu> wrote:
> 
> > This patch should also probably go upstream through the 
> > locking/lockdep tree? Mind sending it us once you think it's 
> > ready?
> 
> Oh goody, that means you own 
> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
>
> I do think the brlock code is sick, but Nick has turned into 
> an ex-Nick and nobody works on it.

The main problem, highlighted by the above soft lockup report, 
is that lglocks and brlocks go outside the regular spinlock 
facilities and thus avoid quite a bit of generic lock debugging 
for no good reason. Those patches should never have gone 
upstream in their incomplete form via the VFS tree.

As part of any cleanup they should first be converted from 
arch_spinlock_t to regular spinlock_t - I bet if that is done 
then that not only simplifies the wrappers massively, it also 
turns the above soft lockup report into a nice, actionable 
lockdep splat.
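
For illustration only, the read side of such a spinlock_t-based variant 
could look something like this (hypothetical type and function names, 
not from any posted patch):

	struct lglock_dbg {
		spinlock_t __percpu *lock;
	};

	void lg_dbg_local_lock(struct lglock_dbg *lg)
	{
		/* pin this CPU, then take its lock via a normal,
		 * lockdep-visible spin_lock() */
		preempt_disable();
		spin_lock(this_cpu_ptr(lg->lock));
	}

	void lg_dbg_local_unlock(struct lglock_dbg *lg)
	{
		spin_unlock(this_cpu_ptr(lg->lock));
		preempt_enable();
	}

The write side, which would have to take every per-CPU lock, is the 
harder part of such a conversion.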

Andi's deficient patch does not address that main shortcoming of 
brlocks/lglocks, it just addresses a symptom - while introducing 
a handful of new symptoms of suckage.

I'll review and process resubmitted patches from Andi if he 
wants to submit a proper, complete series - but there's a 
quality threshold and in this case I'd rather keep 1970's 
preprocessor code that sucks very visibly and possibly attracts 
someone with a clue than have a sloppy patch hiding the deeper 
design problems done by a clueless person who is also openly 
hostile towards the concept of producing quality code.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-29  8:29       ` [PATCH] cpumask: fix lg_lock/br_lock Ingo Molnar
@ 2012-02-29  8:58         ` Peter Zijlstra
  2012-02-29  9:32           ` Ingo Molnar
  0 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2012-02-29  8:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, Srivatsa S. Bhat, linux-fsdevel

On Wed, 2012-02-29 at 09:29 +0100, Ingo Molnar wrote:
> As part of any cleanup they should first be converted from 
> arch_spinlock_t to regular spinlock_t - I bet if that is done 
> then that not only simplifies the wrappers massively, it also 
> turns the above soft lockup report into a nice, actionable 
> lockdep splat. 

It might help if you'd actually read the code.. that will simply not
work.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-29  5:44       ` Srivatsa S. Bhat
@ 2012-02-29  9:17         ` Ingo Molnar
  2012-02-29 11:12           ` Srivatsa S. Bhat
  0 siblings, 1 reply; 45+ messages in thread
From: Ingo Molnar @ 2012-02-29  9:17 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel


* Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:

> Hi Andrew,
> 
> On 02/29/2012 02:57 AM, Andrew Morton wrote:
> 
> > On Tue, 28 Feb 2012 09:43:59 +0100
> > Ingo Molnar <mingo@elte.hu> wrote:
> > 
> >> This patch should also probably go upstream through the 
> >> locking/lockdep tree? Mind sending it us once you think it's 
> >> ready?
> > 
> > Oh goody, that means you own
> > http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
> >
> 
> 
> That bug got fixed sometime around Dec 2011. See commit e30e2fdf
> (VFS: Fix race between CPU hotplug and lglocks)

The lglocks code is still CPU-hotplug racy AFAICS, despite the 
->cpu_lock complication:

Consider a taken global lock on a CPU:

	CPU#1
	...
	br_write_lock(vfsmount_lock);

this takes the lock of all online CPUs: say CPU#1 and CPU#2. Now 
CPU#3 comes online and takes the read lock:

			CPU#3
			br_read_lock(vfsmount_lock);

This will succeed while the br_write_lock() is still active, 
because CPU#1 has only taken the locks of CPU#1 and CPU#2. 

Crash!

The proper fix would be for CPU-online to serialize with all 
known lglocks, via the notifier callback, i.e. to do something 
like this:

        case CPU_UP_PREPARE:                                            
		for_each_online_cpu(cpu) {
	                spin_lock(&name##_cpu_lock);                            
	                spin_unlock(&name##_cpu_lock);
		}
	...

I.e. in essence do:

        case CPU_UP_PREPARE:                                            
		name##_global_lock_online();
		name##_global_unlock_online();

Another detail I noticed, this bit:

        register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
        get_online_cpus();                                              \
        for_each_online_cpu(i)                                          \
                cpu_set(i, name##_cpus);                                \
        put_online_cpus();                                              \

could be something simpler and loop-less, like:

        get_online_cpus();
	cpumask_copy(name##_cpus, cpu_online_mask);
	register_hotcpu_notifier(&name##_lg_cpu_notifier);
	put_online_cpus();

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-29  8:58         ` Peter Zijlstra
@ 2012-02-29  9:32           ` Ingo Molnar
  0 siblings, 0 replies; 45+ messages in thread
From: Ingo Molnar @ 2012-02-29  9:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, Srivatsa S. Bhat, linux-fsdevel


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Wed, 2012-02-29 at 09:29 +0100, Ingo Molnar wrote:
>
> > As part of any cleanup they should first be converted from 
> > arch_spinlock_t to regular spinlock_t - I bet if that is 
> > done then that not only simplifies the wrappers massively, 
> > it also turns the above soft lockup report into a nice, 
> > actionable lockdep splat.
> 
> It might help if you'd actually read the code.. that will 
> simply not work.

It cannot find all bugs - such as the CPU hotplug race that is 
still present in the code.

Still there's no excuse to go outside regular spinlock debug 
primitives via arch_spinlock_t.

If lockdep blows up in br_write_lock() due to holding up to 4096 
individual locks then we should add the exceptions to this 
particular write lock when the CPU count is too high - but:

 - do not disable the checking on saner configs

 - do not disable all the *OTHER* lock debugging checks such as:

     - spin-lockup detection [this works even without ->held_locks]

     - allocate/free failure detection:

         The percpu code could be extended to run the equivalent 
         of debug_check_no_locks_freed() over the percpu area 
         that is going away, to make sure no held locks are 
         freed.

etc.
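
For that last item, a rough sketch of the kind of check meant here could 
be (illustrative only - the helper name and its call site in the percpu 
allocator are assumptions; debug_check_no_locks_freed() is the existing 
lockdep helper):

	static void pcpu_check_no_locks_freed(void __percpu *ptr, size_t size)
	{
		int cpu;

		/* before the per-cpu area is returned to the allocator,
		 * make sure no currently held lock lives inside it */
		for_each_possible_cpu(cpu)
			debug_check_no_locks_freed(per_cpu_ptr(ptr, cpu), size);
	}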

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-29  9:17         ` Ingo Molnar
@ 2012-02-29 11:12           ` Srivatsa S. Bhat
  2012-03-01  7:38             ` Ingo Molnar
  2012-03-01  8:12               ` Srivatsa S. Bhat
  0 siblings, 2 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-02-29 11:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven

On 02/29/2012 02:47 PM, Ingo Molnar wrote:

> 
> * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
> 
>> Hi Andrew,
>>
>> On 02/29/2012 02:57 AM, Andrew Morton wrote:
>>
>>> On Tue, 28 Feb 2012 09:43:59 +0100
>>> Ingo Molnar <mingo@elte.hu> wrote:
>>>
>>>> This patch should also probably go upstream through the 
>>>> locking/lockdep tree? Mind sending it us once you think it's 
>>>> ready?
>>>
>>> Oh goody, that means you own
>>> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
>>>
>>
>>
>> That bug got fixed sometime around Dec 2011. See commit e30e2fdf
>> (VFS: Fix race between CPU hotplug and lglocks)
> 
> The lglocks code is still CPU-hotplug racy AFAICS, despite the 
> ->cpu_lock complication:
> 
> Consider a taken global lock on a CPU:
> 
> 	CPU#1
> 	...
> 	br_write_lock(vfsmount_lock);
> 
> this takes the lock of all online CPUs: say CPU#1 and CPU#2. Now 
> CPU#3 comes online and takes the read lock:


CPU#3 cannot come online! :-)

No new CPU can come online until that corresponding br_write_unlock()
is completed. That is because  br_write_lock acquires &name##_cpu_lock
and only br_write_unlock will release it.
And, CPU_UP_PREPARE callback tries to acquire that very same spinlock,
and hence will keep spinning until br_write_unlock() is run. And hence,
the CPU#3 or any new CPU online for that matter will not complete until
br_write_unlock() is done.

It is of course debatable as to how good this design really is, but IMHO,
the lglocks code is not CPU-hotplug racy now..
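
In outline, the interlock introduced by commit e30e2fdf looks like this
(condensed from the DEFINE_LGLOCK() code, lockdep annotations omitted):

	void name##_global_lock_online(void) {
		int i;
		spin_lock(&name##_cpu_lock);	/* taken here ...          */
		for_each_cpu(i, &name##_cpus)
			arch_spin_lock(&per_cpu(name##_lock, i));
	}

	void name##_global_unlock_online(void) {
		int i;
		for_each_cpu(i, &name##_cpus)
			arch_spin_unlock(&per_cpu(name##_lock, i));
		spin_unlock(&name##_cpu_lock);	/* ... released only here */
	}

	case CPU_UP_PREPARE:
		/* spins for the whole br_write_lock()..unlock() section */
		spin_lock(&name##_cpu_lock);
		cpu_set((unsigned long)hcpu, name##_cpus);
		spin_unlock(&name##_cpu_lock);
		break;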

Here is the link to the original discussion during the development of
that patch: thread.gmane.org/gmane.linux.file-systems/59750/

> 
> 			CPU#3
> 			br_read_lock(vfsmount_lock);
> 
> This will succeed while the br_write_lock() is still active, 
> because CPU#1 has only taken the locks of CPU#1 and CPU#2. 
> 
> Crash!
> 
> The proper fix would be for CPU-online to serialize with all 
> known lglocks, via the notifier callback, i.e. to do something 
> like this:
> 
>         case CPU_UP_PREPARE:                                            
> 		for_each_online_cpu(cpu) {
> 	                spin_lock(&name##_cpu_lock);                            
> 	                spin_unlock(&name##_cpu_lock);
> 		}
> 	...
> 
> I.e. in essence do:
> 
>         case CPU_UP_PREPARE:                                            
> 		name##_global_lock_online();
> 		name##_global_unlock_online();
> 
> Another detail I noticed, this bit:
> 
>         register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
>         get_online_cpus();                                              \
>         for_each_online_cpu(i)                                          \
>                 cpu_set(i, name##_cpus);                                \
>         put_online_cpus();                                              \
> 
> could be something simpler and loop-less, like:
> 
>         get_online_cpus();
> 	cpumask_copy(name##_cpus, cpu_online_mask);
> 	register_hotcpu_notifier(&name##_lg_cpu_notifier);
> 	put_online_cpus();
> 


While the cpumask_copy is definitely better, we can't put the
register_hotcpu_notifier() within get/put_online_cpus() because it will
lead to ABBA deadlock with a newly initiated CPU Hotplug operation, the
2 locks involved being the cpu_add_remove_lock and the cpu_hotplug lock.

IOW, at the moment there is no "absolutely race-free" way to do
CPU Hotplug callback registration. Some time ago, while going through the
asynchronous booting patch by Arjan [1] I had written up a patch to fix
that race because that race got transformed from "purely theoretical"
to "very real" with the async boot patch, as shown by the powerpc boot
failures [2].

But then I stopped short of posting that patch to the lists because I
started wondering how important that race would actually turn out to be,
in case the async booting design takes a totally different approach
altogether.. [And the reason why I didn't post it is also because it
would require lots of changes in many parts where CPU Hotplug registration
is done, and that wouldn't probably be justified (I don't know..) if the
race remained only theoretical, as it is now.]

[1]. http://thread.gmane.org/gmane.linux.kernel/1246209
[2]. https://lkml.org/lkml/2012/2/13/383
 
Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-29 11:12           ` Srivatsa S. Bhat
@ 2012-03-01  7:38             ` Ingo Molnar
  2012-03-01  9:15               ` Srivatsa S. Bhat
  2012-03-01  8:12               ` Srivatsa S. Bhat
  1 sibling, 1 reply; 45+ messages in thread
From: Ingo Molnar @ 2012-03-01  7:38 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven


* Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:

> On 02/29/2012 02:47 PM, Ingo Molnar wrote:
> 
> > 
> > * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
> > 
> >> Hi Andrew,
> >>
> >> On 02/29/2012 02:57 AM, Andrew Morton wrote:
> >>
> >>> On Tue, 28 Feb 2012 09:43:59 +0100
> >>> Ingo Molnar <mingo@elte.hu> wrote:
> >>>
> >>>> This patch should also probably go upstream through the 
> >>>> locking/lockdep tree? Mind sending it us once you think it's 
> >>>> ready?
> >>>
> >>> Oh goody, that means you own
> >>> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
> >>>
> >>
> >>
> >> That bug got fixed sometime around Dec 2011. See commit e30e2fdf
> >> (VFS: Fix race between CPU hotplug and lglocks)
> > 
> > The lglocks code is still CPU-hotplug racy AFAICS, despite the 
> > ->cpu_lock complication:
> > 
> > Consider a taken global lock on a CPU:
> > 
> > 	CPU#1
> > 	...
> > 	br_write_lock(vfsmount_lock);
> > 
> > this takes the lock of all online CPUs: say CPU#1 and CPU#2. Now 
> > CPU#3 comes online and takes the read lock:
> 
> 
> CPU#3 cannot come online! :-)
> 
> No new CPU can come online until that corresponding br_write_unlock()
> is completed. That is because  br_write_lock acquires &name##_cpu_lock
> and only br_write_unlock will release it.

Indeed, you are right.

Note that ->cpu_lock is an entirely superfluous complication in 
br_write_lock(): the whole CPU hotplug race can be addressed by 
doing a br_write_lock()/unlock() barrier in the hotplug callback 
...

> > Another detail I noticed, this bit:
> > 
> >         register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
> >         get_online_cpus();                                              \
> >         for_each_online_cpu(i)                                          \
> >                 cpu_set(i, name##_cpus);                                \
> >         put_online_cpus();                                              \
> > 
> > could be something simpler and loop-less, like:
> > 
> >         get_online_cpus();
> > 	cpumask_copy(name##_cpus, cpu_online_mask);
> > 	register_hotcpu_notifier(&name##_lg_cpu_notifier);
> > 	put_online_cpus();
> > 
> 
> 
> While the cpumask_copy is definitely better, we can't put the 
> register_hotcpu_notifier() within get/put_online_cpus() 
> because it will lead to ABBA deadlock with a newly initiated 
> CPU Hotplug operation, the 2 locks involved being the 
> cpu_add_remove_lock and the cpu_hotplug lock.
>
> IOW, at the moment there is no "absolutely race-free way" way 
> to do CPU Hotplug callback registration. Some time ago, while 
> going through the asynchronous booting patch by Arjan [1] I 
> had written up a patch to fix that race because that race got 
> transformed from "purely theoretical" to "very real" with the 
> async boot patch, as shown by the powerpc boot failures [2].
> 
> But then I stopped short of posting that patch to the lists 
> because I started wondering how important that race would 
> actually turn out to be, in case the async booting design 
> takes a totally different approach altogether.. [And the 
> reason why I didn't post it is also because it would require 
> lots of changes in many parts where CPU Hotplug registration 
> is done, and that wouldn't probably be justified (I don't 
> know..) if the race remained only theoretical, as it is now.]

A fairly simple solution would be to eliminate the _cpus mask as 
well, and do a for_each_possible_cpu() loop in the super-slow 
write path - like dozens and dozens of other places in the 
kernel do.

At a first quick glance that way the code gets a lot simpler and 
the only CPU hotplug related change needed are the CPU_* 
callbacks to do the lock barrier.
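
Concretely, the "lock barrier in the CPU_* callback" part could look 
something like this (sketch only, reusing struct lglock and 
lg_global_lock()/lg_global_unlock() from the patch under discussion):

	static int lg_cpu_callback(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
	{
		struct lglock *lg = container_of(nb, struct lglock, cpu_notifier);

		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_UP_PREPARE:
			/* barrier: wait out any global (write) holder */
			lg_global_lock(lg);
			lg_global_unlock(lg);
			break;
		}
		return NOTIFY_OK;
	}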

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-29 11:12           ` Srivatsa S. Bhat
  2012-03-01  7:38             ` Ingo Molnar
@ 2012-03-01  8:12               ` Srivatsa S. Bhat
  1 sibling, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven, Paul E. McKenney, Rafael J. Wysocki, ppc-dev,
	David S. Miller, Paul Gortmaker, Benjamin Herrenschmidt,
	KOSAKI Motohiro, sparclinux

On 02/29/2012 04:42 PM, Srivatsa S. Bhat wrote:

> On 02/29/2012 02:47 PM, Ingo Molnar wrote:
> 
>>
>> * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>
>>> Hi Andrew,
>>>
>>> On 02/29/2012 02:57 AM, Andrew Morton wrote:
>>>
>>>> On Tue, 28 Feb 2012 09:43:59 +0100
>>>> Ingo Molnar <mingo@elte.hu> wrote:
>>>>
>>>>> This patch should also probably go upstream through the 
>>>>> locking/lockdep tree? Mind sending it us once you think it's 
>>>>> ready?
>>>>
>>>> Oh goody, that means you own
>>>> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
>>>>
>>>
>>>
>>> That bug got fixed sometime around Dec 2011. See commit e30e2fdf
>>> (VFS: Fix race between CPU hotplug and lglocks)
>>
>> The lglocks code is still CPU-hotplug racy AFAICS, despite the 
>> ->cpu_lock complication:
>>
>> Consider a taken global lock on a CPU:
>>
>> 	CPU#1
>> 	...
>> 	br_write_lock(vfsmount_lock);
>>
>> this takes the lock of all online CPUs: say CPU#1 and CPU#2. Now 
>> CPU#3 comes online and takes the read lock:
> 
> 
> CPU#3 cannot come online! :-)
> 
> No new CPU can come online until that corresponding br_write_unlock()
> is completed. That is because  br_write_lock acquires &name##_cpu_lock
> and only br_write_unlock will release it.
> And, CPU_UP_PREPARE callback tries to acquire that very same spinlock,
> and hence will keep spinning until br_write_unlock() is run. And hence,
> the CPU#3 or any new CPU online for that matter will not complete until
> br_write_unlock() is done.
> 
> It is of course debatable as to how good this design really is, but IMHO,
> the lglocks code is not CPU-hotplug racy now..
> 
> Here is the link to the original discussion during the development of
> that patch: thread.gmane.org/gmane.linux.file-systems/59750/
> 
>>
>> 			CPU#3
>> 			br_read_lock(vfsmount_lock);
>>
>> This will succeed while the br_write_lock() is still active, 
>> because CPU#1 has only taken the locks of CPU#1 and CPU#2. 
>>
>> Crash!
>>
>> The proper fix would be for CPU-online to serialize with all 
>> known lglocks, via the notifier callback, i.e. to do something 
>> like this:
>>
>>         case CPU_UP_PREPARE:                                            
>> 		for_each_online_cpu(cpu) {
>> 	                spin_lock(&name##_cpu_lock);                            
>> 	                spin_unlock(&name##_cpu_lock);
>> 		}
>> 	...
>>
>> I.e. in essence do:
>>
>>         case CPU_UP_PREPARE:                                            
>> 		name##_global_lock_online();
>> 		name##_global_unlock_online();
>>
>> Another detail I noticed, this bit:
>>
>>         register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
>>         get_online_cpus();                                              \
>>         for_each_online_cpu(i)                                          \
>>                 cpu_set(i, name##_cpus);                                \
>>         put_online_cpus();                                              \
>>
>> could be something simpler and loop-less, like:
>>
>>         get_online_cpus();
>> 	cpumask_copy(name##_cpus, cpu_online_mask);
>> 	register_hotcpu_notifier(&name##_lg_cpu_notifier);
>> 	put_online_cpus();
>>
> 
> 
> While the cpumask_copy is definitely better, we can't put the
> register_hotcpu_notifier() within get/put_online_cpus() because it will
> lead to ABBA deadlock with a newly initiated CPU Hotplug operation, the
> 2 locks involved being the cpu_add_remove_lock and the cpu_hotplug lock.
> 
> IOW, at the moment there is no "absolutely race-free" way to do
> CPU Hotplug callback registration. Some time ago, while going through the
> asynchronous booting patch by Arjan [1] I had written up a patch to fix
> that race because that race got transformed from "purely theoretical"
> to "very real" with the async boot patch, as shown by the powerpc boot
> failures [2].
> 
> But then I stopped short of posting that patch to the lists because I
> started wondering how important that race would actually turn out to be,
> in case the async booting design takes a totally different approach
> altogether.. [And the reason why I didn't post it is also because it
> would require lots of changes in many parts where CPU Hotplug registration
> is done, and that wouldn't probably be justified (I don't know..) if the
> race remained only theoretical, as it is now.]
> 
> [1]. http://thread.gmane.org/gmane.linux.kernel/1246209
> [2]. https://lkml.org/lkml/2012/2/13/383
>  


Ok, now that I have mentioned my patch, let me as well show it some daylight..
It is totally untested, incomplete and probably won't even compile.. (given
that I had abandoned working on it some time ago, since I was not sure in
what direction the async boot design was headed, which was the original
motivation for me to try to fix this race)

I really hate to post it when it is in such a state, but at least let me get
the idea out, now that the discussion is around it, if only to get some
thoughts about whether it is even worth pursuing!
(I'll post the patches as a reply to this mail.)

By the way, it should solve the powerpc boot failure, at least in principle,
considering what the root cause of the failure was..

Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/3] CPU hotplug: Fix issues with callback registration
  2012-03-01  8:12               ` Srivatsa S. Bhat
  (?)
@ 2012-03-01  8:15                 ` Srivatsa S. Bhat
  -1 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: sparclinux, Andi Kleen, Nick Piggin, KOSAKI Motohiro,
	Rusty Russell, linux-kernel, Rafael J. Wysocki, Paul Gortmaker,
	Alexander Viro, Arjan van de Ven, linux-fsdevel, Andrew Morton,
	Paul E. McKenney, ppc-dev, David S. Miller, Peter Zijlstra


Currently, there are several intertwined problems with CPU hotplug callback
registration:

Code which needs to get notified of CPU hotplug events and additionally wants
to do something for each already online CPU, would typically do something like:

   register_cpu_notifier(&foobar_cpu_notifier);
				<============ "A"
   get_online_cpus();
   for_each_online_cpu(cpu) {
	/* Do something */
   }
   put_online_cpus();

At the point marked as "A", a CPU hotplug event could sneak in, leaving the
code confused. Moving the registration to after put_online_cpus() won't help
either, because we could be losing a CPU hotplug event between put_online_cpus()
and the callback registration. Nor does doing the registration inside the
get/put_online_cpus() block help, because it will lead to an ABBA deadlock
with CPU hotplug, the 2 locks involved being cpu_add_remove_lock and the
cpu_hotplug lock.

It is also to be noted that, at times, we might want to do different setups
or initializations depending on whether a CPU is coming online for the first
time (as part of booting) or whether it is only being soft-onlined at a later
point in time. To achieve this, doing something like the code shown above,
with the "Do something" part being different from what the registered callback
does, wouldn't work out because of the race conditions mentioned above.

The solution to all this is to include "history replay upon request" within
the CPU hotplug callback registration code, while also providing an option
for a different callback to be invoked while replaying history.
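
For illustration, callers would then look something like this (just a sketch;
the foobar_* names are made up for this example and are not part of the patch):

   /* Hypothetical caller: replay past CPU_UP_PREPARE/CPU_ONLINE events for
    * the CPUs that are already online, then register the callback -- all
    * under cpu_add_remove_lock, so no hotplug event can be lost in between.
    */
   register_allcpu_notifier(&foobar_cpu_notifier, true, NULL);

   /* Or: run a special one-time boot setup routine instead of replaying the
    * regular callback; registration is aborted if that routine fails.
    */
   register_allcpu_notifier(&foobar_cpu_notifier, true, foobar_boot_setup);

   /* Or: plain registration with no replay (equivalent to the existing
    * register_cpu_notifier()).
    */
   register_allcpu_notifier(&foobar_cpu_notifier, false, NULL);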

Though the above mentioned race condition was mostly theoretical before, it
gets all real when things like asynchronous booting[1] come into the picture,
as shown by the PowerPC boot failure in [2]. So this fix is also a step forward
in getting cool things like asynchronous booting to work properly.

References:
[1]. https://lkml.org/lkml/2012/2/14/62
[2]. https://lkml.org/lkml/2012/2/13/383

---

 include/linux/cpu.h |   15 +++++++++++++++
 kernel/cpu.c        |   49 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6e53b48..90a6d76 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -124,16 +124,25 @@ enum {
 #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
 #ifdef CONFIG_HOTPLUG_CPU
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void));
 extern void unregister_cpu_notifier(struct notifier_block *nb);
 #else
 
 #ifndef MODULE
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void));
 #else
 static inline int register_cpu_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
+static inline int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	return 0;
+}
 #endif
 
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
@@ -155,6 +164,12 @@ static inline int register_cpu_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	return 0;
+}
+
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d520d34..1564c1d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -132,12 +132,56 @@ static void cpu_hotplug_done(void) {}
 /* Need to know about CPUs going up/down? */
 int __ref register_cpu_notifier(struct notifier_block *nb)
 {
-	int ret;
+	return register_allcpu_notifier(nb, false, NULL);
+}
+EXPORT_SYMBOL(register_cpu_notifier);
+
+int __ref register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	int cpu, ret = 0;
+
+	if (!replay_history && history_setup)
+		return -EINVAL;
+
 	cpu_maps_update_begin();
-	ret = raw_notifier_chain_register(&cpu_chain, nb);
+	/*
+	 * We don't race with CPU hotplug, because we just took the
+	 * cpu_add_remove_lock.
+	 */
+
+	if (!replay_history)
+		goto Register;
+
+	if (history_setup) {
+		/*
+		 * The caller has a special setup routine to rewrite
+		 * history as he desires. Just invoke it. Don't
+		 * proceed with callback registration if this setup is
+		 * unsuccessful.
+		 */
+		ret = history_setup();
+	} else {
+		/*
+		 * Fall back to the usual callback, if a special handler
+		 * for past CPU hotplug events is not specified.
+		 * In this case, we will replay only past CPU bring-up
+		 * events.
+		 */
+		for_each_online_cpu(cpu) {
+			nb->notifier_call(nb, CPU_UP_PREPARE, (void *)(long)cpu);
+			nb->notifier_call(nb, CPU_ONLINE, (void *)(long)cpu);
+		}
+	}
+
+ Register:
+	if (!ret)
+		ret = raw_notifier_chain_register(&cpu_chain, nb);
+
 	cpu_maps_update_done();
 	return ret;
 }
+EXPORT_SYMBOL(register_allcpu_notifier);
 
 static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
 			int *nr_calls)
@@ -161,7 +205,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
 	BUG_ON(cpu_notify(val, v));
 }
-EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {




^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 1/3] CPU hotplug: Fix issues with callback registration
@ 2012-03-01  8:15                 ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Nick Piggin, Paul E. McKenney, Rusty Russell,
	linux-kernel, Rafael J. Wysocki, Paul Gortmaker, Alexander Viro,
	KOSAKI Motohiro, sparclinux, linux-fsdevel, Andrew Morton,
	Arjan van de Ven, ppc-dev, David S. Miller, Peter Zijlstra


Currently, there are several intertwined problems with CPU hotplug callback
registration:

Code which needs to get notified of CPU hotplug events and additionally wants
to do something for each already online CPU, would typically do something like:

   register_cpu_notifier(&foobar_cpu_notifier);
				<============ "A"
   get_online_cpus();
   for_each_online_cpu(cpu) {
	/* Do something */
   }
   put_online_cpus();

At the point marked as "A", a CPU hotplug event could sneak in, leaving the
code confused. Moving the registration to after put_online_cpus() won't help
either, because we could be losing a CPU hotplug event between put_online_cpus()
and the callback registration. Nor will doing the registration inside the
get/put_online_cpus() block help, because that leads to an ABBA deadlock with
CPU hotplug, the two locks being cpu_add_remove_lock and the cpu_hotplug lock.

It is also to be noted that, at times, we might want to do different setups
or initializations depending on whether a CPU is coming online for the first
time (as part of booting) or whether it is being only soft-onlined at a later
point in time. To achieve this, doing something like the code shown above,
with the "Do something" being different than what the registered callback
does wouldn't work out, because of the race conditions mentioned above.

The solution to all this is to include "history replay upon request" within
the CPU hotplug callback registration code, while also providing an option
for a different callback to be invoked while replaying history.

Though the above mentioned race condition was mostly theoretical before, it
gets all real when things like asynchronous booting[1] come into the picture,
as shown by the PowerPC boot failure in [2]. So this fix is also a step forward
in getting cool things like asynchronous booting to work properly.

References:
[1]. https://lkml.org/lkml/2012/2/14/62
[2]. https://lkml.org/lkml/2012/2/13/383

---

 include/linux/cpu.h |   15 +++++++++++++++
 kernel/cpu.c        |   49 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6e53b48..90a6d76 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -124,16 +124,25 @@ enum {
 #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
 #ifdef CONFIG_HOTPLUG_CPU
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void));
 extern void unregister_cpu_notifier(struct notifier_block *nb);
 #else
 
 #ifndef MODULE
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void));
 #else
 static inline int register_cpu_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
+static inline int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	return 0;
+}
 #endif
 
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
@@ -155,6 +164,12 @@ static inline int register_cpu_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	return 0;
+}
+
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d520d34..1564c1d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -132,12 +132,56 @@ static void cpu_hotplug_done(void) {}
 /* Need to know about CPUs going up/down? */
 int __ref register_cpu_notifier(struct notifier_block *nb)
 {
-	int ret;
+	return register_allcpu_notifier(nb, false, NULL);
+}
+EXPORT_SYMBOL(register_cpu_notifier);
+
+int __ref register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	int cpu, ret = 0;
+
+	if (!replay_history && history_setup)
+		return -EINVAL;
+
 	cpu_maps_update_begin();
-	ret = raw_notifier_chain_register(&cpu_chain, nb);
+	/*
+	 * We don't race with CPU hotplug, because we just took the
+	 * cpu_add_remove_lock.
+	 */
+
+	if (!replay_history)
+		goto Register;
+
+	if (history_setup) {
+		/*
+		 * The caller has a special setup routine to rewrite
+		 * history as he desires. Just invoke it. Don't
+		 * proceed with callback registration if this setup is
+		 * unsuccessful.
+		 */
+		ret = history_setup();
+	} else {
+		/*
+		 * Fall back to the usual callback, if a special handler
+		 * for past CPU hotplug events is not specified.
+		 * In this case, we will replay only past CPU bring-up
+		 * events.
+		 */
+		for_each_online_cpu(cpu) {
+			nb->notifier_call(nb, CPU_UP_PREPARE, (void *)(long)cpu);
+			nb->notifier_call(nb, CPU_ONLINE, (void *)(long)cpu);
+		}
+	}
+
+ Register:
+	if (!ret)
+		ret = raw_notifier_chain_register(&cpu_chain, nb);
+
 	cpu_maps_update_done();
 	return ret;
 }
+EXPORT_SYMBOL(register_allcpu_notifier);
 
 static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
 			int *nr_calls)
@@ -161,7 +205,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
 	BUG_ON(cpu_notify(val, v));
 }
-EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 2/3] CPU hotplug, arch/powerpc: Fix CPU hotplug callback registration
  2012-03-01  8:12               ` Srivatsa S. Bhat
  (?)
@ 2012-03-01  8:16                 ` Srivatsa S. Bhat
  -1 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: sparclinux, Andi Kleen, Nick Piggin, KOSAKI Motohiro,
	Rusty Russell, linux-kernel, Rafael J. Wysocki, Paul Gortmaker,
	Alexander Viro, Arjan van de Ven, linux-fsdevel, Andrew Morton,
	Paul E. McKenney, ppc-dev, David S. Miller, Peter Zijlstra



Restructure CPU hotplug setup and callback registration in topology_init
so as to be race-free.

---

 arch/powerpc/kernel/sysfs.c |   44 +++++++++++++++++++++++++++++++++++--------
 arch/powerpc/mm/numa.c      |   11 ++++++++---
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 883e74c..5838b33 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -496,6 +496,38 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
+static void cpu_register_helper(struct cpu *c, int cpu)
+{
+	register_cpu(c, cpu);
+	device_create_file(&c->dev, &dev_attr_physical_id);
+}
+
+static int __cpuinit sysfs_cpu_notify_first_time(struct notifier_block *self,
+				      unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+	if (action == CPU_ONLINE) {
+		if (!c->hotpluggable) /* Avoid duplicate registrations */
+			cpu_register_helper(c, cpu);
+		register_cpu_online(cpu);
+	}
+	return NOTIFY_OK;
+}
+static int __cpuinit sysfs_cpu_notify_setup(void)
+{
+	int cpu;
+
+	/*
+	 * We don't race with CPU hotplug because we are called from
+	 * the CPU hotplug callback registration function.
+	 */
+	for_each_online_cpu(cpu)
+		sysfs_cpu_notify_first_time(NULL, CPU_ONLINE, (void *)(long)cpu);
+
+	return 0;
+}
 static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
 				      unsigned long action, void *hcpu)
 {
@@ -637,7 +669,6 @@ static int __init topology_init(void)
 	int cpu;
 
 	register_nodes();
-	register_cpu_notifier(&sysfs_cpu_nb);
 
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -652,15 +683,12 @@ static int __init topology_init(void)
 		if (ppc_md.cpu_die)
 			c->hotpluggable = 1;
 
-		if (cpu_online(cpu) || c->hotpluggable) {
-			register_cpu(c, cpu);
+		if (c->hotpluggable)
+			cpu_register_helper(c, cpu);
+	}
 
-			device_create_file(&c->dev, &dev_attr_physical_id);
-		}
+	register_allcpu_notifier(&sysfs_cpu_nb, true, &sysfs_cpu_notify_setup);
 
-		if (cpu_online(cpu))
-			register_cpu_online(cpu);
-	}
 #ifdef CONFIG_PPC64
 	sysfs_create_dscr_default();
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3feefc3..e326455 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1014,6 +1014,13 @@ static void __init mark_reserved_regions_for_nid(int nid)
 	}
 }
 
+static int __cpuinit cpu_numa_callback_setup(void)
+{
+	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+			(void *)(unsigned long)boot_cpuid);
+	return 0;
+}
+
 
 void __init do_init_bootmem(void)
 {
@@ -1088,9 +1095,7 @@ void __init do_init_bootmem(void)
 	 */
 	setup_node_to_cpumask_map();
 
-	register_cpu_notifier(&ppc64_numa_nb);
-	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-			  (void *)(unsigned long)boot_cpuid);
+	register_allcpu_notifier(&ppc64_numa_nb, true, &cpu_numa_callback_setup);
 }
 
 void __init paging_init(void)




^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 2/3] CPU hotplug, arch/powerpc: Fix CPU hotplug callback registration
@ 2012-03-01  8:16                 ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Nick Piggin, Paul E. McKenney, Rusty Russell,
	linux-kernel, Rafael J. Wysocki, Paul Gortmaker, Alexander Viro,
	KOSAKI Motohiro, sparclinux, linux-fsdevel, Andrew Morton,
	Arjan van de Ven, ppc-dev, David S. Miller, Peter Zijlstra



Restructure CPU hotplug setup and callback registration in topology_init
so as to be race-free.

---

 arch/powerpc/kernel/sysfs.c |   44 +++++++++++++++++++++++++++++++++++--------
 arch/powerpc/mm/numa.c      |   11 ++++++++---
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 883e74c..5838b33 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -496,6 +496,38 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
+static void cpu_register_helper(struct cpu *c, int cpu)
+{
+	register_cpu(c, cpu);
+	device_create_file(&c->dev, &dev_attr_physical_id);
+}
+
+static int __cpuinit sysfs_cpu_notify_first_time(struct notifier_block *self,
+				      unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+	if (action == CPU_ONLINE) {
+		if (!c->hotpluggable) /* Avoid duplicate registrations */
+			cpu_register_helper(c, cpu);
+		register_cpu_online(cpu);
+	}
+	return NOTIFY_OK;
+}
+static int __cpuinit sysfs_cpu_notify_setup(void)
+{
+	int cpu;
+
+	/*
+	 * We don't race with CPU hotplug because we are called from
+	 * the CPU hotplug callback registration function.
+	 */
+	for_each_online_cpu(cpu)
+		sysfs_cpu_notify_first_time(NULL, CPU_ONLINE, (void *)(long)cpu);
+
+	return 0;
+}
 static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
 				      unsigned long action, void *hcpu)
 {
@@ -637,7 +669,6 @@ static int __init topology_init(void)
 	int cpu;
 
 	register_nodes();
-	register_cpu_notifier(&sysfs_cpu_nb);
 
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -652,15 +683,12 @@ static int __init topology_init(void)
 		if (ppc_md.cpu_die)
 			c->hotpluggable = 1;
 
-		if (cpu_online(cpu) || c->hotpluggable) {
-			register_cpu(c, cpu);
+		if (c->hotpluggable)
+			cpu_register_helper(c, cpu);
+	}
 
-			device_create_file(&c->dev, &dev_attr_physical_id);
-		}
+	register_allcpu_notifier(&sysfs_cpu_nb, true, &sysfs_cpu_notify_setup);
 
-		if (cpu_online(cpu))
-			register_cpu_online(cpu);
-	}
 #ifdef CONFIG_PPC64
 	sysfs_create_dscr_default();
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3feefc3..e326455 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1014,6 +1014,13 @@ static void __init mark_reserved_regions_for_nid(int nid)
 	}
 }
 
+static int __cpuinit cpu_numa_callback_setup(void)
+{
+	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+			(void *)(unsigned long)boot_cpuid);
+	return 0;
+}
+
 
 void __init do_init_bootmem(void)
 {
@@ -1088,9 +1095,7 @@ void __init do_init_bootmem(void)
 	 */
 	setup_node_to_cpumask_map();
 
-	register_cpu_notifier(&ppc64_numa_nb);
-	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-			  (void *)(unsigned long)boot_cpuid);
+	register_allcpu_notifier(&ppc64_numa_nb, true, &cpu_numa_callback_setup);
 }
 
 void __init paging_init(void)

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 3/3] CPU hotplug, arch/sparc: Fix CPU hotplug callback registration
  2012-03-01  8:12               ` Srivatsa S. Bhat
  (?)
@ 2012-03-01  8:18                 ` Srivatsa S. Bhat
  -1 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: sparclinux, Andi Kleen, Nick Piggin, KOSAKI Motohiro,
	Rusty Russell, linux-kernel, Rafael J. Wysocki, Paul Gortmaker,
	Alexander Viro, Arjan van de Ven, linux-fsdevel, Andrew Morton,
	Paul E. McKenney, ppc-dev, David S. Miller, Peter Zijlstra


Restructure CPU hotplug setup and callback registration in topology_init
so as to be race-free.

---

 arch/sparc/kernel/sysfs.c |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 654e8aa..22cb881 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -300,16 +300,14 @@ static int __init topology_init(void)
 
 	check_mmu_stats();
 
-	register_cpu_notifier(&sysfs_cpu_nb);
-
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
 
 		register_cpu(c, cpu);
-		if (cpu_online(cpu))
-			register_cpu_online(cpu);
 	}
 
+	register_allcpu_notifier(&sysfs_cpu_nb, true, NULL);
+
 	return 0;
 }
 



^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 3/3] CPU hotplug, arch/sparc: Fix CPU hotplug callback registration
@ 2012-03-01  8:18                 ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Nick Piggin, Paul E. McKenney, Rusty Russell,
	linux-kernel, Rafael J. Wysocki, Paul Gortmaker, Alexander Viro,
	KOSAKI Motohiro, sparclinux, linux-fsdevel, Andrew Morton,
	Arjan van de Ven, ppc-dev, David S. Miller, Peter Zijlstra


Restructure CPU hotplug setup and callback registration in topology_init
so as to be race-free.

---

 arch/sparc/kernel/sysfs.c |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 654e8aa..22cb881 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -300,16 +300,14 @@ static int __init topology_init(void)
 
 	check_mmu_stats();
 
-	register_cpu_notifier(&sysfs_cpu_nb);
-
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
 
 		register_cpu(c, cpu);
-		if (cpu_online(cpu))
-			register_cpu_online(cpu);
 	}
 
+	register_allcpu_notifier(&sysfs_cpu_nb, true, NULL);
+
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
@ 2012-03-01  8:12               ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven, Paul E. McKenney, Rafael J. Wysocki, ppc-dev,
	David S. Miller, Paul Gortmaker, Benjamin Herrenschmidt,
	KOSAKI Motohiro, sparclinux

On 02/29/2012 04:42 PM, Srivatsa S. Bhat wrote:

> On 02/29/2012 02:47 PM, Ingo Molnar wrote:
> 
>>
>> * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>
>>> Hi Andrew,
>>>
>>> On 02/29/2012 02:57 AM, Andrew Morton wrote:
>>>
>>>> On Tue, 28 Feb 2012 09:43:59 +0100
>>>> Ingo Molnar <mingo@elte.hu> wrote:
>>>>
>>>>> This patch should also probably go upstream through the 
>>>>> locking/lockdep tree? Mind sending it us once you think it's 
>>>>> ready?
>>>>
>>>> Oh goody, that means you own
>>>> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
>>>>
>>>
>>>
>>> That bug got fixed sometime around Dec 2011. See commit e30e2fdf
>>> (VFS: Fix race between CPU hotplug and lglocks)
>>
>> The lglocks code is still CPU-hotplug racy AFAICS, despite the 
>> ->cpu_lock complication:
>>
>> Consider a taken global lock on a CPU:
>>
>> 	CPU#1
>> 	...
>> 	br_write_lock(vfsmount_lock);
>>
>> this takes the lock of all online CPUs: say CPU#1 and CPU#2. Now 
>> CPU#3 comes online and takes the read lock:
> 
> 
> CPU#3 cannot come online! :-)
> 
> No new CPU can come online until that corresponding br_write_unlock()
> is completed. That is because  br_write_lock acquires &name##_cpu_lock
> and only br_write_unlock will release it.
> And, CPU_UP_PREPARE callback tries to acquire that very same spinlock,
> and hence will keep spinning until br_write_unlock() is run. And hence,
> the CPU#3 or any new CPU online for that matter will not complete until
> br_write_unlock() is done.
> 
> It is of course debatable as to how good this design really is, but IMHO,
> the lglocks code is not CPU-hotplug racy now..
> 
> Here is the link to the original discussion during the development of
> that patch: http://thread.gmane.org/gmane.linux.file-systems/59750/
> 
>>
>> 			CPU#3
>> 			br_read_lock(vfsmount_lock);
>>
>> This will succeed while the br_write_lock() is still active, 
>> because CPU#1 has only taken the locks of CPU#1 and CPU#2. 
>>
>> Crash!
>>
>> The proper fix would be for CPU-online to serialize with all 
>> known lglocks, via the notifier callback, i.e. to do something 
>> like this:
>>
>>         case CPU_UP_PREPARE:                                            
>> 		for_each_online_cpu(cpu) {
>> 	                spin_lock(&name##_cpu_lock);                            
>> 	                spin_unlock(&name##_cpu_lock);
>> 		}
>> 	...
>>
>> I.e. in essence do:
>>
>>         case CPU_UP_PREPARE:                                            
>> 		name##_global_lock_online();
>> 		name##_global_unlock_online();
>>
>> Another detail I noticed, this bit:
>>
>>         register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
>>         get_online_cpus();                                              \
>>         for_each_online_cpu(i)                                          \
>>                 cpu_set(i, name##_cpus);                                \
>>         put_online_cpus();                                              \
>>
>> could be something simpler and loop-less, like:
>>
>>         get_online_cpus();
>> 	cpumask_copy(name##_cpus, cpu_online_mask);
>> 	register_hotcpu_notifier(&name##_lg_cpu_notifier);
>> 	put_online_cpus();
>>
> 
> 
> While the cpumask_copy is definitely better, we can't put the
> register_hotcpu_notifier() within get/put_online_cpus() because it will
> lead to ABBA deadlock with a newly initiated CPU Hotplug operation, the
> 2 locks involved being the cpu_add_remove_lock and the cpu_hotplug lock.
> 
> IOW, at the moment there is no "absolutely race-free way" way to do
> CPU Hotplug callback registration. Some time ago, while going through the
> asynchronous booting patch by Arjan [1] I had written up a patch to fix
> that race because that race got transformed from "purely theoretical"
> to "very real" with the async boot patch, as shown by the powerpc boot
> failures [2].
> 
> But then I stopped short of posting that patch to the lists because I
> started wondering how important that race would actually turn out to be,
> in case the async booting design takes a totally different approach
> altogether.. [And the reason why I didn't post it is also because it
> would require lots of changes in many parts where CPU Hotplug registration
> is done, and that wouldn't probably be justified (I don't know..) if the
> race remained only theoretical, as it is now.]
> 
> [1]. http://thread.gmane.org/gmane.linux.kernel/1246209
> [2]. https://lkml.org/lkml/2012/2/13/383
>  


Ok, now that I've mentioned my patch, let me show it some daylight as well..
It is totally untested, incomplete and probably won't even compile.. (given
that I had abandoned working on it some time ago, since I was not sure in
what direction the async boot design was headed, which was the original
motivation for me to try to fix this race.)

I really hate to post it when it is in such a state, but at least let me get
the idea out now that the discussion is around it, just to get some thoughts
about whether it is even worth pursuing!
(I'll post the patches as a reply to this mail.)

By the way, it should solve the powerpc boot failure, at least in principle,
considering what the root cause of the failure was..

Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/3] CPU hotplug: Fix issues with callback registration
@ 2012-03-01  8:15                 ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: sparclinux, Andi Kleen, Nick Piggin, KOSAKI Motohiro,
	Rusty Russell, linux-kernel, Rafael J. Wysocki, Paul Gortmaker,
	Alexander Viro, Arjan van de Ven, linux-fsdevel, Andrew Morton,
	Paul E. McKenney, ppc-dev, David S. Miller, Peter Zijlstra


Currently, there are several intertwined problems with CPU hotplug callback
registration:

Code which needs to get notified of CPU hotplug events and additionally wants
to do something for each already online CPU, would typically do something like:

   register_cpu_notifier(&foobar_cpu_notifier);
				<====== "A"
   get_online_cpus();
   for_each_online_cpu(cpu) {
	/* Do something */
   }
   put_online_cpus();

At the point marked as "A", a CPU hotplug event could sneak in, leaving the
code confused. Moving the registration to after put_online_cpus() won't help
either, because we could be losing a CPU hotplug event between put_online_cpus()
and the callback registration. Nor will doing the registration inside the
get/put_online_cpus() block help, because that leads to an ABBA deadlock with
CPU hotplug, the two locks being cpu_add_remove_lock and the cpu_hotplug lock.

It is also to be noted that, at times, we might want to do different setups
or initializations depending on whether a CPU is coming online for the first
time (as part of booting) or whether it is being only soft-onlined at a later
point in time. To achieve this, doing something like the code shown above,
with the "Do something" being different than what the registered callback
does wouldn't work out, because of the race conditions mentioned above.

The solution to all this is to include "history replay upon request" within
the CPU hotplug callback registration code, while also providing an option
for a different callback to be invoked while replaying history.

Though the above mentioned race condition was mostly theoretical before, it
gets all real when things like asynchronous booting[1] come into the picture,
as shown by the PowerPC boot failure in [2]. So this fix is also a step forward
in getting cool things like asynchronous booting to work properly.

References:
[1]. https://lkml.org/lkml/2012/2/14/62
[2]. https://lkml.org/lkml/2012/2/13/383

---

 include/linux/cpu.h |   15 +++++++++++++++
 kernel/cpu.c        |   49 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6e53b48..90a6d76 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -124,16 +124,25 @@ enum {
 #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
 #ifdef CONFIG_HOTPLUG_CPU
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void));
 extern void unregister_cpu_notifier(struct notifier_block *nb);
 #else
 
 #ifndef MODULE
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void));
 #else
 static inline int register_cpu_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
+static inline int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	return 0;
+}
 #endif
 
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
@@ -155,6 +164,12 @@ static inline int register_cpu_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline int register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	return 0;
+}
+
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d520d34..1564c1d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -132,12 +132,56 @@ static void cpu_hotplug_done(void) {}
 /* Need to know about CPUs going up/down? */
 int __ref register_cpu_notifier(struct notifier_block *nb)
 {
-	int ret;
+	return register_allcpu_notifier(nb, false, NULL);
+}
+EXPORT_SYMBOL(register_cpu_notifier);
+
+int __ref register_allcpu_notifier(struct notifier_block *nb,
+			bool replay_history, int (*history_setup)(void))
+{
+	int cpu, ret = 0;
+
+	if (!replay_history && history_setup)
+		return -EINVAL;
+
 	cpu_maps_update_begin();
-	ret = raw_notifier_chain_register(&cpu_chain, nb);
+	/*
+	 * We don't race with CPU hotplug, because we just took the
+	 * cpu_add_remove_lock.
+	 */
+
+	if (!replay_history)
+		goto Register;
+
+	if (history_setup) {
+		/*
+		 * The caller has a special setup routine to rewrite
+		 * history as he desires. Just invoke it. Don't
+		 * proceed with callback registration if this setup is
+		 * unsuccessful.
+		 */
+		ret = history_setup();
+	} else {
+		/*
+		 * Fall back to the usual callback, if a special handler
+		 * for past CPU hotplug events is not specified.
+		 * In this case, we will replay only past CPU bring-up
+		 * events.
+		 */
+		for_each_online_cpu(cpu) {
+			nb->notifier_call(nb, CPU_UP_PREPARE, (void *)(long)cpu);
+			nb->notifier_call(nb, CPU_ONLINE, (void *)(long)cpu);
+		}
+	}
+
+ Register:
+	if (!ret)
+		ret = raw_notifier_chain_register(&cpu_chain, nb);
+
 	cpu_maps_update_done();
 	return ret;
 }
+EXPORT_SYMBOL(register_allcpu_notifier);
 
 static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
 			int *nr_calls)
@@ -161,7 +205,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
 	BUG_ON(cpu_notify(val, v));
 }
-EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {




^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 2/3] CPU hotplug, arch/powerpc: Fix CPU hotplug callback registration
@ 2012-03-01  8:16                 ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:28 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: sparclinux, Andi Kleen, Nick Piggin, KOSAKI Motohiro,
	Rusty Russell, linux-kernel, Rafael J. Wysocki, Paul Gortmaker,
	Alexander Viro, Arjan van de Ven, linux-fsdevel, Andrew Morton,
	Paul E. McKenney, ppc-dev, David S. Miller, Peter Zijlstra



Restructure CPU hotplug setup and callback registration in topology_init
so as to be race-free.

---

 arch/powerpc/kernel/sysfs.c |   44 +++++++++++++++++++++++++++++++++++--------
 arch/powerpc/mm/numa.c      |   11 ++++++++---
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 883e74c..5838b33 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -496,6 +496,38 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
+static void cpu_register_helper(struct cpu *c, int cpu)
+{
+	register_cpu(c, cpu);
+	device_create_file(&c->dev, &dev_attr_physical_id);
+}
+
+static int __cpuinit sysfs_cpu_notify_first_time(struct notifier_block *self,
+				      unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+	if (action == CPU_ONLINE) {
+		if (!c->hotpluggable) /* Avoid duplicate registrations */
+			cpu_register_helper(c, cpu);
+		register_cpu_online(cpu);
+	}
+	return NOTIFY_OK;
+}
+static int __cpuinit sysfs_cpu_notify_setup(void)
+{
+	int cpu;
+
+	/*
+	 * We don't race with CPU hotplug because we are called from
+	 * the CPU hotplug callback registration function.
+	 */
+	for_each_online_cpu(cpu)
+		sysfs_cpu_notify_first_time(NULL, CPU_ONLINE, (void *)(long)cpu);
+
+	return 0;
+}
 static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
 				      unsigned long action, void *hcpu)
 {
@@ -637,7 +669,6 @@ static int __init topology_init(void)
 	int cpu;
 
 	register_nodes();
-	register_cpu_notifier(&sysfs_cpu_nb);
 
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -652,15 +683,12 @@ static int __init topology_init(void)
 		if (ppc_md.cpu_die)
 			c->hotpluggable = 1;
 
-		if (cpu_online(cpu) || c->hotpluggable) {
-			register_cpu(c, cpu);
+		if (c->hotpluggable)
+			cpu_register_helper(c, cpu);
+	}
 
-			device_create_file(&c->dev, &dev_attr_physical_id);
-		}
+	register_allcpu_notifier(&sysfs_cpu_nb, true, &sysfs_cpu_notify_setup);
 
-		if (cpu_online(cpu))
-			register_cpu_online(cpu);
-	}
 #ifdef CONFIG_PPC64
 	sysfs_create_dscr_default();
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3feefc3..e326455 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1014,6 +1014,13 @@ static void __init mark_reserved_regions_for_nid(int nid)
 	}
 }
 
+static int __cpuinit cpu_numa_callback_setup(void)
+{
+	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+			(void *)(unsigned long)boot_cpuid);
+	return 0;
+}
+
 
 void __init do_init_bootmem(void)
 {
@@ -1088,9 +1095,7 @@ void __init do_init_bootmem(void)
 	 */
 	setup_node_to_cpumask_map();
 
-	register_cpu_notifier(&ppc64_numa_nb);
-	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-			  (void *)(unsigned long)boot_cpuid);
+	register_allcpu_notifier(&ppc64_numa_nb, true, &cpu_numa_callback_setup);
 }
 
 void __init paging_init(void)




^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 3/3] CPU hotplug, arch/sparc: Fix CPU hotplug callback registration
@ 2012-03-01  8:18                 ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  8:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: sparclinux, Andi Kleen, Nick Piggin, KOSAKI Motohiro,
	Rusty Russell, linux-kernel, Rafael J. Wysocki, Paul Gortmaker,
	Alexander Viro, Arjan van de Ven, linux-fsdevel, Andrew Morton,
	Paul E. McKenney, ppc-dev, David S. Miller, Peter Zijlstra


Restructure CPU hotplug setup and callback registration in topology_init
so as to be race-free.

---

 arch/sparc/kernel/sysfs.c |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 654e8aa..22cb881 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -300,16 +300,14 @@ static int __init topology_init(void)
 
 	check_mmu_stats();
 
-	register_cpu_notifier(&sysfs_cpu_nb);
-
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
 
 		register_cpu(c, cpu);
-		if (cpu_online(cpu))
-			register_cpu_online(cpu);
 	}
 
+	register_allcpu_notifier(&sysfs_cpu_nb, true, NULL);
+
 	return 0;
 }
 



^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-03-01  7:38             ` Ingo Molnar
@ 2012-03-01  9:15               ` Srivatsa S. Bhat
  2012-03-01  9:45                 ` Ingo Molnar
  0 siblings, 1 reply; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  9:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven, Paul E. McKenney, mc

On 03/01/2012 01:08 PM, Ingo Molnar wrote:

> 
> * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
> 
>> On 02/29/2012 02:47 PM, Ingo Molnar wrote:
>>
>>>
>>> * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>
>>>> Hi Andrew,
>>>>
>>>> On 02/29/2012 02:57 AM, Andrew Morton wrote:
>>>>
>>>>> On Tue, 28 Feb 2012 09:43:59 +0100
>>>>> Ingo Molnar <mingo@elte.hu> wrote:
>>>>>
>>>>>> This patch should also probably go upstream through the 
>>>>>> locking/lockdep tree? Mind sending it us once you think it's 
>>>>>> ready?
>>>>>
>>>>> Oh goody, that means you own
>>>>> http://marc.info/?l=linux-kernel&m=131419353511653&w=2.
>>>>>
>>>>
>>>>
>>>> That bug got fixed sometime around Dec 2011. See commit e30e2fdf
>>>> (VFS: Fix race between CPU hotplug and lglocks)
>>>
>>> The lglocks code is still CPU-hotplug racy AFAICS, despite the 
>>> ->cpu_lock complication:
>>>
>>> Consider a taken global lock on a CPU:
>>>
>>> 	CPU#1
>>> 	...
>>> 	br_write_lock(vfsmount_lock);
>>>
>>> this takes the lock of all online CPUs: say CPU#1 and CPU#2. Now 
>>> CPU#3 comes online and takes the read lock:
>>
>>
>> CPU#3 cannot come online! :-)
>>
>> No new CPU can come online until that corresponding br_write_unlock()
>> is completed. That is because  br_write_lock acquires &name##_cpu_lock
>> and only br_write_unlock will release it.
> 
> Indeed, you are right.
> 
> Note that ->cpu_lock is an entirely superfluous complication in 
> br_write_lock(): the whole CPU hotplug race can be addressed by 
> doing a br_write_lock()/unlock() barrier in the hotplug callback 


I don't think I understood your point completely, but please see below...

> ...

> 
>>> Another detail I noticed, this bit:
>>>
>>>         register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
>>>         get_online_cpus();                                              \
>>>         for_each_online_cpu(i)                                          \
>>>                 cpu_set(i, name##_cpus);                                \
>>>         put_online_cpus();                                              \
>>>
>>> could be something simpler and loop-less, like:
>>>
>>>         get_online_cpus();
>>> 	cpumask_copy(name##_cpus, cpu_online_mask);
>>> 	register_hotcpu_notifier(&name##_lg_cpu_notifier);
>>> 	put_online_cpus();
>>>
>>
>>
>> While the cpumask_copy is definitely better, we can't put the 
>> register_hotcpu_notifier() within get/put_online_cpus() 
>> because it will lead to ABBA deadlock with a newly initiated 
>> CPU Hotplug operation, the 2 locks involved being the 
>> cpu_add_remove_lock and the cpu_hotplug lock.
>>
>> IOW, at the moment there is no "absolutely race-free way" way 
>> to do CPU Hotplug callback registration. Some time ago, while 
>> going through the asynchronous booting patch by Arjan [1] I 
>> had written up a patch to fix that race because that race got 
>> transformed from "purely theoretical" to "very real" with the 
>> async boot patch, as shown by the powerpc boot failures [2].
>>
>> But then I stopped short of posting that patch to the lists 
>> because I started wondering how important that race would 
>> actually turn out to be, in case the async booting design 
>> takes a totally different approach altogether.. [And the 
>> reason why I didn't post it is also because it would require 
>> lots of changes in many parts where CPU Hotplug registration 
>> is done, and that wouldn't probably be justified (I don't 
>> know..) if the race remained only theoretical, as it is now.]
> 
> A fairly simple solution would be to eliminate the _cpus mask as 
> well, and do a for_each_possible_cpu() loop in the super-slow 
> loop - like dozens and dozens of other places do it in the 
> kernel.
> 


(I am assuming you are referring to the lglocks problem here, and not to the
ABBA deadlock/racy registration etc discussed immediately above.)

We wanted to avoid doing for_each_possible_cpu() to avoid the unnecessary
performance hit. In fact, that was the very first solution proposed, by
Cong Meng. See this:

http://thread.gmane.org/gmane.linux.file-systems/59750/
http://thread.gmane.org/gmane.linux.file-systems/59750/focus=59751


So we developed a solution that avoids for_each_possible_cpu() and yet
works. Another point worth noting (referring to your previous mail) is that
doing for_each_online_cpu() at CPU_UP_PREPARE time won't really work, since
the cpus are marked online only much later. So the solution we chose was to
keep a consistent _cpus mask throughout the lock-unlock sequence, perform the
per-cpu lock/unlock only on the cpus in that mask, ensure that the mask
doesn't change in between, and delay any new CPU online event during that
period using the new ->cpu_lock spinlock, as I mentioned in the other mail.

This (complexity) explains why the commit message of e30e2fdf looks more
like a mathematical theorem ;-)
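
To put that in code, the scheme from commit e30e2fdf is roughly the following
(a simplified sketch of what the name##_* macros expand to; the token-pasting
and lockdep annotations are dropped for readability):

	void name_global_lock_online(void)	/* br_write_lock() */
	{
		int i;

		/* Blocks CPU_UP_PREPARE, so name_cpus cannot change under us */
		spin_lock(&name_cpu_lock);
		for_each_cpu(i, &name_cpus)
			arch_spin_lock(&per_cpu(name_lock, i));
	}

	void name_global_unlock_online(void)	/* br_write_unlock() */
	{
		int i;

		for_each_cpu(i, &name_cpus)
			arch_spin_unlock(&per_cpu(name_lock, i));
		spin_unlock(&name_cpu_lock);	/* now a new CPU may come online */
	}

	/* CPU hotplug callback */
	case CPU_UP_PREPARE:
		spin_lock(&name_cpu_lock);	/* spins while a writer holds it */
		cpu_set((unsigned long)hcpu, name_cpus);
		spin_unlock(&name_cpu_lock);
		break;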

> At a first quick glance that way the code gets a lot simpler and 
> the only CPU hotplug related change needed are the CPU_* 
> callbacks to do the lock barrier.
> 


 

Regards,
Srivatsa S. Bhat
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-03-01  9:15               ` Srivatsa S. Bhat
@ 2012-03-01  9:45                 ` Ingo Molnar
  2012-03-01  9:56                   ` Srivatsa S. Bhat
  0 siblings, 1 reply; 45+ messages in thread
From: Ingo Molnar @ 2012-03-01  9:45 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven, Paul E. McKenney, mc


* Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:

> We wanted to avoid doing for_each_possible_cpu() to avoid the 
> unnecessary performance hit. [...]

That was done at the cost of making the code rather complex.

The thing is, *ANY* cpu-mask loop is an utter slowpath, so the 
"performance hit" is an overstatement. There's already dozens of 
of for_each_possible_cpu() loops in the kernel, and it's a 
perfectly acceptable solution in such cases.
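
(For reference, the simpler possible-cpus variant is just the plain global
lock -- roughly what the name##_global_lock() expansion further down in this
thread already does, minus the lockdep annotation:)

	void name_global_lock(void)
	{
		int i;

		preempt_disable();
		for_each_possible_cpu(i)
			arch_spin_lock(&per_cpu(name_lock, i));
	}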

I suspect it does not matter much now as the code appears to be 
correct, but in general we want to opt for simpler designs for 
rare and fragile codepaths.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-03-01  9:45                 ` Ingo Molnar
@ 2012-03-01  9:56                   ` Srivatsa S. Bhat
  0 siblings, 0 replies; 45+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-01  9:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Rusty Russell, Nick Piggin, linux-kernel,
	Alexander Viro, Andi Kleen, linux-fsdevel, Peter Zijlstra,
	Arjan van de Ven, Paul E. McKenney, mc

On 03/01/2012 03:15 PM, Ingo Molnar wrote:

> 
> * Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> wrote:
> 
>> We wanted to avoid doing for_each_possible_cpu() to avoid the 
>> unnecessary performance hit. [...]
> 
> That was done at the cost of making the code rather complex.
>


Sadly, yes..

 
> The thing is, *ANY* cpu-mask loop is an utter slowpath, so the 
> "performance hit" is an overstatement. There's already dozens of 
> of for_each_possible_cpu() loops in the kernel, and it's a 
> perfectly acceptable solution in such cases.
> 
> I suspect it does not matter much now as the code appears to be 
> correct, but in general we want to opt for simpler designs for 
> rare and fragile codepaths.
> 


Ok, makes sense. And now looking back at the amount of complexity
we built into this just to avoid the for_each_possible_cpu() loop,
I wonder why we went to such lengths at all! (considering what you
said above about any cpu-mask loop..)

Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH] cpumask: fix lg_lock/br_lock.
  2012-02-28 11:24   ` Andi Kleen
@ 2012-03-05  7:02     ` Rusty Russell
  2012-03-05  7:03     ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 45+ messages in thread
From: Rusty Russell @ 2012-03-05  7:02 UTC (permalink / raw)
  To: Andi Kleen, Andrew Morton
  Cc: Nick Piggin, linux-kernel, Alexander Viro, Srivatsa S. Bhat,
	linux-fsdevel

On Tue, 28 Feb 2012 03:24:22 -0800, Andi Kleen <ak@linux.intel.com> wrote:
> Rusty: you should probably have cocci rules for this stuff, not manual
> patches.

Yes, though hopefully this is the last.

I've updated your patch based on Ingo's peevish feedback and a patch
to rip out the online stuff; posting now...

Thanks,
Rusty.
-- 
  Please can you help my fiance Alex?  http://baldalex.org

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/3] lglock: remove online variants of lock
  2012-02-28 11:24   ` Andi Kleen
  2012-03-05  7:02     ` Rusty Russell
@ 2012-03-05  7:03     ` Rusty Russell
  2012-04-20 11:12       ` Nick Piggin
  2012-03-05  7:04     ` [PATCH 2/3] brlocks/lglocks: API cleanups Rusty Russell
  2012-03-05  7:05     ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
  3 siblings, 1 reply; 45+ messages in thread
From: Rusty Russell @ 2012-03-05  7:03 UTC (permalink / raw)
  To: Andi Kleen, Andrew Morton
  Cc: Nick Piggin, linux-kernel, Alexander Viro, Srivatsa S. Bhat,
	linux-fsdevel

Optimizing the slow paths adds a lot of complexity.  If you need to
grab every lock often, you have other problems.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/lglock.h |   58 +------------------------------------------------
 1 file changed, 2 insertions(+), 56 deletions(-)

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -28,8 +28,8 @@
 #define br_lock_init(name)	name##_lock_init()
 #define br_read_lock(name)	name##_local_lock()
 #define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock_online()
-#define br_write_unlock(name)	name##_global_unlock_online()
+#define br_write_lock(name)	name##_global_lock()
+#define br_write_unlock(name)	name##_global_unlock()
 
 #define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
@@ -42,8 +42,6 @@
 #define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
 #define lg_global_lock(name)	name##_global_lock()
 #define lg_global_unlock(name)	name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
@@ -68,36 +66,13 @@
  extern void name##_local_unlock_cpu(int cpu);				\
  extern void name##_global_lock(void);					\
  extern void name##_global_unlock(void);				\
- extern void name##_global_lock_online(void);				\
- extern void name##_global_unlock_online(void);				\
 
 #define DEFINE_LGLOCK(name)						\
 									\
  DEFINE_SPINLOCK(name##_cpu_lock);					\
- cpumask_t name##_cpus __read_mostly;					\
  DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
  DEFINE_LGLOCK_LOCKDEP(name);						\
 									\
- static int								\
- name##_lg_cpu_callback(struct notifier_block *nb,			\
-				unsigned long action, void *hcpu)	\
- {									\
-	switch (action & ~CPU_TASKS_FROZEN) {				\
-	case CPU_UP_PREPARE:						\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_set((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-		break;							\
-	case CPU_UP_CANCELED: case CPU_DEAD:				\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_clear((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-	}								\
-	return NOTIFY_OK;						\
- }									\
- static struct notifier_block name##_lg_cpu_notifier = {		\
-	.notifier_call = name##_lg_cpu_callback,			\
- };									\
  void name##_lock_init(void) {						\
 	int i;								\
 	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
@@ -106,11 +81,6 @@
 		lock = &per_cpu(name##_lock, i);			\
 		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
 	}								\
-	register_hotcpu_notifier(&name##_lg_cpu_notifier);		\
-	get_online_cpus();						\
-	for_each_online_cpu(i)						\
-		cpu_set(i, name##_cpus);				\
-	put_online_cpus();						\
  }									\
  EXPORT_SYMBOL(name##_lock_init);					\
 									\
@@ -150,30 +120,6 @@
  }									\
  EXPORT_SYMBOL(name##_local_unlock_cpu);				\
 									\
- void name##_global_lock_online(void) {					\
-	int i;								\
-	spin_lock(&name##_cpu_lock);					\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock_online);				\
-									\
- void name##_global_unlock_online(void) {				\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	spin_unlock(&name##_cpu_lock);					\
- }									\
- EXPORT_SYMBOL(name##_global_unlock_online);				\
-									\
  void name##_global_lock(void) {					\
 	int i;								\
 	preempt_disable();						\

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 2/3] brlocks/lglocks: API cleanups
  2012-02-28 11:24   ` Andi Kleen
  2012-03-05  7:02     ` Rusty Russell
  2012-03-05  7:03     ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
@ 2012-03-05  7:04     ` Rusty Russell
  2012-03-05  7:05     ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
  3 siblings, 0 replies; 45+ messages in thread
From: Rusty Russell @ 2012-03-05  7:04 UTC (permalink / raw)
  To: Andi Kleen, Andrew Morton
  Cc: Nick Piggin, linux-kernel, Alexander Viro, Srivatsa S. Bhat,
	linux-fsdevel

From: Andi Kleen <ak@linux.intel.com>

lglocks and brlocks are currently generated with some complicated macros
in lglock.h.  But there's no reason to not just use common utility
functions and put all the data into a common data structure.

In preparation, this patch changes the API to look more like normal
function calls with pointers, not magic macros.

The patch is rather large because I move over all users in one go to keep
it bisectable.  This impacts the VFS somewhat in terms of lines changed.
But no actual behaviour change.
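
For illustration, the call-site conversion (visible in the hunks below) is
simply:

	/* before: token-pasting macros hiding the lock variable */
	br_read_lock(vfsmount_lock);
	lg_lock_init(files_lglock);

	/* after: ordinary calls taking a pointer to the lock */
	br_read_lock(&vfsmount_lock);
	lg_lock_init(&files_lglock, "files_lglock");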

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/dcache.c         |    4 -
 fs/file_table.c     |   16 ++---
 fs/namei.c          |   24 ++++----
 fs/namespace.c      |  140 ++++++++++++++++++++++++++--------------------------
 fs/pnode.c          |    4 -
 fs/proc_namespace.c |    4 -
 6 files changed, 96 insertions(+), 96 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2489,7 +2489,7 @@ static int prepend_path(const struct pat
 	bool slash = false;
 	int error = 0;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -2520,7 +2520,7 @@ static int prepend_path(const struct pat
 		error = prepend(buffer, buflen, "/", 1);
 
 out:
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return error;
 
 global_root:
diff --git a/fs/file_table.c b/fs/file_table.c
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -422,9 +422,9 @@ static inline void __file_sb_list_add(st
  */
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-	lg_local_lock(files_lglock);
+	lg_local_lock(&files_lglock);
 	__file_sb_list_add(file, sb);
-	lg_local_unlock(files_lglock);
+	lg_local_unlock(&files_lglock);
 }
 
 /**
@@ -437,9 +437,9 @@ void file_sb_list_add(struct file *file,
 void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+		lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
 		list_del_init(&file->f_u.fu_list);
-		lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+		lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
 	}
 }
 
@@ -486,7 +486,7 @@ void mark_files_ro(struct super_block *s
 	struct file *f;
 
 retry:
-	lg_global_lock(files_lglock);
+	lg_global_lock(&files_lglock);
 	do_file_list_for_each_entry(sb, f) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -503,12 +503,12 @@ retry:
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
 		/* This can sleep, so we can't hold the spinlock. */
-		lg_global_unlock(files_lglock);
+		lg_global_unlock(&files_lglock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
 	} while_file_list_for_each_entry;
-	lg_global_unlock(files_lglock);
+	lg_global_unlock(&files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
@@ -526,6 +526,6 @@ void __init files_init(unsigned long mem
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
-	lg_lock_init(files_lglock);
+	lg_lock_init(&files_lglock, "files_lglock");
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/namei.c b/fs/namei.c
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -462,7 +462,7 @@ static int unlazy_walk(struct nameidata 
 	mntget(nd->path.mnt);
 
 	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
@@ -520,14 +520,14 @@ static int complete_walk(struct nameidat
 		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
 			spin_unlock(&dentry->d_lock);
 			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
+			br_read_unlock(&vfsmount_lock);
 			return -ECHILD;
 		}
 		BUG_ON(nd->inode != dentry->d_inode);
 		spin_unlock(&dentry->d_lock);
 		mntget(nd->path.mnt);
 		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -694,15 +694,15 @@ int follow_up(struct path *path)
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	parent = mnt->mnt_parent;
 	if (&parent->mnt == path->mnt) {
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -960,7 +960,7 @@ failed:
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
 	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return -ECHILD;
 }
 
@@ -1256,7 +1256,7 @@ static void terminate_walk(struct nameid
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 }
 
@@ -1515,7 +1515,7 @@ static int path_init(int dfd, const char
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 		} else {
@@ -1528,7 +1528,7 @@ static int path_init(int dfd, const char
 
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 			set_root_rcu(nd);
 		} else {
@@ -1541,7 +1541,7 @@ static int path_init(int dfd, const char
 			struct fs_struct *fs = current->fs;
 			unsigned seq;
 
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 
 			do {
@@ -1577,7 +1577,7 @@ static int path_init(int dfd, const char
 			if (fput_needed)
 				*fp = file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 		} else {
 			path_get(&file->f_path);
diff --git a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct moun
 {
 	int ret = 0;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct moun
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct mount *mnt)
 {
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct s
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct s
 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	return err;
 }
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path 
 {
 	struct mount *child_mnt;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
 	if (child_mnt) {
 		mnt_add_count(child_mnt, 1);
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return &child_mnt->mnt;
 	} else {
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return NULL;
 	}
 }
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *
 	mnt->mnt.mnt_sb = root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mo
 		mnt->mnt.mnt_root = dget(root);
 		mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 		mnt->mnt_parent = mnt;
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 
 		if (flag & CL_SLAVE) {
 			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct moun
 {
 put_again:
 #ifdef CONFIG_SMP
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (likely(atomic_read(&mnt->mnt_longterm))) {
 		mnt_add_count(mnt, -1);
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		return;
 	}
 #else
 	mnt_add_count(mnt, -1);
 	if (likely(mnt_get_count(mnt)))
 		return;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 #endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+
 	list_del(&mnt->mnt_instance);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	mntfree(mnt);
 }
 
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	real_mount(mnt)->mnt_pinned++;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	if (mnt->mnt_pinned) {
 		mnt_add_count(mnt, 1);
 		mnt->mnt_pinned--;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
 	BUG_ON(!m);
 
 	/* write lock needed for mnt_get_count */
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += mnt_get_count(p);
 		minimum_refs += 2;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	if (propagate_mount_busy(real_mount(mnt), 2))
 		ret = 0;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *he
 			struct dentry *dentry;
 			struct mount *m;
 
-			br_write_lock(vfsmount_lock);
+			br_write_lock(&vfsmount_lock);
 			dentry = mnt->mnt_mountpoint;
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 			mnt->mnt_parent = mnt;
 			m->mnt_ghosts--;
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 			dput(dentry);
 			mntput(&m->mnt);
 		}
@@ -1112,12 +1113,12 @@ static int do_umount(struct mount *mnt, 
 		 * probably don't strictly need the lock here if we examined
 		 * all race cases, but it's a slowpath.
 		 */
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		if (mnt_get_count(mnt) != 2) {
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 			return -EBUSY;
 		}
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 
 		if (!xchg(&mnt->mnt_expiry_mark, 1))
 			return -EAGAIN;
@@ -1159,7 +1160,7 @@ static int do_umount(struct mount *mnt, 
 	}
 
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1171,7 +1172,7 @@ static int do_umount(struct mount *mnt, 
 			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1286,19 +1287,19 @@ struct mount *copy_tree(struct mount *mn
 			q = clone_mnt(p, p->mnt.mnt_root, flag);
 			if (!q)
 				goto Enomem;
-			br_write_lock(vfsmount_lock);
+			br_write_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &path);
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		umount_tree(res, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 	return NULL;
@@ -1318,9 +1319,9 @@ void drop_collected_mounts(struct vfsmou
 {
 	LIST_HEAD(umount_list);
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(real_mount(mnt), 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
@@ -1448,7 +1449,7 @@ static int attach_recursive_mnt(struct m
 	if (err)
 		goto out_cleanup_ids;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1468,7 @@ static int attach_recursive_mnt(struct m
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	return 0;
 
@@ -1565,10 +1566,10 @@ static int do_change_type(struct path *p
 			goto out_unlock;
 	}
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
  out_unlock:
 	up_write(&namespace_sem);
@@ -1617,9 +1618,9 @@ static int do_loopback(struct path *path
 
 	err = graft_tree(mnt, path);
 	if (err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 out2:
 	unlock_mount(path);
@@ -1677,16 +1678,16 @@ static int do_remount(struct path *path,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		touch_mnt_namespace(mnt->mnt_ns);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 	return err;
 }
@@ -1893,9 +1894,9 @@ fail:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		down_write(&namespace_sem);
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		list_del_init(&mnt->mnt_expire);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		up_write(&namespace_sem);
 	}
 	mntput(m);
@@ -1911,11 +1912,11 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 }
 EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1936,7 @@ void mark_mounts_for_expiry(struct list_
 		return;
 
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1954,7 +1955,7 @@ void mark_mounts_for_expiry(struct list_
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1, &umounts);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 
 	release_mounts(&umounts);
@@ -2218,9 +2219,9 @@ void mnt_make_shortterm(struct vfsmount 
 	struct mount *mnt = real_mount(m);
 	if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
 		return;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	atomic_dec(&mnt->mnt_longterm);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 #endif
 }
 
@@ -2249,10 +2250,9 @@ static struct mnt_namespace *dup_mnt_ns(
 		kfree(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
-	new_ns->root = new;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new->mnt_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2416,9 @@ bool is_path_reachable(struct mount *mnt
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2505,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* make sure we can reach put_old from new_root */
 	if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
 		goto out4;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	/* mount old root on put_old */
@@ -2513,7 +2513,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* mount new_root on / */
 	attach_mnt(new_mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 out4:
@@ -2576,7 +2576,7 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
-	br_lock_init(vfsmount_lock);
+	br_lock_init(&vfsmount_lock);
 
 	err = sysfs_init();
 	if (err)
@@ -2596,9 +2596,9 @@ void put_mnt_ns(struct mnt_namespace *ns
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
diff --git a/fs/pnode.c b/fs/pnode.c
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt
 		prev_src_mnt  = child;
 	}
 out:
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct mount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file 
 
 	poll_wait(file, &p->ns->poll, wait);
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (p->m.poll_event != ns->event) {
 		p->m.poll_event = ns->event;
 		res |= POLLERR | POLLPRI;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
 	return res;
 }
-- 
  Please can you help my fiance Alex?  http://baldalex.org

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 3/3] brlocks/lglocks: turn into functions
  2012-02-28 11:24   ` Andi Kleen
                       ` (2 preceding siblings ...)
  2012-03-05  7:04     ` [PATCH 2/3] brlocks/lglocks: API cleanups Rusty Russell
@ 2012-03-05  7:05     ` Rusty Russell
  2012-04-20 11:21       ` Nick Piggin
  3 siblings, 1 reply; 45+ messages in thread
From: Rusty Russell @ 2012-03-05  7:05 UTC (permalink / raw)
  To: Andi Kleen, Andrew Morton
  Cc: Nick Piggin, linux-kernel, Alexander Viro, Srivatsa S. Bhat,
	linux-fsdevel

From: Andi Kleen <ak@linux.intel.com>

lglocks and brlocks are currently generated with some complicated macros
in lglock.h.  But there's no reason to not just use common utility
functions and put all the data into a common data structure.

Since there are at least two users it makes sense to share this code in a
library.  This is also more maintainable than a macro forest.

This will also make it possible, later, to dynamically allocate lglocks
and to use them in modules (both would still need some additional, but
now straightforward, code).
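
For illustration, a lock defined with DEFINE_LGLOCK() ends up being used
roughly as below.  This is only a sketch mirroring the fs/file_table.c
conversion in this series; example_usage() is a made-up name, not code
added by this patch:

#include <linux/lglock.h>

DEFINE_LGLOCK(files_lglock);

static void example_usage(void)
{
	/* lockdep setup, done once at init time */
	lg_lock_init(&files_lglock, "files_lglock");

	/* read/local side: only this cpu's spinlock is taken */
	lg_local_lock(&files_lglock);
	/* ... touch this cpu's data ... */
	lg_local_unlock(&files_lglock);

	/* write/global side: every cpu's spinlock is taken */
	lg_global_lock(&files_lglock);
	/* ... walk all cpus' data ... */
	lg_global_unlock(&files_lglock);
}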

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (split and completed)
---
 fs/file_table.c        |    1 
 fs/internal.h          |    2 
 include/linux/lglock.h |  125 ++++++++++---------------------------------------
 kernel/Makefile        |    2 
 kernel/lglock.c        |   89 ++++++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 102 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-DECLARE_LGLOCK(files_lglock);
 DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
diff --git a/fs/internal.h b/fs/internal.h
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(s
 
 extern void __init mnt_init(void);
 
-DECLARE_BRLOCK(vfsmount_lock);
+extern struct lglock vfsmount_lock;
 
 
 /*
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -23,26 +23,17 @@
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)	name##_lock_init()
-#define br_read_lock(name)	name##_local_lock()
-#define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock()
-#define br_write_unlock(name)	name##_global_unlock()
+#define br_lock_init(name)	lg_lock_init(name, #name)
+#define br_read_lock(name)	lg_local_lock(name)
+#define br_read_unlock(name)	lg_local_unlock(name)
+#define br_write_lock(name)	lg_global_lock(name)
+#define br_write_unlock(name)	lg_global_unlock(name)
 
-#define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)	name##_lock_init()
-#define lg_local_lock(name)	name##_local_lock()
-#define lg_local_unlock(name)	name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)	name##_global_lock()
-#define lg_global_unlock(name)	name##_global_unlock()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
@@ -57,90 +48,26 @@
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
-
-#define DECLARE_LGLOCK(name)						\
- extern void name##_lock_init(void);					\
- extern void name##_local_lock(void);					\
- extern void name##_local_unlock(void);					\
- extern void name##_local_lock_cpu(int cpu);				\
- extern void name##_local_unlock_cpu(int cpu);				\
- extern void name##_global_lock(void);					\
- extern void name##_global_unlock(void);				\
+struct lglock {
+	arch_spinlock_t __percpu *lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key lock_key;
+	struct lockdep_map    lock_dep_map;
+#endif
+};
 
 #define DEFINE_LGLOCK(name)						\
-									\
- DEFINE_SPINLOCK(name##_cpu_lock);					\
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
- DEFINE_LGLOCK_LOCKDEP(name);						\
-									\
- void name##_lock_init(void) {						\
-	int i;								\
-	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_lock_init);					\
-									\
- void name##_local_lock(void) {						\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock);					\
-									\
- void name##_local_unlock(void) {					\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock);					\
-									\
- void name##_local_lock_cpu(int cpu) {					\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock_cpu);					\
-									\
- void name##_local_unlock_cpu(int cpu) {				\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock_cpu);				\
-									\
- void name##_global_lock(void) {					\
-	int i;								\
-	preempt_disable();						\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock);					\
-									\
- void name##_global_unlock(void) {					\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_global_unlock);
+	DEFINE_LGLOCK_LOCKDEP(name);					\
+	DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)			\
+	= __ARCH_SPIN_LOCK_UNLOCKED;					\
+	struct lglock name = { .lock = &name ## _lock }
+
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o
+	    async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+	int i;
+
+	preempt_disable();
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+	int i;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 1/3] lglock: remove online variants of lock
  2012-03-05  7:03     ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
@ 2012-04-20 11:12       ` Nick Piggin
  0 siblings, 0 replies; 45+ messages in thread
From: Nick Piggin @ 2012-04-20 11:12 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Andrew Morton, Nick Piggin, linux-kernel,
	Alexander Viro, Srivatsa S. Bhat, linux-fsdevel

On Mon, Mar 05, 2012 at 05:33:41PM +1030, Rusty Russell wrote:
> Optimizing the slow paths adds a lot of complexity.  If you need to
> grab every lock often, you have other problems.

> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>


Someone might cry about online vs possible discrepancy, but
I don't really mind.

Acked-by: Nick Piggin <npiggin@kernel.dk>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] brlocks/lglocks: turn into functions
  2012-03-05  7:05     ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
@ 2012-04-20 11:21       ` Nick Piggin
  2012-05-07  3:39         ` Rusty Russell
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2012-04-20 11:21 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Andrew Morton, Nick Piggin, linux-kernel,
	Alexander Viro, Srivatsa S. Bhat, linux-fsdevel

This still not merged?

On Mon, Mar 05, 2012 at 05:35:28PM +1030, Rusty Russell wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> lglocks and brlocks are currently generated with some complicated macros
> in lglock.h.  But there's no reason to not just use common utility
> functions and put all the data into a common data structure.

There is a reason, which is performance. Extra function call, but also
IIRC the percpu accessor was not so fast doing it this way. Maybe
that's improved...
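
For reference, the per-cpu access in question changes roughly like this
(first line from the old macro body, second from the new lg_local_lock();
a sketch of the two access patterns, not a measurement):

	/* old DEFINE_LGLOCK(): static per-cpu variable, address known statically */
	lock = &__get_cpu_var(files_lglock_lock);

	/* new common function: extra load of lg->lock before the per-cpu offset */
	lock = this_cpu_ptr(lg->lock);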

So what's the performance difference?

> 
> Since there are at least two users it makes sense to share this code in a
> library.  This is also more maintainable than a macro forest.
> 
> This will also make it possible, later, to dynamically allocate lglocks
> and to use them in modules (both would still need some additional, but
> now straightforward, code).

Yes, but let's not do either of those things :)

I was slightly crazy when committing that patch to the kernel, I'll
admit. So if performance isn't significantly affected, then definitely.
If it is... well, it's much easier to gain 1% performance by maintaining
100 self contained lines of hilarious code like this than to actually
use your brain to improve somewhere else!

Thanks,
Nick

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] brlocks/lglocks: turn into functions
  2012-04-20 11:21       ` Nick Piggin
@ 2012-05-07  3:39         ` Rusty Russell
  2012-05-07  5:46           ` Al Viro
  2012-05-09  7:35             ` Nick Piggin
  0 siblings, 2 replies; 45+ messages in thread
From: Rusty Russell @ 2012-05-07  3:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Andrew Morton, Nick Piggin, linux-kernel,
	Alexander Viro, Srivatsa S. Bhat, linux-fsdevel

On Fri, 20 Apr 2012 21:21:49 +1000, Nick Piggin <npiggin@kernel.dk> wrote:
> This still not merged?

No, I've been away.  I've put it in -next for tomorrow, though I'm not
sure of the best way to get it to Linus for the next merge window.

> There is a reason, which is performance. Extra function call, but also
> IIRC the percpu accessor was not so fast doing it this way. Maybe
> that's improved...
> 
> So what's the performance difference?

What benchmarks do you usually run?  Feel free to try it out and report
back; I only have small hardware here.

> > 
> > Since there are at least two users it makes sense to share this code in a
> > library.  This is also more maintainable than a macro forest.
> > 
> > This will also make it possible, later, to dynamically allocate lglocks
> > and to use them in modules (both would still need some additional, but
> > now straightforward, code).
> 
> Yes, but let's not do either of those things :)
> 
> I was slightly crazy when committing that patch to the kernel, I'll
> admit. So if performance isn't significantly affected, then definitely.
> If it is... well, it's much easier to gain 1% performance by maintaining
> 100 self contained lines of hilarious code like this than to actually
> use your brain to improve somewhere else!

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] brlocks/lglocks: turn into functions
  2012-05-07  3:39         ` Rusty Russell
@ 2012-05-07  5:46           ` Al Viro
  2012-05-08  3:59             ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
                               ` (2 more replies)
  2012-05-09  7:35             ` Nick Piggin
  1 sibling, 3 replies; 45+ messages in thread
From: Al Viro @ 2012-05-07  5:46 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Srivatsa S. Bhat, linux-fsdevel

On Mon, May 07, 2012 at 01:09:04PM +0930, Rusty Russell wrote:
> On Fri, 20 Apr 2012 21:21:49 +1000, Nick Piggin <npiggin@kernel.dk> wrote:
> > This still not merged?
> 
> No, I've been away.  I've put it in -next for tomorrow, though I'm not
> sure what the best way to get it to Linus next merge window.

Resend and I'll pick it.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/3] lglock: remove online variants of lock
  2012-05-07  5:46           ` Al Viro
@ 2012-05-08  3:59             ` Rusty Russell
  2012-05-08  4:50               ` Al Viro
  2012-05-08  4:02             ` [PATCH 2/3] brlocks/lglocks: API cleanups Rusty Russell
  2012-05-08  4:02             ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
  2 siblings, 1 reply; 45+ messages in thread
From: Rusty Russell @ 2012-05-08  3:59 UTC (permalink / raw)
  To: Al Viro
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Srivatsa S. Bhat, linux-fsdevel

Optimizing the slow paths adds a lot of complexity.  If you need to
grab every lock often, you have other problems.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Nick Piggin <npiggin@kernel.dk>
---
 include/linux/lglock.h |   58 +------------------------------------------------
 1 file changed, 2 insertions(+), 56 deletions(-)

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -28,8 +28,8 @@
 #define br_lock_init(name)	name##_lock_init()
 #define br_read_lock(name)	name##_local_lock()
 #define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock_online()
-#define br_write_unlock(name)	name##_global_unlock_online()
+#define br_write_lock(name)	name##_global_lock()
+#define br_write_unlock(name)	name##_global_unlock()
 
 #define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
@@ -42,8 +42,6 @@
 #define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
 #define lg_global_lock(name)	name##_global_lock()
 #define lg_global_unlock(name)	name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
@@ -68,36 +66,13 @@
  extern void name##_local_unlock_cpu(int cpu);				\
  extern void name##_global_lock(void);					\
  extern void name##_global_unlock(void);				\
- extern void name##_global_lock_online(void);				\
- extern void name##_global_unlock_online(void);				\
 
 #define DEFINE_LGLOCK(name)						\
 									\
  DEFINE_SPINLOCK(name##_cpu_lock);					\
- cpumask_t name##_cpus __read_mostly;					\
  DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
  DEFINE_LGLOCK_LOCKDEP(name);						\
 									\
- static int								\
- name##_lg_cpu_callback(struct notifier_block *nb,			\
-				unsigned long action, void *hcpu)	\
- {									\
-	switch (action & ~CPU_TASKS_FROZEN) {				\
-	case CPU_UP_PREPARE:						\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_set((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-		break;							\
-	case CPU_UP_CANCELED: case CPU_DEAD:				\
-		spin_lock(&name##_cpu_lock);				\
-		cpu_clear((unsigned long)hcpu, name##_cpus);		\
-		spin_unlock(&name##_cpu_lock);				\
-	}								\
-	return NOTIFY_OK;						\
- }									\
- static struct notifier_block name##_lg_cpu_notifier = {		\
-	.notifier_call = name##_lg_cpu_callback,			\
- };									\
  void name##_lock_init(void) {						\
 	int i;								\
 	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
@@ -106,11 +81,6 @@
 		lock = &per_cpu(name##_lock, i);			\
 		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
 	}								\
-	register_hotcpu_notifier(&name##_lg_cpu_notifier);		\
-	get_online_cpus();						\
-	for_each_online_cpu(i)						\
-		cpu_set(i, name##_cpus);				\
-	put_online_cpus();						\
  }									\
  EXPORT_SYMBOL(name##_lock_init);					\
 									\
@@ -150,30 +120,6 @@
  }									\
  EXPORT_SYMBOL(name##_local_unlock_cpu);				\
 									\
- void name##_global_lock_online(void) {					\
-	int i;								\
-	spin_lock(&name##_cpu_lock);					\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock_online);				\
-									\
- void name##_global_unlock_online(void) {				\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_cpu(i, &name##_cpus) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	spin_unlock(&name##_cpu_lock);					\
- }									\
- EXPORT_SYMBOL(name##_global_unlock_online);				\
-									\
  void name##_global_lock(void) {					\
 	int i;								\
 	preempt_disable();						\

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 2/3] brlocks/lglocks: API cleanups
  2012-05-07  5:46           ` Al Viro
  2012-05-08  3:59             ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
@ 2012-05-08  4:02             ` Rusty Russell
  2012-05-08  4:02             ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
  2 siblings, 0 replies; 45+ messages in thread
From: Rusty Russell @ 2012-05-08  4:02 UTC (permalink / raw)
  To: Al Viro
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Srivatsa S. Bhat, linux-fsdevel

From: Andi Kleen <ak@linux.intel.com>

lglocks and brlocks are currently generated with some complicated macros
in lglock.h.  But there's no reason to not just use common utility
functions and put all the data into a common data structure.

In preparation, this patch changes the API to look more like normal
function calls with pointers, not magic macros.

The patch is rather large because I move over all users in one go to keep
it bisectable.  This impacts the VFS somewhat in terms of lines changed,
but there is no actual behaviour change.
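
The shape of the change at each call site is simply (taken from the
fs/namespace.c hunks below):

	/* before: the lock name is a token pasted into generated functions */
	br_write_lock(vfsmount_lock);
	...
	br_write_unlock(vfsmount_lock);

	/* after: an ordinary call passing the address of the lock object */
	br_write_lock(&vfsmount_lock);
	...
	br_write_unlock(&vfsmount_lock);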

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/dcache.c         |    4 -
 fs/file_table.c     |   16 ++---
 fs/namei.c          |   24 ++++----
 fs/namespace.c      |  140 ++++++++++++++++++++++++++--------------------------
 fs/pnode.c          |    4 -
 fs/proc_namespace.c |    4 -
 6 files changed, 96 insertions(+), 96 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2489,7 +2489,7 @@ static int prepend_path(const struct pat
 	bool slash = false;
 	int error = 0;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -2520,7 +2520,7 @@ static int prepend_path(const struct pat
 		error = prepend(buffer, buflen, "/", 1);
 
 out:
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return error;
 
 global_root:
diff --git a/fs/file_table.c b/fs/file_table.c
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -422,9 +422,9 @@ static inline void __file_sb_list_add(st
  */
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-	lg_local_lock(files_lglock);
+	lg_local_lock(&files_lglock);
 	__file_sb_list_add(file, sb);
-	lg_local_unlock(files_lglock);
+	lg_local_unlock(&files_lglock);
 }
 
 /**
@@ -437,9 +437,9 @@ void file_sb_list_add(struct file *file,
 void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+		lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
 		list_del_init(&file->f_u.fu_list);
-		lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+		lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
 	}
 }
 
@@ -486,7 +486,7 @@ void mark_files_ro(struct super_block *s
 	struct file *f;
 
 retry:
-	lg_global_lock(files_lglock);
+	lg_global_lock(&files_lglock);
 	do_file_list_for_each_entry(sb, f) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -503,12 +503,12 @@ retry:
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
 		/* This can sleep, so we can't hold the spinlock. */
-		lg_global_unlock(files_lglock);
+		lg_global_unlock(&files_lglock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
 	} while_file_list_for_each_entry;
-	lg_global_unlock(files_lglock);
+	lg_global_unlock(&files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
@@ -526,6 +526,6 @@ void __init files_init(unsigned long mem
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
-	lg_lock_init(files_lglock);
+	lg_lock_init(&files_lglock, "files_lglock");
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/namei.c b/fs/namei.c
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -462,7 +462,7 @@ static int unlazy_walk(struct nameidata 
 	mntget(nd->path.mnt);
 
 	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
@@ -520,14 +520,14 @@ static int complete_walk(struct nameidat
 		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
 			spin_unlock(&dentry->d_lock);
 			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
+			br_read_unlock(&vfsmount_lock);
 			return -ECHILD;
 		}
 		BUG_ON(nd->inode != dentry->d_inode);
 		spin_unlock(&dentry->d_lock);
 		mntget(nd->path.mnt);
 		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -694,15 +694,15 @@ int follow_up(struct path *path)
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	parent = mnt->mnt_parent;
 	if (&parent->mnt == path->mnt) {
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -960,7 +960,7 @@ failed:
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
 	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return -ECHILD;
 }
 
@@ -1256,7 +1256,7 @@ static void terminate_walk(struct nameid
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 }
 
@@ -1515,7 +1515,7 @@ static int path_init(int dfd, const char
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 		} else {
@@ -1528,7 +1528,7 @@ static int path_init(int dfd, const char
 
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 			set_root_rcu(nd);
 		} else {
@@ -1541,7 +1541,7 @@ static int path_init(int dfd, const char
 			struct fs_struct *fs = current->fs;
 			unsigned seq;
 
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 
 			do {
@@ -1577,7 +1577,7 @@ static int path_init(int dfd, const char
 			if (fput_needed)
 				*fp = file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			br_read_lock(vfsmount_lock);
+			br_read_lock(&vfsmount_lock);
 			rcu_read_lock();
 		} else {
 			path_get(&file->f_path);
diff --git a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct moun
 {
 	int ret = 0;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct moun
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct mount *mnt)
 {
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct s
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct s
 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	return err;
 }
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path 
 {
 	struct mount *child_mnt;
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
 	if (child_mnt) {
 		mnt_add_count(child_mnt, 1);
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return &child_mnt->mnt;
 	} else {
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return NULL;
 	}
 }
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *
 	mnt->mnt.mnt_sb = root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mo
 		mnt->mnt.mnt_root = dget(root);
 		mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 		mnt->mnt_parent = mnt;
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 
 		if (flag & CL_SLAVE) {
 			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct moun
 {
 put_again:
 #ifdef CONFIG_SMP
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (likely(atomic_read(&mnt->mnt_longterm))) {
 		mnt_add_count(mnt, -1);
-		br_read_unlock(vfsmount_lock);
+		br_read_unlock(&vfsmount_lock);
 		return;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		return;
 	}
 #else
 	mnt_add_count(mnt, -1);
 	if (likely(mnt_get_count(mnt)))
 		return;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 #endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+
 	list_del(&mnt->mnt_instance);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	mntfree(mnt);
 }
 
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	real_mount(mnt)->mnt_pinned++;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	if (mnt->mnt_pinned) {
 		mnt_add_count(mnt, 1);
 		mnt->mnt_pinned--;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
 	BUG_ON(!m);
 
 	/* write lock needed for mnt_get_count */
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += mnt_get_count(p);
 		minimum_refs += 2;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	if (propagate_mount_busy(real_mount(mnt), 2))
 		ret = 0;
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *he
 			struct dentry *dentry;
 			struct mount *m;
 
-			br_write_lock(vfsmount_lock);
+			br_write_lock(&vfsmount_lock);
 			dentry = mnt->mnt_mountpoint;
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 			mnt->mnt_parent = mnt;
 			m->mnt_ghosts--;
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 			dput(dentry);
 			mntput(&m->mnt);
 		}
@@ -1112,12 +1113,12 @@ static int do_umount(struct mount *mnt, 
 		 * probably don't strictly need the lock here if we examined
 		 * all race cases, but it's a slowpath.
 		 */
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		if (mnt_get_count(mnt) != 2) {
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 			return -EBUSY;
 		}
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 
 		if (!xchg(&mnt->mnt_expiry_mark, 1))
 			return -EAGAIN;
@@ -1159,7 +1160,7 @@ static int do_umount(struct mount *mnt, 
 	}
 
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1171,7 +1172,7 @@ static int do_umount(struct mount *mnt, 
 			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1286,19 +1287,19 @@ struct mount *copy_tree(struct mount *mn
 			q = clone_mnt(p, p->mnt.mnt_root, flag);
 			if (!q)
 				goto Enomem;
-			br_write_lock(vfsmount_lock);
+			br_write_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &path);
-			br_write_unlock(vfsmount_lock);
+			br_write_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		umount_tree(res, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 	return NULL;
@@ -1318,9 +1319,9 @@ void drop_collected_mounts(struct vfsmou
 {
 	LIST_HEAD(umount_list);
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(real_mount(mnt), 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
@@ -1448,7 +1449,7 @@ static int attach_recursive_mnt(struct m
 	if (err)
 		goto out_cleanup_ids;
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1468,7 @@ static int attach_recursive_mnt(struct m
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	return 0;
 
@@ -1565,10 +1566,10 @@ static int do_change_type(struct path *p
 			goto out_unlock;
 	}
 
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
  out_unlock:
 	up_write(&namespace_sem);
@@ -1617,9 +1618,9 @@ static int do_loopback(struct path *path
 
 	err = graft_tree(mnt, path);
 	if (err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 out2:
 	unlock_mount(path);
@@ -1677,16 +1678,16 @@ static int do_remount(struct path *path,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		touch_mnt_namespace(mnt->mnt_ns);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 	}
 	return err;
 }
@@ -1893,9 +1894,9 @@ fail:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		down_write(&namespace_sem);
-		br_write_lock(vfsmount_lock);
+		br_write_lock(&vfsmount_lock);
 		list_del_init(&mnt->mnt_expire);
-		br_write_unlock(vfsmount_lock);
+		br_write_unlock(&vfsmount_lock);
 		up_write(&namespace_sem);
 	}
 	mntput(m);
@@ -1911,11 +1912,11 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 }
 EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1936,7 @@ void mark_mounts_for_expiry(struct list_
 		return;
 
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1954,7 +1955,7 @@ void mark_mounts_for_expiry(struct list_
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1, &umounts);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 
 	release_mounts(&umounts);
@@ -2218,9 +2219,9 @@ void mnt_make_shortterm(struct vfsmount 
 	struct mount *mnt = real_mount(m);
 	if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
 		return;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	atomic_dec(&mnt->mnt_longterm);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 #endif
 }
 
@@ -2249,10 +2250,9 @@ static struct mnt_namespace *dup_mnt_ns(
 		kfree(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
-	new_ns->root = new;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new->mnt_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2416,9 @@ bool is_path_reachable(struct mount *mnt
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2505,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* make sure we can reach put_old from new_root */
 	if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
 		goto out4;
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	/* mount old root on put_old */
@@ -2513,7 +2513,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	/* mount new_root on / */
 	attach_mnt(new_mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 out4:
@@ -2576,7 +2576,7 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
-	br_lock_init(vfsmount_lock);
+	br_lock_init(&vfsmount_lock);
 
 	err = sysfs_init();
 	if (err)
@@ -2596,9 +2596,9 @@ void put_mnt_ns(struct mnt_namespace *ns
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
diff --git a/fs/pnode.c b/fs/pnode.c
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt
 		prev_src_mnt  = child;
 	}
 out:
-	br_write_lock(vfsmount_lock);
+	br_write_lock(&vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct mount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	br_write_unlock(vfsmount_lock);
+	br_write_unlock(&vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file 
 
 	poll_wait(file, &p->ns->poll, wait);
 
-	br_read_lock(vfsmount_lock);
+	br_read_lock(&vfsmount_lock);
 	if (p->m.poll_event != ns->event) {
 		p->m.poll_event = ns->event;
 		res |= POLLERR | POLLPRI;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_read_unlock(&vfsmount_lock);
 
 	return res;
 }

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 3/3] brlocks/lglocks: turn into functions
  2012-05-07  5:46           ` Al Viro
  2012-05-08  3:59             ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
  2012-05-08  4:02             ` [PATCH 2/3] brlocks/lglocks: API cleanups Rusty Russell
@ 2012-05-08  4:02             ` Rusty Russell
  2 siblings, 0 replies; 45+ messages in thread
From: Rusty Russell @ 2012-05-08  4:02 UTC (permalink / raw)
  To: Al Viro
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Srivatsa S. Bhat, linux-fsdevel

From: Andi Kleen <ak@linux.intel.com>

lglocks and brlocks are currently generated with some complicated macros
in lglock.h.  But there's no reason to not just use common utility
functions and put all the data into a common data structure.

Since there are at least two users it makes sense to share this code in a
library.  This is also more maintainable than a macro forest.

This will also make it possible, later, to dynamically allocate lglocks
and to use them in modules (both would still need some additional, but
now straightforward, code).
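
None of that dynamic allocation is implemented here; as a rough sketch
only, it could later look something like this (lg_lock_alloc() and
lg_lock_free() are invented names, not part of this patch):

#include <linux/slab.h>
#include <linux/percpu.h>
#include <linux/lglock.h>

struct lglock *lg_lock_alloc(char *name)
{
	struct lglock *lg = kzalloc(sizeof(*lg), GFP_KERNEL);
	int i;

	if (!lg)
		return NULL;
	lg->lock = alloc_percpu(arch_spinlock_t);
	if (!lg->lock) {
		kfree(lg);
		return NULL;
	}
	/* mirror the static initializer used by DEFINE_LGLOCK() */
	for_each_possible_cpu(i)
		*per_cpu_ptr(lg->lock, i) =
			(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
	lg_lock_init(lg, name);
	return lg;
}

void lg_lock_free(struct lglock *lg)
{
	free_percpu(lg->lock);
	kfree(lg);
}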

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/file_table.c        |    1 
 fs/internal.h          |    2 
 include/linux/lglock.h |  125 ++++++++++---------------------------------------
 kernel/Makefile        |    2 
 kernel/lglock.c        |   89 ++++++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 102 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-DECLARE_LGLOCK(files_lglock);
 DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
diff --git a/fs/internal.h b/fs/internal.h
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(s
 
 extern void __init mnt_init(void);
 
-DECLARE_BRLOCK(vfsmount_lock);
+extern struct lglock vfsmount_lock;
 
 
 /*
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -23,26 +23,17 @@
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)	name##_lock_init()
-#define br_read_lock(name)	name##_local_lock()
-#define br_read_unlock(name)	name##_local_unlock()
-#define br_write_lock(name)	name##_global_lock()
-#define br_write_unlock(name)	name##_global_unlock()
+#define br_lock_init(name)	lg_lock_init(name, #name)
+#define br_read_lock(name)	lg_local_lock(name)
+#define br_read_unlock(name)	lg_local_unlock(name)
+#define br_write_lock(name)	lg_global_lock(name)
+#define br_write_unlock(name)	lg_global_unlock(name)
 
-#define DECLARE_BRLOCK(name)	DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)	DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)	name##_lock_init()
-#define lg_local_lock(name)	name##_local_lock()
-#define lg_local_unlock(name)	name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)	name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu)	name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)	name##_global_lock()
-#define lg_global_unlock(name)	name##_global_unlock()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
@@ -57,90 +48,26 @@
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
-
-#define DECLARE_LGLOCK(name)						\
- extern void name##_lock_init(void);					\
- extern void name##_local_lock(void);					\
- extern void name##_local_unlock(void);					\
- extern void name##_local_lock_cpu(int cpu);				\
- extern void name##_local_unlock_cpu(int cpu);				\
- extern void name##_global_lock(void);					\
- extern void name##_global_unlock(void);				\
+struct lglock {
+	arch_spinlock_t __percpu *lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key lock_key;
+	struct lockdep_map    lock_dep_map;
+#endif
+};
 
 #define DEFINE_LGLOCK(name)						\
-									\
- DEFINE_SPINLOCK(name##_cpu_lock);					\
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
- DEFINE_LGLOCK_LOCKDEP(name);						\
-									\
- void name##_lock_init(void) {						\
-	int i;								\
-	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_lock_init);					\
-									\
- void name##_local_lock(void) {						\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock);					\
-									\
- void name##_local_unlock(void) {					\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &__get_cpu_var(name##_lock);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock);					\
-									\
- void name##_local_lock_cpu(int cpu) {					\
-	arch_spinlock_t *lock;						\
-	preempt_disable();						\
-	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);	\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_lock(lock);						\
- }									\
- EXPORT_SYMBOL(name##_local_lock_cpu);					\
-									\
- void name##_local_unlock_cpu(int cpu) {				\
-	arch_spinlock_t *lock;						\
-	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);		\
-	lock = &per_cpu(name##_lock, cpu);				\
-	arch_spin_unlock(lock);						\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_local_unlock_cpu);				\
-									\
- void name##_global_lock(void) {					\
-	int i;								\
-	preempt_disable();						\
-	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_lock(lock);					\
-	}								\
- }									\
- EXPORT_SYMBOL(name##_global_lock);					\
-									\
- void name##_global_unlock(void) {					\
-	int i;								\
-	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_possible_cpu(i) {					\
-		arch_spinlock_t *lock;					\
-		lock = &per_cpu(name##_lock, i);			\
-		arch_spin_unlock(lock);					\
-	}								\
-	preempt_enable();						\
- }									\
- EXPORT_SYMBOL(name##_global_unlock);
+	DEFINE_LGLOCK_LOCKDEP(name);					\
+	DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)			\
+	= __ARCH_SPIN_LOCK_UNLOCKED;					\
+	struct lglock name = { .lock = &name ## _lock }
+
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o
+	    async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+	int i;
+
+	preempt_disable();
+	rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+	int i;
+
+	rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
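
Following up on the changelog note about dynamic allocation: a rough,
purely hypothetical sketch (lg_lock_alloc()/lg_lock_free() do not exist in
this patch) of the small amount of extra code that would let modules carry
their own lglocks:

	static int lg_lock_alloc(struct lglock *lg, char *name)
	{
		int i;

		lg->lock = alloc_percpu(arch_spinlock_t);
		if (!lg->lock)
			return -ENOMEM;
		for_each_possible_cpu(i)
			*per_cpu_ptr(lg->lock, i) =
				(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
		lg_lock_init(lg, name);		/* lockdep setup only */
		return 0;
	}

	static void lg_lock_free(struct lglock *lg)
	{
		free_percpu(lg->lock);
	}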

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 1/3] lglock: remove online variants of lock
  2012-05-08  3:59             ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
@ 2012-05-08  4:50               ` Al Viro
  2012-05-08  6:12                 ` Rusty Russell
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2012-05-08  4:50 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Srivatsa S. Bhat, linux-fsdevel

On Tue, May 08, 2012 at 01:29:45PM +0930, Rusty Russell wrote:
> Optimizing the slow paths adds a lot of complexity.  If you need to
> grab every lock often, you have other problems.

Applied, but I'm not too happy about the situation with vfsmount_lock ;-/
On kernels built for a lot of possible CPUs, the loss of the ..._online()
versions will get painful.  OTOH, we can always put the map + single
spinlock + single notifier into lglock.c and reproduce the old logic.
I'll do a patch along those lines and put it on a separate branch;
then we'll be able to test and compare.

Contention from the cpu map spinlock becoming shared between different
lglocks (all two of them) is not an issue - we never use the files_lock
one anyway (there we can't use the _online variants).
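
A rough sketch of that idea -- one shared map, spinlock and hotplug
notifier in kernel/lglock.c, so the global side can walk only the CPUs
that have actually been brought up (for_each_cpu() over the map) rather
than every possible CPU.  All names here are made up; nothing like this
is in the posted patches:

	static DEFINE_SPINLOCK(lg_cpu_lock);
	static struct cpumask lg_cpus_seen;

	static int lg_cpu_callback(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
	{
		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_UP_PREPARE:
			spin_lock(&lg_cpu_lock);
			cpumask_set_cpu((unsigned long)hcpu, &lg_cpus_seen);
			spin_unlock(&lg_cpu_lock);
			break;
		case CPU_UP_CANCELED:
		case CPU_DEAD:
			spin_lock(&lg_cpu_lock);
			cpumask_clear_cpu((unsigned long)hcpu, &lg_cpus_seen);
			spin_unlock(&lg_cpu_lock);
			break;
		}
		return NOTIFY_OK;
	}

The subtle part is that an _online-style global lock would also have to
hold lg_cpu_lock across the whole lock/unlock pair, so the map cannot
change while some of the per-cpu locks are held.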

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 1/3] lglock: remove online variants of lock
  2012-05-08  4:50               ` Al Viro
@ 2012-05-08  6:12                 ` Rusty Russell
  0 siblings, 0 replies; 45+ messages in thread
From: Rusty Russell @ 2012-05-08  6:12 UTC (permalink / raw)
  To: Al Viro
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Srivatsa S. Bhat, linux-fsdevel

On Tue, 8 May 2012 05:50:44 +0100, Al Viro <viro@ZenIV.linux.org.uk> wrote:
> On Tue, May 08, 2012 at 01:29:45PM +0930, Rusty Russell wrote:
> > Optimizing the slow paths adds a lot of complexity.  If you need to
> > grab every lock often, you have other problems.
> 
> Applied, but I'm not too happy about the situation with vfsmount_lock ;-/
> On kernels built for a lot of possible CPUs the loss of ..._online()
> versions will get painful.

My original motivation was to get rid of that cpumask_t (and replace it
with a cpumask_var_t).  A simple enough patch, but I couldn't bring
myself to leave that complex logic in place without clear justification.

> OTOH, we can always put the map + single
> spinlock + single notifier into lglock.c and reproduce the old logics.
> I'll do a patch along those lines and put it on a separate branch;
> then we'll be able to test and compare.

I'll be interested in the results: virtual systems are classic for
wanting large # of CPUs which aren't actually online, so we might
actually care.

I'd also like to get rid of the cpu_possible_map altogether, and just
have NR_CPUS/nr_cpu_ids, since last I checked no arch really wants
sparse numbers.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/3] brlocks/lglocks: turn into functions
  2012-05-07  3:39         ` Rusty Russell
@ 2012-05-09  7:35             ` Nick Piggin
  2012-05-09  7:35             ` Nick Piggin
  1 sibling, 0 replies; 45+ messages in thread
From: Nick Piggin @ 2012-05-09  7:35 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Nick Piggin, Andi Kleen, Andrew Morton, linux-kernel,
	Alexander Viro, Srivatsa S. Bhat, linux-fsdevel

On 7 May 2012 13:39, Rusty Russell <rusty@rustcorp.com.au> wrote:
> On Fri, 20 Apr 2012 21:21:49 +1000, Nick Piggin <npiggin@kernel.dk> wrote:
>> This still not merged?
>
> No, I've been away.  I've put it in -next for tomorrow, though I'm not
> sure of the best way to get it to Linus for the next merge window.
>
>> There is a reason, which is performance. Extra function call, but also
>> IIRC the percpu accessor was not so fast doing it this way. Maybe
>> that's improved...
>>
>> So what's the performance difference?
>
> What benchmarks do you usually run?  Feel free to try it out and report
> back; I only have small hardware here.

Nothing big.  The actual scalability should be unchanged, because you're
not going to be dirtying any shared cachelines.  It's the use of the
dynamic percpu allocator, out-of-line calls, etc. that will slow down
straight-line performance.

How many microseconds does it take to stat("./dir1/dir2/myfile"), for
example?
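
A throwaway userspace sketch for that first measurement (the path and
iteration count are placeholders; on older glibc, link with -lrt for
clock_gettime()):

	#include <stdio.h>
	#include <sys/stat.h>
	#include <time.h>

	int main(void)
	{
		struct stat st;
		struct timespec a, b;
		const long iters = 1000000;
		long i;

		clock_gettime(CLOCK_MONOTONIC, &a);
		for (i = 0; i < iters; i++)
			stat("./dir1/dir2/myfile", &st);
		clock_gettime(CLOCK_MONOTONIC, &b);

		printf("%.3f us per stat()\n",
		       ((b.tv_sec - a.tv_sec) * 1e9 +
			(b.tv_nsec - a.tv_nsec)) / iters / 1000.0);
		return 0;
	}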

Or time a `git diff` of a large tree, for something more realistic that
will still exercise that path.

^ permalink raw reply	[flat|nested] 45+ messages in thread

end of thread, other threads:[~2012-05-09  7:35 UTC | newest]

Thread overview: 45+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-27 23:22 [PATCH] cpumask: fix lg_lock/br_lock Rusty Russell
2012-02-27 23:53 ` Andrew Morton
2012-02-27 23:53   ` Andrew Morton
2012-02-28  8:43   ` Ingo Molnar
2012-02-28 11:25     ` Andi Kleen
2012-02-28 12:51       ` Ingo Molnar
2012-02-28 21:27     ` Andrew Morton
2012-02-29  5:44       ` Srivatsa S. Bhat
2012-02-29  9:17         ` Ingo Molnar
2012-02-29 11:12           ` Srivatsa S. Bhat
2012-03-01  7:38             ` Ingo Molnar
2012-03-01  9:15               ` Srivatsa S. Bhat
2012-03-01  9:45                 ` Ingo Molnar
2012-03-01  9:56                   ` Srivatsa S. Bhat
2012-03-01  8:12             ` Srivatsa S. Bhat
2012-03-01  8:24               ` Srivatsa S. Bhat
2012-03-01  8:12               ` Srivatsa S. Bhat
2012-03-01  8:15               ` [PATCH 1/3] CPU hotplug: Fix issues with callback registration Srivatsa S. Bhat
2012-03-01  8:27                 ` Srivatsa S. Bhat
2012-03-01  8:15                 ` Srivatsa S. Bhat
2012-03-01  8:16               ` [PATCH 2/3] CPU hotplug, arch/powerpc: Fix CPU hotplug " Srivatsa S. Bhat
2012-03-01  8:28                 ` Srivatsa S. Bhat
2012-03-01  8:16                 ` Srivatsa S. Bhat
2012-03-01  8:18               ` [PATCH 3/3] CPU hotplug, arch/sparc: " Srivatsa S. Bhat
2012-03-01  8:30                 ` Srivatsa S. Bhat
2012-03-01  8:18                 ` Srivatsa S. Bhat
2012-02-29  8:29       ` [PATCH] cpumask: fix lg_lock/br_lock Ingo Molnar
2012-02-29  8:58         ` Peter Zijlstra
2012-02-29  9:32           ` Ingo Molnar
2012-02-28 11:24   ` Andi Kleen
2012-03-05  7:02     ` Rusty Russell
2012-03-05  7:03     ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
2012-04-20 11:12       ` Nick Piggin
2012-03-05  7:04     ` [PATCH 2/3] brlocks/lglocks: API cleanups Rusty Russell
2012-03-05  7:05     ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
2012-04-20 11:21       ` Nick Piggin
2012-05-07  3:39         ` Rusty Russell
2012-05-07  5:46           ` Al Viro
2012-05-08  3:59             ` [PATCH 1/3] lglock: remove online variants of lock Rusty Russell
2012-05-08  4:50               ` Al Viro
2012-05-08  6:12                 ` Rusty Russell
2012-05-08  4:02             ` [PATCH 2/3] brlocks/lglocks: API cleanups Rusty Russell
2012-05-08  4:02             ` [PATCH 3/3] brlocks/lglocks: turn into functions Rusty Russell
2012-05-09  7:35           ` Nick Piggin
2012-05-09  7:35             ` Nick Piggin
