* [PATCH] track number of mnts writing to superblocks
From: Dave Hansen @ 2008-01-02 21:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, miklos, hch, serue, Dave Hansen


One of the benefits of the r/o bind mount patches is that they
make it explicit when a write to a superblock might occur.
We currently search sb->s_files when remounting rw->ro to look
for writable files.  But, that search is not comprehensive, and
it is racy.  This replaces that search.

The idea is to keep a bit in each mount saying whether the
mount has any writers on it.  When the bit is set the first
time, we also increment a counter in the superblock.  That
sb counter is the number of mounts with that bit set and
thus, potential writers.
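
In simplified form, the accounting looks like this (locking
elided, and note_mnt_writer() is just a name for this sketch;
the real code is in the diff below):

static void note_mnt_writer(struct vfsmount *mnt)
{
	/*
	 * Only the first writer on each mount pays for the
	 * superblock increment; later writers see the bit
	 * already set and do nothing.
	 */
	if (!test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS),
			      &mnt->mnt_flags))
		atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
}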

The other problem is that, after we make this check for
the number of writable mounts, we need to exclude all new
writers on those mounts.  We do this by requiring that the
superblock mnt writer count be incremented under a
lock_super() and also holding that lock over the remount
operation.  Effectively, this keeps us from *adding* to
the sb's writable mounts during a remount.
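
Schematically, the two sides of that handshake look like this
(helper names made up for illustration; error handling and the
per-cpu locking are elided):

static void writer_side_slow_path(struct vfsmount *mnt)
{
	lock_super(mnt->mnt_sb);	/* serializes with remount */
	atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
	unlock_super(mnt->mnt_sb);
}

static int remount_side(struct super_block *sb)
{
	lock_super(sb);
	if (atomic_read(&sb->__s_mnt_writers)) {
		unlock_super(sb);
		return -EBUSY;	/* some mount may have writers */
	}
	/* ... remount r/o, still holding the lock ... */
	unlock_super(sb);
	return 0;
}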

The alternative to doing this is to keep a much simpler list
of mounts for each superblock.  I could also code that up
to see what it looks like.  Shouldn't be too bad.
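
It would be roughly this shape (sb->s_mounts, s_mounts_lock,
and mnt_instance are made up for the sketch; nothing like
them exists today):

static int fs_may_remount_ro(struct super_block *sb)
{
	struct vfsmount *mnt;
	int ret = 1;

	spin_lock(&sb->s_mounts_lock);
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (atomic_read(&mnt->__mnt_writers)) {
			ret = 0;
			break;
		}
	}
	spin_unlock(&sb->s_mounts_lock);
	return ret;
}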


Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 linux-2.6.git-dave/fs/file_table.c       |   24 -----
 linux-2.6.git-dave/fs/namespace.c        |  134 +++++++++++++++++++++++++------
 linux-2.6.git-dave/fs/super.c            |   61 +++++++++++---
 linux-2.6.git-dave/include/linux/fs.h    |    5 -
 linux-2.6.git-dave/include/linux/mount.h |    3 
 5 files changed, 163 insertions(+), 64 deletions(-)

diff -puN fs/file_table.c~track_sb_mnt_writers fs/file_table.c
--- linux-2.6.git/fs/file_table.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/fs/file_table.c	2008-01-02 10:49:11.000000000 -0800
@@ -374,30 +374,6 @@ void file_kill(struct file *file)
 	}
 }
 
-int fs_may_remount_ro(struct super_block *sb)
-{
-	struct file *file;
-
-	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
-		struct inode *inode = file->f_path.dentry->d_inode;
-
-		/* File with pending delete? */
-		if (inode->i_nlink == 0)
-			goto too_bad;
-
-		/* Writeable file? */
-		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
-			goto too_bad;
-	}
-	file_list_unlock();
-	return 1; /* Tis' cool bro. */
-too_bad:
-	file_list_unlock();
-	return 0;
-}
-
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
diff -puN fs/file_table.c.orig~track_sb_mnt_writers fs/file_table.c.orig
diff -puN fs/namespace.c~track_sb_mnt_writers fs/namespace.c
--- linux-2.6.git/fs/namespace.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c	2008-01-02 13:39:52.000000000 -0800
@@ -118,7 +118,7 @@ struct mnt_writer {
 	 * If holding multiple instances of this lock, they
 	 * must be ordered by cpu number.
 	 */
-	spinlock_t lock;
+	struct mutex lock;
 	struct lock_class_key lock_class; /* compiles out with !lockdep */
 	unsigned long count;
 	struct vfsmount *mnt;
@@ -130,7 +130,7 @@ static int __init init_mnt_writers(void)
 	int cpu;
 	for_each_possible_cpu(cpu) {
 		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
+		mutex_init(&writer->lock);
 		lockdep_set_class(&writer->lock, &writer->lock_class);
 		writer->count = 0;
 	}
@@ -145,11 +145,52 @@ static void mnt_unlock_cpus(void)
 
 	for_each_possible_cpu(cpu) {
 		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		mutex_unlock(&cpu_writer->lock);
 	}
 }
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+static int mark_mnt_has_writer(struct vfsmount *mnt,
+			       struct mnt_writer *cpu_writer)
+{
+	/*
+	 * Ensure that if there are people racing to set
+	 * the bit that only one of them succeeds and can
+	 * increment the sb count.
+	 */
+	if (test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS), &mnt->mnt_flags))
+		return 0;
+	if (cpu_writer == NULL)
+		return 0;
+
+	/*
+	 * Our goal here is to get exclusion from a superblock
+	 * remount operation (fs_may_remount_ro()).  This is
+	 * effectively a slow path that we must go through the
+	 * first time we set the bit on each mount, but never
+	 * again unless the writer counts get coalesced.
+	 */
+
+	mutex_unlock(&cpu_writer->lock);
+	lock_super(mnt->mnt_sb);
+
+	atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
+
+	unlock_super(mnt->mnt_sb);
+	mutex_lock(&cpu_writer->lock);
+	return -EAGAIN;
+}
+
+static void __mark_sb_if_writable(struct vfsmount *mnt)
+{
+	int bitnr = ilog2(MNT_MAY_HAVE_WRITERS);
+
+	if (atomic_read(&mnt->__mnt_writers))
+		mark_mnt_has_writer(mnt, NULL);
+	else if (test_and_clear_bit(bitnr, &mnt->mnt_flags))
+		atomic_dec(&mnt->mnt_sb->__s_mnt_writers);
+}
+
+static inline void __move_cpu_mnt_count(struct mnt_writer *cpu_writer)
 {
 	if (!cpu_writer->mnt)
 		return;
@@ -164,7 +205,7 @@ static inline void use_cpu_writer_for_mo
 {
 	if (cpu_writer->mnt == mnt)
 		return;
-	__clear_mnt_count(cpu_writer);
+	__move_cpu_mnt_count(cpu_writer);
 	cpu_writer->mnt = mnt;
 }
 
@@ -188,20 +229,31 @@ static inline void use_cpu_writer_for_mo
  */
 int mnt_want_write(struct vfsmount *mnt)
 {
-	int ret = 0;
+	int ret;
 	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	/*
+	 * It is OK to not disable preemption here.  We
+	 * only use the per-cpu vars to reduce cache
+	 * effects, and not for any real exclusion.
+	 * We use the mutexes for that.
+	 */
+	cpu_writer = &__get_cpu_var(mnt_writers);
+	mutex_lock(&cpu_writer->lock);
+again:
+	ret = 0;
 	if (__mnt_is_readonly(mnt)) {
 		ret = -EROFS;
 		goto out;
 	}
+	ret = mark_mnt_has_writer(mnt, cpu_writer);
+	/* did we hit the slow path and re-do the lock? */
+	if (ret == -EAGAIN)
+		goto again;
 	use_cpu_writer_for_mount(cpu_writer, mnt);
 	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	mutex_unlock(&cpu_writer->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -213,13 +265,37 @@ static void lock_and_coalesce_cpu_mnt_wr
 
 	for_each_possible_cpu(cpu) {
 		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
+		mutex_lock(&cpu_writer->lock);
+		__move_cpu_mnt_count(cpu_writer);
+		/*
+		 * __mnt_writers may temporarily hit zero
+		 * (and trigger MNT_MAY_HAVE_WRITERS to get
+		 * cleared), but it will get set again if
+		 * and when another mnt_writer[] has an
+		 * entry for that mnt later in this loop.
+		 * Holding the locks keeps anyone from
+		 * seeing this.
+		 */
+		__mark_sb_if_writable(cpu_writer->mnt);
 		cpu_writer->mnt = NULL;
 	}
 }
 
 /*
+ * This is just an external interface.  I want
+ * to use the long names in here, but leave the
+ * simpler names for external users.
+ */
+void lock_mnt_writers(void)
+{
+	lock_and_coalesce_cpu_mnt_writer_counts();
+}
+void unlock_mnt_writers(void)
+{
+	mnt_unlock_cpus();
+}
+
+/*
  * These per-cpu write counts are not guaranteed to have
  * matched increments and decrements on any given cpu.
  * A file open()ed for write on one cpu and close()d on
@@ -262,23 +338,40 @@ static void handle_write_count_underflow
  * performing writes to it.  Must be matched with
  * mnt_want_write() call above.
  */
+#define MNT_WRITER_UNDERFLOW_LIMIT 1024
 void mnt_drop_write(struct vfsmount *mnt)
 {
 	int must_check_underflow = 0;
 	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+retry:
+	cpu_writer = &__get_cpu_var(mnt_writers);
+	mutex_lock(&cpu_writer->lock);
 
 	use_cpu_writer_for_mount(cpu_writer, mnt);
 	if (cpu_writer->count > 0) {
 		cpu_writer->count--;
 	} else {
-		must_check_underflow = 1;
+		/*
+		 * Without this check, it is theoretically
+		 * possible for a large number of processes
+		 * to atomic_dec(__mnt_writers) and cause
+		 * it to underflow.  Because we are under
+		 * the mutex (and there are NR_CPUS
+		 * mutexes), this limits the number of
+		 * concurrent decrements and underflow
+		 * level to NR_CPUS.
+		 */
+		if (atomic_read(&mnt->__mnt_writers) <
+			MNT_WRITER_UNDERFLOW_LIMIT) {
+			mutex_unlock(&cpu_writer->lock);
+			goto retry;
+		}
 		atomic_dec(&mnt->__mnt_writers);
+		must_check_underflow = 1;
 	}
 
-	spin_unlock(&cpu_writer->lock);
+	mutex_unlock(&cpu_writer->lock);
 	/*
 	 * Logically, we could call this each time,
 	 * but the __mnt_writers cacheline tends to
@@ -286,15 +379,6 @@ void mnt_drop_write(struct vfsmount *mnt
 	 */
 	if (must_check_underflow)
 		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption.  However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow.  Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
diff -puN fs/namespace.c.orig~track_sb_mnt_writers fs/namespace.c.orig
diff -puN include/linux/fs.h~track_sb_mnt_writers include/linux/fs.h
--- linux-2.6.git/include/linux/fs.h~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fs.h	2008-01-02 10:49:11.000000000 -0800
@@ -1005,6 +1005,9 @@ struct super_block {
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
 #endif
+	atomic_t		__s_mnt_writers;/* how many mounts of this sb
+						 * might have writers.  Only
+						 * stable with lock_mnt_writers() */
 	struct xattr_handler	**s_xattr;
 
 	struct list_head	s_inodes;	/* all inodes */
@@ -1650,8 +1653,6 @@ extern const struct file_operations read
 extern const struct file_operations write_fifo_fops;
 extern const struct file_operations rdwr_fifo_fops;
 
-extern int fs_may_remount_ro(struct super_block *);
-
 #ifdef CONFIG_BLOCK
 /*
  * return READ, READA, or WRITE
diff -puN include/linux/fs.h.orig~track_sb_mnt_writers include/linux/fs.h.orig
diff -puN include/linux/mount.h~track_sb_mnt_writers include/linux/mount.h
--- linux-2.6.git/include/linux/mount.h~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/include/linux/mount.h	2008-01-02 10:49:11.000000000 -0800
@@ -33,6 +33,7 @@ struct mnt_namespace;
 
 #define MNT_SHRINKABLE	0x100
 #define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
+#define MNT_MAY_HAVE_WRITERS		0x400 /* did this ever have a writer? */
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -80,6 +81,8 @@ static inline struct vfsmount *mntget(st
 
 extern int mnt_want_write(struct vfsmount *mnt);
 extern void mnt_drop_write(struct vfsmount *mnt);
+extern void lock_mnt_writers(void);
+extern void unlock_mnt_writers(void);
 extern void mntput_no_expire(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
 extern void mnt_unpin(struct vfsmount *mnt);
diff -puN include/linux/mount.h.orig~track_sb_mnt_writers include/linux/mount.h.orig
diff -puN fs/super.c~track_sb_mnt_writers fs/super.c
--- linux-2.6.git/fs/super.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
+++ linux-2.6.git-dave/fs/super.c	2008-01-02 13:38:15.000000000 -0800
@@ -574,6 +574,41 @@ static void mark_files_ro(struct super_b
 	file_list_unlock();
 }
 
+static inline int fs_may_remount_ro(struct super_block *sb)
+{
+	int ret = 1;
+	lock_mnt_writers();
+	if (atomic_read(&sb->__s_mnt_writers))
+		ret = 0;
+	unlock_mnt_writers();
+	return ret;
+}
+
+/*
+ * This unconditionally acquires the sb lock,
+ * even if it returns an error code, and should
+ * not be used generally.
+ *
+ * This must be done to ensure that no new
+ * writers come in on the mounts after the
+ * check is performed.
+ */
+static int __try_to_make_sb_ro(struct super_block *sb, int force)
+{
+	int retval = 0;
+
+	if (force) {
+		mark_files_ro(sb);
+		lock_super(sb);
+	} else {
+		lock_super(sb);
+		/* Make sure there are no mounts that have writers */
+		if (!fs_may_remount_ro(sb))
+			retval = -EBUSY;
+	}
+	return retval;
+}
+
 /**
  *	do_remount_sb - asks filesystem to change mount options.
  *	@sb:	superblock in question
@@ -585,7 +620,7 @@ static void mark_files_ro(struct super_b
  */
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
-	int retval;
+	int retval = 0;
 	
 #ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
@@ -596,23 +631,23 @@ int do_remount_sb(struct super_block *sb
 	shrink_dcache_sb(sb);
 	fsync_super(sb);
 
-	/* If we are remounting RDONLY and current sb is read/write,
-	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
-		if (force)
-			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
-			return -EBUSY;
-	}
-
-	if (sb->s_op->remount_fs) {
+	/* Are we remounting RDONLY when current sb is read/write? */
+	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
+		retval = __try_to_make_sb_ro(sb, force);
+	else
 		lock_super(sb);
+
+	if (retval)
+		goto out;
+	if (sb->s_op->remount_fs)
 		retval = sb->s_op->remount_fs(sb, &flags, data);
+out:
+	if (retval) {
 		unlock_super(sb);
-		if (retval)
-			return retval;
+		return retval;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	unlock_super(sb);
 	return 0;
 }
 
diff -puN block/cfq-iosched.c~track_sb_mnt_writers block/cfq-iosched.c
diff -puN fs/namei.c~track_sb_mnt_writers fs/namei.c
diff -puN fs/buffer.c~track_sb_mnt_writers fs/buffer.c
diff -puN kernel/Makefile~track_sb_mnt_writers kernel/Makefile
_


* Re: [PATCH] track number of mnts writing to superblocks
From: Serge E. Hallyn @ 2008-01-03  4:02 UTC (permalink / raw)
  To: Dave Hansen; +Cc: akpm, linux-kernel, miklos, hch, serue

Quoting Dave Hansen (haveblue@us.ibm.com):
> 
> One of the benefits of the r/o bind mount patches is that they
> make it explicit when a write to a superblock might occur.
> We currently search sb->s_files when remounting rw->ro to look
> for writable files.  But, that search is not comprehensive, and
> it is racy.  This replaces that search.
> 
> The idea is to keep a bit in each mount saying whether the
> mount has any writers on it.  When the bit is set the first
> time, we also increment a counter in the superblock.  That
> sb counter is the number of mounts with that bit set and
> thus, potential writers.
> 
> The other problem is that, after we make this check for
> the number of writable mounts, we need to exclude all new
> writers on those mounts.  We do this by requiring that the
> superblock mnt writer count be incremented under a
> lock_super() and also holding that lock over the remount
> operation.  Effectively, this keeps us from *adding* to
> the sb's writable mounts during a remount.
> 
> The alternative to doing this is to keep a much simpler list
> of mounts for each superblock.  I could also code that up
> to see what it looks like.  Shouldn't be too bad.

Ok I'm blabbing quite a bit here while trying to figure out
the patch, and maybe there are some useful hints for where more
comments would be useful.  But other than the fact that
mark_mnt_has_writer() needs to do the atomic_inc() even if
cpu_writer was passed in as NULL, the patch seems good.

thanks,
-serge

> Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> ---
> 
>  linux-2.6.git-dave/fs/file_table.c       |   24 -----
>  linux-2.6.git-dave/fs/namespace.c        |  134 +++++++++++++++++++++++++------
>  linux-2.6.git-dave/fs/super.c            |   61 +++++++++++---
>  linux-2.6.git-dave/include/linux/fs.h    |    5 -
>  linux-2.6.git-dave/include/linux/mount.h |    3 
>  5 files changed, 163 insertions(+), 64 deletions(-)
> 
> diff -puN fs/file_table.c~track_sb_mnt_writers fs/file_table.c
> --- linux-2.6.git/fs/file_table.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> +++ linux-2.6.git-dave/fs/file_table.c	2008-01-02 10:49:11.000000000 -0800
> @@ -374,30 +374,6 @@ void file_kill(struct file *file)
>  	}
>  }
> 
> -int fs_may_remount_ro(struct super_block *sb)
> -{
> -	struct file *file;
> -
> -	/* Check that no files are currently opened for writing. */
> -	file_list_lock();
> -	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
> -		struct inode *inode = file->f_path.dentry->d_inode;
> -
> -		/* File with pending delete? */
> -		if (inode->i_nlink == 0)
> -			goto too_bad;
> -
> -		/* Writeable file? */
> -		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))

(why did this originally skip directories?)

> -			goto too_bad;
> -	}
> -	file_list_unlock();
> -	return 1; /* Tis' cool bro. */
> -too_bad:
> -	file_list_unlock();
> -	return 0;
> -}
> -
>  void __init files_init(unsigned long mempages)
>  { 
>  	int n; 
> diff -puN fs/file_table.c.orig~track_sb_mnt_writers fs/file_table.c.orig
> diff -puN fs/namespace.c~track_sb_mnt_writers fs/namespace.c
> --- linux-2.6.git/fs/namespace.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> +++ linux-2.6.git-dave/fs/namespace.c	2008-01-02 13:39:52.000000000 -0800
> @@ -118,7 +118,7 @@ struct mnt_writer {
>  	 * If holding multiple instances of this lock, they
>  	 * must be ordered by cpu number.
>  	 */
> -	spinlock_t lock;
> +	struct mutex lock;
>  	struct lock_class_key lock_class; /* compiles out with !lockdep */
>  	unsigned long count;
>  	struct vfsmount *mnt;
> @@ -130,7 +130,7 @@ static int __init init_mnt_writers(void)
>  	int cpu;
>  	for_each_possible_cpu(cpu) {
>  		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
> -		spin_lock_init(&writer->lock);
> +		mutex_init(&writer->lock);
>  		lockdep_set_class(&writer->lock, &writer->lock_class);
>  		writer->count = 0;
>  	}
> @@ -145,11 +145,52 @@ static void mnt_unlock_cpus(void)
> 
>  	for_each_possible_cpu(cpu) {
>  		cpu_writer = &per_cpu(mnt_writers, cpu);
> -		spin_unlock(&cpu_writer->lock);
> +		mutex_unlock(&cpu_writer->lock);
>  	}
>  }
> 
> -static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
> +static int mark_mnt_has_writer(struct vfsmount *mnt,
> +			       struct mnt_writer *cpu_writer)
> +{
> +	/*
> +	 * Ensure that if there are people racing to set
> +	 * the bit that only one of them succeeds and can
> +	 * increment the sb count.
> +	 */
> +	if (test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS), &mnt->mnt_flags))
> +		return 0;

Comment isn't entirely clear, but you're returning 0 here because
someone else has already set the flag and incremented
sb->__s_mnt_writers so you don't have to and you're all set to go on
with writing?

> +	if (cpu_writer == NULL)
> +		return 0;
> +
> +	/*
> +	 * Our goal here is to get exclusion from a superblock
> +	 * remount operation (fs_may_remount_ro()).  This is
> +	 * effectively a slow path that we must go through the
> +	 * first time we set the bit on each mount, but never
> +	 * again unless the writer counts get coalesced.
> +	 */
> +
> +	mutex_unlock(&cpu_writer->lock);
> +	lock_super(mnt->mnt_sb);
> +
> +	atomic_inc(&mnt->mnt_sb->__s_mnt_writers);

The atomic_inc of course should be done even if cpu_writer was passed
in as NULL; you just don't need to do the locking then, and can
return 0 in that case?

> +
> +	unlock_super(mnt->mnt_sb);
> +	mutex_lock(&cpu_writer->lock);
> +	return -EAGAIN;
> +}
> +
> +static void __mark_sb_if_writable(struct vfsmount *mnt)

This function is taking the writable state from a mnt and marking it in
the sb.  So the name should be a shorter version of something like
"commit_mnt_writable_state_to_sb"

> +{
> +	int bitnr = ilog2(MNT_MAY_HAVE_WRITERS);
> +
> +	if (atomic_read(&mnt->__mnt_writers))
> +		mark_mnt_has_writer(mnt, NULL);
> +	else if (test_and_clear_bit(bitnr, &mnt->mnt_flags))
> +		atomic_dec(&mnt->mnt_sb->__s_mnt_writers);

And after staring at this code for a while it did make sense, but a
comment above it would help, saying something like:

	/* If mnt has writers, mark_mnt_has_writer() will make
	   sure that is marked in the sb.  If mnt has no writers,
	   then if mnt->mnt_flags was previously set, that means
	   that mnt->mnt_sb->__s_mnt_writers reflects our having
	   writers, so we decrement it
	 */

Ok, maybe it's clearer to other people and the comment isn't needed...

> +}
> +
> +static inline void __move_cpu_mnt_count(struct mnt_writer *cpu_writer)
>  {
>  	if (!cpu_writer->mnt)
>  		return;
> @@ -164,7 +205,7 @@ static inline void use_cpu_writer_for_mo
>  {
>  	if (cpu_writer->mnt == mnt)
>  		return;
> -	__clear_mnt_count(cpu_writer);
> +	__move_cpu_mnt_count(cpu_writer);
>  	cpu_writer->mnt = mnt;
>  }
> 
> @@ -188,20 +229,31 @@ static inline void use_cpu_writer_for_mo
>   */
>  int mnt_want_write(struct vfsmount *mnt)
>  {
> -	int ret = 0;
> +	int ret;
>  	struct mnt_writer *cpu_writer;
> 
> -	cpu_writer = &get_cpu_var(mnt_writers);
> -	spin_lock(&cpu_writer->lock);
> +	/*
> +	 * It is OK to not disable preemption here.  We
> +	 * only use the per-cpu vars to reduce cache
> +	 * effects, and not for any real exclusion.
> +	 * We use the mutexes for that.
> +	 */
> +	cpu_writer = &__get_cpu_var(mnt_writers);
> +	mutex_lock(&cpu_writer->lock);
> +again:
> +	ret = 0;
>  	if (__mnt_is_readonly(mnt)) {
>  		ret = -EROFS;
>  		goto out;
>  	}
> +	ret = mark_mnt_has_writer(mnt, cpu_writer);
> +	/* did we hit the slow path and re-do the lock? */
> +	if (ret == -EAGAIN)
> +		goto again;
>  	use_cpu_writer_for_mount(cpu_writer, mnt);
>  	cpu_writer->count++;
>  out:
> -	spin_unlock(&cpu_writer->lock);
> -	put_cpu_var(mnt_writers);
> +	mutex_unlock(&cpu_writer->lock);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(mnt_want_write);
> @@ -213,13 +265,37 @@ static void lock_and_coalesce_cpu_mnt_wr
> 
>  	for_each_possible_cpu(cpu) {
>  		cpu_writer = &per_cpu(mnt_writers, cpu);
> -		spin_lock(&cpu_writer->lock);
> -		__clear_mnt_count(cpu_writer);
> +		mutex_lock(&cpu_writer->lock);
> +		__move_cpu_mnt_count(cpu_writer);
> +		/*
> +		 * __mnt_writers may temporarily hit zero
> +		 * (and trigger MNT_MAY_HAVE_WRITERS to get
> +		 * cleared), but it will get set again if
> +		 * and when another mnt_writer[] has an
> +		 * entry for that mnt later in this loop.
> +		 * Holding the locks keeps anyone from
> +		 * seeing this.
> +		 */
> +		__mark_sb_if_writable(cpu_writer->mnt);
>  		cpu_writer->mnt = NULL;
>  	}
>  }
> 
>  /*
> + * This is just an external interface.  I want
> + * to use the long names in here, but leave the
> + * simpler names for external users.
> + */
> +void lock_mnt_writers(void)
> +{
> +	lock_and_coalesce_cpu_mnt_writer_counts();
> +}
> +void unlock_mnt_writers(void)
> +{
> +	mnt_unlock_cpus();
> +}
> +
> +/*
>   * These per-cpu write counts are not guaranteed to have
>   * matched increments and decrements on any given cpu.
>   * A file open()ed for write on one cpu and close()d on
> @@ -262,23 +338,40 @@ static void handle_write_count_underflow
>   * performing writes to it.  Must be matched with
>   * mnt_want_write() call above.
>   */
> +#define MNT_WRITER_UNDERFLOW_LIMIT 1024
>  void mnt_drop_write(struct vfsmount *mnt)
>  {
>  	int must_check_underflow = 0;
>  	struct mnt_writer *cpu_writer;
> 
> -	cpu_writer = &get_cpu_var(mnt_writers);
> -	spin_lock(&cpu_writer->lock);
> +retry:
> +	cpu_writer = &__get_cpu_var(mnt_writers);
> +	mutex_lock(&cpu_writer->lock);
> 
>  	use_cpu_writer_for_mount(cpu_writer, mnt);
>  	if (cpu_writer->count > 0) {
>  		cpu_writer->count--;
>  	} else {
> -		must_check_underflow = 1;
> +		/*
> +		 * Without this check, it is theoretically
> +		 * possible for a large number of processes
> +		 * to atomic_dec(__mnt_writers) and cause
> +		 * it to underflow.  Because we are under
> +		 * the mutex (and there are NR_CPUS
> +		 * mutexes), this limits the number of
> +		 * concurrent decrements and underflow
> +		 * level to NR_CPUS.
> +		 */
> +		if (atomic_read(&mnt->__mnt_writers) <
> +			MNT_WRITER_UNDERFLOW_LIMIT) {
> +			mutex_unlock(&cpu_writer->lock);
> +			goto retry;
> +		}
>  		atomic_dec(&mnt->__mnt_writers);
> +		must_check_underflow = 1;
>  	}
> 
> -	spin_unlock(&cpu_writer->lock);
> +	mutex_unlock(&cpu_writer->lock);
>  	/*
>  	 * Logically, we could call this each time,
>  	 * but the __mnt_writers cacheline tends to
> @@ -286,15 +379,6 @@ void mnt_drop_write(struct vfsmount *mnt
>  	 */
>  	if (must_check_underflow)
>  		handle_write_count_underflow(mnt);
> -	/*
> -	 * This could be done right after the spinlock
> -	 * is taken because the spinlock keeps us on
> -	 * the cpu, and disables preemption.  However,
> -	 * putting it here bounds the amount that
> -	 * __mnt_writers can underflow.  Without it,
> -	 * we could theoretically wrap __mnt_writers.
> -	 */
> -	put_cpu_var(mnt_writers);
>  }
>  EXPORT_SYMBOL_GPL(mnt_drop_write);
> 
> diff -puN fs/namespace.c.orig~track_sb_mnt_writers fs/namespace.c.orig
> diff -puN include/linux/fs.h~track_sb_mnt_writers include/linux/fs.h
> --- linux-2.6.git/include/linux/fs.h~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> +++ linux-2.6.git-dave/include/linux/fs.h	2008-01-02 10:49:11.000000000 -0800
> @@ -1005,6 +1005,9 @@ struct super_block {
>  #ifdef CONFIG_SECURITY
>  	void                    *s_security;
>  #endif
> +	atomic_t		__s_mnt_writers;/* how many mounts of this sb
> +						 * might have writers.  Only
> +						 * stable with lock_mnt_writers() */
>  	struct xattr_handler	**s_xattr;
> 
>  	struct list_head	s_inodes;	/* all inodes */
> @@ -1650,8 +1653,6 @@ extern const struct file_operations read
>  extern const struct file_operations write_fifo_fops;
>  extern const struct file_operations rdwr_fifo_fops;
> 
> -extern int fs_may_remount_ro(struct super_block *);
> -
>  #ifdef CONFIG_BLOCK
>  /*
>   * return READ, READA, or WRITE
> diff -puN include/linux/fs.h.orig~track_sb_mnt_writers include/linux/fs.h.orig
> diff -puN include/linux/mount.h~track_sb_mnt_writers include/linux/mount.h
> --- linux-2.6.git/include/linux/mount.h~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> +++ linux-2.6.git-dave/include/linux/mount.h	2008-01-02 10:49:11.000000000 -0800
> @@ -33,6 +33,7 @@ struct mnt_namespace;
> 
>  #define MNT_SHRINKABLE	0x100
>  #define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
> +#define MNT_MAY_HAVE_WRITERS		0x400 /* did this ever have a writer? */
> 
>  #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
>  #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
> @@ -80,6 +81,8 @@ static inline struct vfsmount *mntget(st
> 
>  extern int mnt_want_write(struct vfsmount *mnt);
>  extern void mnt_drop_write(struct vfsmount *mnt);
> +extern void lock_mnt_writers(void);
> +extern void unlock_mnt_writers(void);
>  extern void mntput_no_expire(struct vfsmount *mnt);
>  extern void mnt_pin(struct vfsmount *mnt);
>  extern void mnt_unpin(struct vfsmount *mnt);
> diff -puN include/linux/mount.h.orig~track_sb_mnt_writers include/linux/mount.h.orig
> diff -puN fs/super.c~track_sb_mnt_writers fs/super.c
> --- linux-2.6.git/fs/super.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> +++ linux-2.6.git-dave/fs/super.c	2008-01-02 13:38:15.000000000 -0800
> @@ -574,6 +574,41 @@ static void mark_files_ro(struct super_b
>  	file_list_unlock();
>  }
> 
> +static inline int fs_may_remount_ro(struct super_block *sb)
> +{
> +	int ret = 1;
> +	lock_mnt_writers();
> +	if (atomic_read(&sb->__s_mnt_writers))
> +		ret = 0;
> +	unlock_mnt_writers();
> +	return ret;
> +}
> +
> +/*
> + * This unconditionally acquires the sb lock,
> + * even if it returns an error code, and should
> + * not be used generally.
> + *
> + * This must be done to ensure that no new
> + * writers come in on the mounts after the
> + * check is performed.
> + */
> +static int __try_to_make_sb_ro(struct super_block *sb, int force)
> +{
> +	int retval = 0;
> +
> +	if (force) {
> +		mark_files_ro(sb);
> +		lock_super(sb);
> +	} else {
> +		lock_super(sb);
> +		/* Make sure there are no mounts that have writers */
> +		if (!fs_may_remount_ro(sb))
> +			retval = -EBUSY;
> +	}
> +	return retval;
> +}
> +
>  /**
>   *	do_remount_sb - asks filesystem to change mount options.
>   *	@sb:	superblock in question
> @@ -585,7 +620,7 @@ static void mark_files_ro(struct super_b
>   */
>  int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
>  {
> -	int retval;
> +	int retval = 0;
>  	
>  #ifdef CONFIG_BLOCK
>  	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
> @@ -596,23 +631,23 @@ int do_remount_sb(struct super_block *sb
>  	shrink_dcache_sb(sb);
>  	fsync_super(sb);
> 
> -	/* If we are remounting RDONLY and current sb is read/write,
> -	   make sure there are no rw files opened */
> -	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
> -		if (force)
> -			mark_files_ro(sb);
> -		else if (!fs_may_remount_ro(sb))
> -			return -EBUSY;
> -	}
> -
> -	if (sb->s_op->remount_fs) {
> +	/* Are we remounting RDONLY when current sb is read/write? */
> +	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
> +		retval = __try_to_make_sb_ro(sb, force);
> +	else
>  		lock_super(sb);
> +
> +	if (retval)
> +		goto out;
> +	if (sb->s_op->remount_fs)
>  		retval = sb->s_op->remount_fs(sb, &flags, data);
> +out:
> +	if (retval) {
>  		unlock_super(sb);
> -		if (retval)
> -			return retval;
> +		return retval;
>  	}
>  	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
> +	unlock_super(sb);
>  	return 0;
>  }
> 
> diff -puN block/cfq-iosched.c~track_sb_mnt_writers block/cfq-iosched.c
> diff -puN fs/namei.c~track_sb_mnt_writers fs/namei.c
> diff -puN fs/buffer.c~track_sb_mnt_writers fs/buffer.c
> diff -puN kernel/Makefile~track_sb_mnt_writers kernel/Makefile
> _


* Re: [PATCH] track number of mnts writing to superblocks
From: Dave Hansen @ 2008-01-03 19:37 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: akpm, linux-kernel, miklos, hch

On Wed, 2008-01-02 at 22:02 -0600, Serge E. Hallyn wrote:
> Ok I'm blabbing quite a bit here while trying to figure out
> the patch, and maybe there are some useful hints for where more
> comments would be useful.  But other than the fact that
> mark_mnt_has_writer() needs to do the atomic_inc() even if
> cpu_writer was passed in as NULL, the patch seems good.

Yeah, I screwed that up.  Should be fixed now.

> > Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
> > ---
> > 
> >  linux-2.6.git-dave/fs/file_table.c       |   24 -----
> >  linux-2.6.git-dave/fs/namespace.c        |  134 +++++++++++++++++++++++++------
> >  linux-2.6.git-dave/fs/super.c            |   61 +++++++++++---
> >  linux-2.6.git-dave/include/linux/fs.h    |    5 -
> >  linux-2.6.git-dave/include/linux/mount.h |    3 
> >  5 files changed, 163 insertions(+), 64 deletions(-)
> > 
> > diff -puN fs/file_table.c~track_sb_mnt_writers fs/file_table.c
> > --- linux-2.6.git/fs/file_table.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> > +++ linux-2.6.git-dave/fs/file_table.c	2008-01-02 10:49:11.000000000 -0800
> > @@ -374,30 +374,6 @@ void file_kill(struct file *file)
> >  	}
> >  }
> > 
> > -int fs_may_remount_ro(struct super_block *sb)
> > -{
> > -	struct file *file;
> > -
> > -	/* Check that no files are currently opened for writing. */
> > -	file_list_lock();
> > -	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
> > -		struct inode *inode = file->f_path.dentry->d_inode;
> > -
> > -		/* File with pending delete? */
> > -		if (inode->i_nlink == 0)
> > -			goto too_bad;
> > -
> > -		/* Writeable file? */
> > -		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
> 
> (why did this originally skip directories?)

I think it's more to skip device files and named pipes than directories.
I don't even know what happens offhand if you try a plain write() to an
open directory.
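
A quick userspace check would be something like this (untested):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* open(2) is supposed to refuse O_WRONLY on a directory */
	int fd = open("/tmp", O_WRONLY);

	if (fd < 0) {
		printf("open: %s\n", strerror(errno));
		return 0;
	}
	/* if the open somehow succeeds, see what write() says */
	if (write(fd, "x", 1) < 0)
		printf("write: %s\n", strerror(errno));
	close(fd);
	return 0;
}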

> > diff -puN fs/file_table.c.orig~track_sb_mnt_writers fs/file_table.c.orig
> > diff -puN fs/namespace.c~track_sb_mnt_writers fs/namespace.c
> > --- linux-2.6.git/fs/namespace.c~track_sb_mnt_writers	2008-01-02 10:49:11.000000000 -0800
> > +++ linux-2.6.git-dave/fs/namespace.c	2008-01-02 13:39:52.000000000 -0800
> > @@ -118,7 +118,7 @@ struct mnt_writer {
> >  	 * If holding multiple instances of this lock, they
> >  	 * must be ordered by cpu number.
> >  	 */
> > -	spinlock_t lock;
> > +	struct mutex lock;
> >  	struct lock_class_key lock_class; /* compiles out with !lockdep */
> >  	unsigned long count;
> >  	struct vfsmount *mnt;
> > @@ -130,7 +130,7 @@ static int __init init_mnt_writers(void)
> >  	int cpu;
> >  	for_each_possible_cpu(cpu) {
> >  		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
> > -		spin_lock_init(&writer->lock);
> > +		mutex_init(&writer->lock);
> >  		lockdep_set_class(&writer->lock, &writer->lock_class);
> >  		writer->count = 0;
> >  	}
> > @@ -145,11 +145,52 @@ static void mnt_unlock_cpus(void)
> > 
> >  	for_each_possible_cpu(cpu) {
> >  		cpu_writer = &per_cpu(mnt_writers, cpu);
> > -		spin_unlock(&cpu_writer->lock);
> > +		mutex_unlock(&cpu_writer->lock);
> >  	}
> >  }
> > 
> > -static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
> > +static int mark_mnt_has_writer(struct vfsmount *mnt,
> > +			       struct mnt_writer *cpu_writer)
> > +{
> > +	/*
> > +	 * Ensure that if there are people racing to set
> > +	 * the bit that only one of them succeeds and can
> > +	 * increment the sb count.
> > +	 */
> > +	if (test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS), &mnt->mnt_flags))
> > +		return 0;
> 
> Comment isn't entirely clear, but you're returning 0 here because
> someone else has already set the flag and incremented
> sb->__s_mnt_writers so you don't have to and you're all set to go on
> with writing?

Yeah.  This function is all about making sure that the sb is marked
properly because the mnt is writable.  If it's already marked, then
we're good to go.

> > +	if (cpu_writer == NULL)
> > +		return 0;
> > +
> > +	/*
> > +	 * Our goal here is to get exclusion from a superblock
> > +	 * remount operation (fs_may_remount_ro()).  This is
> > +	 * effectively a slow path that we must go through the
> > +	 * first time we set the bit on each mount, but never
> > +	 * again unless the writer counts get coalesced.
> > +	 */
> > +
> > +	mutex_unlock(&cpu_writer->lock);
> > +	lock_super(mnt->mnt_sb);
> > +
> > +	atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
> 
> The atomic_inc of course should be done even if cpu_writer was passed
> in as NULL; you just don't need to do the locking then, and can
> return 0 in that case?

Yep.  I'll fix that.
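
Something like this (untested):

static int mark_mnt_has_writer(struct vfsmount *mnt,
			       struct mnt_writer *cpu_writer)
{
	if (test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS), &mnt->mnt_flags))
		return 0;
	/*
	 * A NULL cpu_writer means the caller (the coalesce
	 * path) already holds every cpu_writer lock, so there
	 * is no lock dance to do -- but the sb count still
	 * needs the increment.
	 */
	if (cpu_writer == NULL) {
		atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
		return 0;
	}

	mutex_unlock(&cpu_writer->lock);
	lock_super(mnt->mnt_sb);
	atomic_inc(&mnt->mnt_sb->__s_mnt_writers);
	unlock_super(mnt->mnt_sb);
	mutex_lock(&cpu_writer->lock);
	return -EAGAIN;
}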

> > +
> > +	unlock_super(mnt->mnt_sb);
> > +	mutex_lock(&cpu_writer->lock);
> > +	return -EAGAIN;
> > +}
> > +
> > +static void __mark_sb_if_writable(struct vfsmount *mnt)
> 
> This function is taking the writable state from a mnt and marking it in
> the sb.  So the name should be a shorter version of something like
> "commit_mnt_writable_state_to_sb"

Here's what I have now:

static void __sync_mnt_writable_to_sb(struct vfsmount *mnt)

> > +{
> > +	int bitnr = ilog2(MNT_MAY_HAVE_WRITERS);
> > +
> > +	if (atomic_read(&mnt->__mnt_writers))
> > +		mark_mnt_has_writer(mnt, NULL);
> > +	else if (test_and_clear_bit(bitnr, &mnt->mnt_flags))
> > +		atomic_dec(&mnt->mnt_sb->__s_mnt_writers);
> 
> And after staring at this code for a while it did make sense, but a
> comment above it would help, saying something like:
> 
> 	/* If mnt has writers, mark_mnt_has_writer() will make
> 	   sure that is marked in the sb.  If mnt has no writers,
> 	   then if mnt->mnt_flags was previously set, that means
> 	   that mnt->mnt_sb->__s_mnt_writers reflects our having
> 	   writers, so we decrement it
> 	 */
> 
> Ok, maybe it's clearer to other people and the comment isn't needed...

You couldn't figure it out at first glance, and I tend to think that
most people will have the same experience.  I'll see if I can add a
better comment.

I'll send out an updated patch in a bit along with a simpler alternative
patch.

-- Dave



* Re: [PATCH] track number of mnts writing to superblocks
From: Andrew Morton @ 2008-01-10  7:42 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, miklos, hch, serue

On Wed, 02 Jan 2008 13:51:10 -0800 Dave Hansen <haveblue@us.ibm.com> wrote:

> 
> One of the benefits of the r/o bind mount patches is that they
> make it explicit when a write to a superblock might occur.
> We currently search sb->s_files when remounting rw->ro to look
> for writable files.  But, that search is not comprehensive, and
> it is racy.  This replaces that search.
> 
> The idea is to keep a bit in each mount saying whether the
> mount has any writers on it.  When the bit is set the first
> time, we also increment a counter in the superblock.  That
> sb counter is the number of mounts with that bit set and
> thus, potential writers.
> 
> The other problem is that, after we make this check for
> the number of writable mounts, we need to exclude all new
> writers on those mounts.  We do this by requiring that the
> superblock mnt writer count be incremented under a
> lock_super() and also holding that lock over the remount
> operation.  Effectively, this keeps us from *adding* to
> the sb's writable mounts during a remount.
> 
> The alternative to doing this is to keep a much simpler list
> of mounts for each superblock.  I could also code that up
> to see what it looks like.  Shouldn't be too bad.
> 
>  ...
>
> -static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
> +static int mark_mnt_has_writer(struct vfsmount *mnt,
> +			       struct mnt_writer *cpu_writer)
> +{
> +	/*
> +	 * Ensure that if there are people racing to set
> +	 * the bit that only one of them succeeds and can
> +	 * increment the sb count.
> +	 */
> +	if (test_and_set_bit(ilog2(MNT_MAY_HAVE_WRITERS), &mnt->mnt_flags))
> +		return 0;

boggle.

a) bitops are to be used on unsigned long only, and mnt_flags is int.

b) nowhere else is mnt_flags used as a bitfield.  Presumably (hopefully)
   it already has some locking.  Suggest that the same locking be used
   here.

c) Given that all other modifiers of mnt_flags are using non-atomic
   rmw's, they can trash this function's rmw.  What happens then?

> +static void __mark_sb_if_writable(struct vfsmount *mnt)
> +{
> +	int bitnr = ilog2(MNT_MAY_HAVE_WRITERS);
> +
> +	if (atomic_read(&mnt->__mnt_writers))
> +		mark_mnt_has_writer(mnt, NULL);
> +	else if (test_and_clear_bit(bitnr, &mnt->mnt_flags))
> +		atomic_dec(&mnt->mnt_sb->__s_mnt_writers);
> +}

and elsewhere.
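
The lost update is easy to demonstrate from userspace, by the
way (sketch; gcc's __sync builtins stand in for the kernel's
atomic bitops, and the plain thread models the existing
non-atomic mnt_flags updates):

#include <pthread.h>
#include <stdio.h>

#define MAY_HAVE_WRITERS (1UL << 10)

static volatile unsigned long flags;
static volatile int stop;

/* plain load/modify/store, like the existing mnt_flags rmw's */
static void *plain_rmw(void *arg)
{
	(void)arg;
	while (!stop) {
		unsigned long tmp = flags;	/* load */
		tmp ^= 1;			/* touch bit 0 only */
		flags = tmp;			/* store wipes out any bit
						 * set since the load */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	long i, lost = 0;

	pthread_create(&t, NULL, plain_rmw, NULL);
	for (i = 0; i < 10000000; i++) {
		__sync_fetch_and_and(&flags, ~MAY_HAVE_WRITERS);
		__sync_fetch_and_or(&flags, MAY_HAVE_WRITERS);
		if (!(flags & MAY_HAVE_WRITERS))
			lost++;		/* the atomic set was trashed */
	}
	stop = 1;
	pthread_join(t, NULL);
	printf("atomic bit sets lost: %ld\n", lost);
	return 0;
}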


I'm a bit surprised to see such a thing coming from your direction, Dave.
