[PATCH v5 0/4] read-only remount race fix

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v5 0/4] read-only remount race fix
@ 2011-08-03 10:48 Miklos Szeredi
  2011-08-03 10:48 ` [PATCH 1/4] vfs: ignore error on forced remount Miklos Szeredi
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-03 10:48 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

Al,

Here's an update of the read-only remount race fixes.

  git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git read-only-remount-fixes.v5

Please apply.

Thanks,
Miklos
---


Miklos Szeredi (4):
      vfs: ignore error on forced remount
      vfs: keep list of mounts for each superblock
      vfs: protect remounting superblock read-only
      vfs: fs_may_remount_ro: turn unnecessary check into a WARN_ON

---
 fs/file_table.c       |    7 ++++-
 fs/internal.h         |    1 +
 fs/namespace.c        |   50 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/super.c            |   27 +++++++++++++++++++++----
 include/linux/fs.h    |    4 +++
 include/linux/mount.h |    1 +
 6 files changed, 82 insertions(+), 8 deletions(-)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/4] vfs: ignore error on forced remount
  2011-08-03 10:48 [PATCH v5 0/4] read-only remount race fix Miklos Szeredi
@ 2011-08-03 10:48 ` Miklos Szeredi
  2011-08-03 10:49   ` Christoph Hellwig
  2011-08-03 10:48 ` [PATCH 2/4] vfs: keep list of mounts for each superblock Miklos Szeredi
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-03 10:48 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

From: Miklos Szeredi <mszeredi@suse.cz>

On emergency remount we want to force MS_RDONLY on the super block
even if ->remount_fs() failed for some reason.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/super.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 3f56a26..5398975 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -727,7 +727,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		if (retval)
+		/* If forced remount, go ahead despite any errors */
+		if (retval && !force)
 			return retval;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/4] vfs: keep list of mounts for each superblock
  2011-08-03 10:48 [PATCH v5 0/4] read-only remount race fix Miklos Szeredi
  2011-08-03 10:48 ` [PATCH 1/4] vfs: ignore error on forced remount Miklos Szeredi
@ 2011-08-03 10:48 ` Miklos Szeredi
  2011-08-03 21:53   ` Christoph Hellwig
  2011-08-03 22:17   ` Al Viro
  2011-08-03 10:48 ` [PATCH 3/4] vfs: protect remounting superblock read-only Miklos Szeredi
  2011-08-03 10:48 ` [PATCH 4/4] vfs: fs_may_remount_ro: turn unnecessary check into a WARN_ON Miklos Szeredi
  3 siblings, 2 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-03 10:48 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

From: Miklos Szeredi <mszeredi@suse.cz>

Keep track of vfsmounts belonging to a superblock.  List is protected
by vfsmount_lock.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namespace.c        |   10 ++++++++++
 fs/super.c            |    2 ++
 include/linux/fs.h    |    1 +
 include/linux/mount.h |    1 +
 4 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 22bfe82..c369c32 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -696,6 +696,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	mnt->mnt_sb = root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
+
+	br_write_lock(vfsmount_lock);
+	list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
+	br_write_unlock(vfsmount_lock);
+
 	return mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,6 +750,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 			if (!list_empty(&old->mnt_expire))
 				list_add(&mnt->mnt_expire, &old->mnt_expire);
 		}
+
+		br_write_lock(vfsmount_lock);
+		list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
+		br_write_unlock(vfsmount_lock);
 	}
 	return mnt;
 
@@ -805,6 +814,7 @@ put_again:
 		acct_auto_close_mnt(mnt);
 		goto put_again;
 	}
+	list_del(&mnt->mnt_instance);
 	br_write_unlock(vfsmount_lock);
 	mntfree(mnt);
 }
diff --git a/fs/super.c b/fs/super.c
index 5398975..994003b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -142,6 +142,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_LIST_HEAD(&s->s_dentry_lru);
 		INIT_LIST_HEAD(&s->s_inode_lru);
 		spin_lock_init(&s->s_inode_lru_lock);
+		INIT_LIST_HEAD(&s->s_mounts);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -200,6 +201,7 @@ static inline void destroy_super(struct super_block *s)
 	free_percpu(s->s_files);
 #endif
 	security_sb_free(s);
+	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
 	kfree(s->s_options);
 	kfree(s);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 786b3b1..ca046b4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1395,6 +1395,7 @@ struct super_block {
 #else
 	struct list_head	s_files;
 #endif
+	struct list_head	s_mounts;	/* list of mounts */
 	/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
 	struct list_head	s_dentry_lru;	/* unused dentry lru */
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 33fe53d..f88c726 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -67,6 +67,7 @@ struct vfsmount {
 #endif
 	struct list_head mnt_mounts;	/* list of children, anchored here */
 	struct list_head mnt_child;	/* and going through their mnt_child */
+	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
 	int mnt_flags;
 	/* 4 bytes hole on 64bits arches without fsnotify */
 #ifdef CONFIG_FSNOTIFY
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/4] vfs: protect remounting superblock read-only
  2011-08-03 10:48 [PATCH v5 0/4] read-only remount race fix Miklos Szeredi
  2011-08-03 10:48 ` [PATCH 1/4] vfs: ignore error on forced remount Miklos Szeredi
  2011-08-03 10:48 ` [PATCH 2/4] vfs: keep list of mounts for each superblock Miklos Szeredi
@ 2011-08-03 10:48 ` Miklos Szeredi
  2011-08-03 10:48 ` [PATCH 4/4] vfs: fs_may_remount_ro: turn unnecessary check into a WARN_ON Miklos Szeredi
  3 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-03 10:48 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

From: Miklos Szeredi <mszeredi@suse.cz>

Currently remouting superblock read-only is racy in a major way.

With the per mount read-only infrastructure it is now possible to
prevent most races, which this patch attempts.

Before starting the remount read-only, iterate through all mounts
belonging to the superblock and if none of them have any pending
writes, set sb->s_readonly_remount.  This indicates that remount is in
progress and no further write requests are allowed.  If the remount
succeeds set MS_RDONLY and reset s_readonly_remount.

If the remounting is unsuccessful just reset s_readonly_remount.
This can result in transient EROFS errors, despite the fact the
remount failed.  Unfortunately hodling off writes is difficult as
remount itself may touch the filesystem (e.g. through load_nls())
which would deadlock.

Small races still remain because delayed writes due to nlink going to
zero but inode still having a reference are not dealt with by this
patch.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/internal.h      |    1 +
 fs/namespace.c     |   40 +++++++++++++++++++++++++++++++++++++++-
 fs/super.c         |   22 ++++++++++++++++++----
 include/linux/fs.h |    3 +++
 4 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index fe327c2..f925271 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -74,6 +74,7 @@ extern int finish_automount(struct vfsmount *, struct path *);
 
 extern void mnt_make_longterm(struct vfsmount *);
 extern void mnt_make_shortterm(struct vfsmount *);
+extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index c369c32..263201c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -311,6 +311,15 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
 #endif
 }
 
+static int mnt_is_readonly(struct vfsmount *mnt)
+{
+	if (mnt->mnt_sb->s_readonly_remount)
+		return 1;
+	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
+	smp_rmb();
+	return __mnt_is_readonly(mnt);
+}
+
 /*
  * Most r/o checks on a fs are for operations that take
  * discrete amounts of time, like a write() or unlink().
@@ -349,7 +358,7 @@ int mnt_want_write(struct vfsmount *mnt)
 	 * MNT_WRITE_HOLD is cleared.
 	 */
 	smp_rmb();
-	if (__mnt_is_readonly(mnt)) {
+	if (mnt_is_readonly(mnt)) {
 		mnt_dec_writers(mnt);
 		ret = -EROFS;
 		goto out;
@@ -466,6 +475,35 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	br_write_unlock(vfsmount_lock);
 }
 
+int sb_prepare_remount_readonly(struct super_block *sb)
+{
+	struct vfsmount *mnt;
+	int err = 0;
+
+	br_write_lock(vfsmount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (!(mnt->mnt_flags & MNT_READONLY)) {
+			mnt->mnt_flags |= MNT_WRITE_HOLD;
+			smp_mb();
+			if (mnt_get_writers(mnt) > 0) {
+				err = -EBUSY;
+				break;
+			}
+		}
+	}
+	if (!err) {
+		sb->s_readonly_remount = 1;
+		smp_wmb();
+	}
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_flags & MNT_WRITE_HOLD)
+			mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	}
+	br_write_unlock(vfsmount_lock);
+
+	return err;
+}
+
 static void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
diff --git a/fs/super.c b/fs/super.c
index 994003b..62c6aa9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -721,19 +721,29 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if (remount_ro) {
-		if (force)
+		if (force) {
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
-			return -EBUSY;
+		} else {
+			retval = sb_prepare_remount_readonly(sb);
+			if (retval)
+				return retval;
+
+			retval = -EBUSY;
+			if (!fs_may_remount_ro(sb))
+				goto cancel_readonly;
+		}
 	}
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
 		/* If forced remount, go ahead despite any errors */
 		if (retval && !force)
-			return retval;
+			goto cancel_readonly;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	/* Needs to be ordered wrt mnt_is_readonly() */
+	smp_wmb();
+	sb->s_readonly_remount = 0;
 
 	/*
 	 * Some filesystems modify their metadata via some other path than the
@@ -746,6 +756,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (remount_ro && sb->s_bdev)
 		invalidate_bdev(sb->s_bdev);
 	return 0;
+
+cancel_readonly:
+	sb->s_readonly_remount = 0;
+	return retval;
 }
 
 static void do_emergency_remount(struct work_struct *work)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ca046b4..9d2dba5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1449,6 +1449,9 @@ struct super_block {
 	int cleancache_poolid;
 
 	struct shrinker s_shrink;	/* per-sb shrinker handle */
+
+	/* Being remounted read-only */
+	int s_readonly_remount;
 };
 
 /* superblock cache pruning functions */
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/4] vfs: fs_may_remount_ro: turn unnecessary check into a WARN_ON
  2011-08-03 10:48 [PATCH v5 0/4] read-only remount race fix Miklos Szeredi
                   ` (2 preceding siblings ...)
  2011-08-03 10:48 ` [PATCH 3/4] vfs: protect remounting superblock read-only Miklos Szeredi
@ 2011-08-03 10:48 ` Miklos Szeredi
  3 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-03 10:48 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

From: Miklos Szeredi <mszeredi@suse.cz>

Now a successful sb_prepare_remount_readonly() should ensure that no
writable files remain for this superblock.  So turn this check into a
WARN_ON.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/file_table.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index c322794..dc9f437 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -486,8 +486,11 @@ int fs_may_remount_ro(struct super_block *sb)
 		if (inode->i_nlink == 0)
 			goto too_bad;
 
-		/* Writeable file? */
-		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
+		/*
+		 * Writable file?
+		 * Should be caught by sb_prepare_remount_readonly().
+		 */
+		if (WARN_ON(S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)))
 			goto too_bad;
 	} while_file_list_for_each_entry;
 	lg_global_unlock(files_lglock);
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] vfs: ignore error on forced remount
  2011-08-03 10:48 ` [PATCH 1/4] vfs: ignore error on forced remount Miklos Szeredi
@ 2011-08-03 10:49   ` Christoph Hellwig
  2011-08-03 16:15     ` Miklos Szeredi
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2011-08-03 10:49 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: viro, linux-fsdevel, linux-kernel, jack, hch, akpm,
	toshi.okajima, mszeredi

On Wed, Aug 03, 2011 at 12:48:38PM +0200, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> On emergency remount we want to force MS_RDONLY on the super block
> even if ->remount_fs() failed for some reason.

Eww, dangerous territory.  At the very least we should log something
in the kernel log for thise case.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] vfs: ignore error on forced remount
  2011-08-03 10:49   ` Christoph Hellwig
@ 2011-08-03 16:15     ` Miklos Szeredi
  0 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-03 16:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: viro, linux-fsdevel, linux-kernel, jack, akpm, toshi.okajima

Christoph Hellwig <hch@infradead.org> writes:

> On Wed, Aug 03, 2011 at 12:48:38PM +0200, Miklos Szeredi wrote:
>> From: Miklos Szeredi <mszeredi@suse.cz>
>> 
>> On emergency remount we want to force MS_RDONLY on the super block
>> even if ->remount_fs() failed for some reason.
>
> Eww, dangerous territory.  At the very least we should log something
> in the kernel log for thise case.

Like this?

Thanks,
Miklos
----

Subject: vfs: ignore error on forced remount

From: Miklos Szeredi <mszeredi@suse.cz>

On emergency remount we want to force MS_RDONLY on the super block
even if ->remount_fs() failed for some reason.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/super.c |    9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c	2011-08-03 16:54:26.000000000 +0200
+++ linux-2.6/fs/super.c	2011-08-03 17:11:56.000000000 +0200
@@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		if (retval)
-			return retval;
+		if (retval) {
+			if(!force)
+				return retval;
+			/* If forced remount, go ahead despite any errors */
+			WARN(1, "forced remount of a %s fs returned %i\n",
+			     sb->s_type->name, retval);
+		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] vfs: keep list of mounts for each superblock
  2011-08-03 10:48 ` [PATCH 2/4] vfs: keep list of mounts for each superblock Miklos Szeredi
@ 2011-08-03 21:53   ` Christoph Hellwig
  2011-08-03 22:17   ` Al Viro
  1 sibling, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2011-08-03 21:53 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: viro, linux-fsdevel, linux-kernel, jack, hch, akpm,
	toshi.okajima, mszeredi

On Wed, Aug 03, 2011 at 12:48:39PM +0200, Miklos Szeredi wrote:
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Keep track of vfsmounts belonging to a superblock.  List is protected
> by vfsmount_lock.

Looks reasonable to me.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] vfs: keep list of mounts for each superblock
  2011-08-03 10:48 ` [PATCH 2/4] vfs: keep list of mounts for each superblock Miklos Szeredi
  2011-08-03 21:53   ` Christoph Hellwig
@ 2011-08-03 22:17   ` Al Viro
  2011-08-04  9:10     ` Miklos Szeredi
  1 sibling, 1 reply; 14+ messages in thread
From: Al Viro @ 2011-08-03 22:17 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

On Wed, Aug 03, 2011 at 12:48:39PM +0200, Miklos Szeredi wrote:
> @@ -696,6 +696,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
>  	mnt->mnt_sb = root->d_sb;
>  	mnt->mnt_mountpoint = mnt->mnt_root;
>  	mnt->mnt_parent = mnt;
> +
> +	br_write_lock(vfsmount_lock);
> +	list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> +	br_write_unlock(vfsmount_lock);

Racy.

> @@ -745,6 +750,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
>  			if (!list_empty(&old->mnt_expire))
>  				list_add(&mnt->mnt_expire, &old->mnt_expire);
>  		}
> +
> +		br_write_lock(vfsmount_lock);
> +		list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> +		br_write_unlock(vfsmount_lock);

Ditto.  If you expect to be able to find *all* vfsmounts over given sb,
this locking is simply wrong.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] vfs: keep list of mounts for each superblock
  2011-08-03 22:17   ` Al Viro
@ 2011-08-04  9:10     ` Miklos Szeredi
  2011-08-04  9:46       ` Jan Kara
  0 siblings, 1 reply; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-04  9:10 UTC (permalink / raw)
  To: Al Viro
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, jack, hch, akpm,
	toshi.okajima

On Wed, 2011-08-03 at 23:17 +0100, Al Viro wrote:
> On Wed, Aug 03, 2011 at 12:48:39PM +0200, Miklos Szeredi wrote:
> > @@ -696,6 +696,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
> >  	mnt->mnt_sb = root->d_sb;
> >  	mnt->mnt_mountpoint = mnt->mnt_root;
> >  	mnt->mnt_parent = mnt;
> > +
> > +	br_write_lock(vfsmount_lock);
> > +	list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> > +	br_write_unlock(vfsmount_lock);
> 
> Racy.
> 
> > @@ -745,6 +750,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
> >  			if (!list_empty(&old->mnt_expire))
> >  				list_add(&mnt->mnt_expire, &old->mnt_expire);
> >  		}
> > +
> > +		br_write_lock(vfsmount_lock);
> > +		list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> > +		br_write_unlock(vfsmount_lock);
> 
> Ditto.  If you expect to be able to find *all* vfsmounts over given sb,
> this locking is simply wrong.

I don't understand.  All accesses to mnt_instance/s_mounts are protected
by vfsmount_lock.  What else is needed?

Thanks,
Miklos



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] vfs: keep list of mounts for each superblock
  2011-08-04  9:10     ` Miklos Szeredi
@ 2011-08-04  9:46       ` Jan Kara
  2011-08-04  9:58         ` Miklos Szeredi
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Kara @ 2011-08-04  9:46 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Al Viro, Miklos Szeredi, linux-fsdevel, linux-kernel, jack, hch,
	akpm, toshi.okajima

On Thu 04-08-11 11:10:54, Miklos Szeredi wrote:
> On Wed, 2011-08-03 at 23:17 +0100, Al Viro wrote:
> > On Wed, Aug 03, 2011 at 12:48:39PM +0200, Miklos Szeredi wrote:
> > > @@ -696,6 +696,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
> > >  	mnt->mnt_sb = root->d_sb;
> > >  	mnt->mnt_mountpoint = mnt->mnt_root;
> > >  	mnt->mnt_parent = mnt;
> > > +
> > > +	br_write_lock(vfsmount_lock);
> > > +	list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> > > +	br_write_unlock(vfsmount_lock);
> > 
> > Racy.
> > 
> > > @@ -745,6 +750,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
> > >  			if (!list_empty(&old->mnt_expire))
> > >  				list_add(&mnt->mnt_expire, &old->mnt_expire);
> > >  		}
> > > +
> > > +		br_write_lock(vfsmount_lock);
> > > +		list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> > > +		br_write_unlock(vfsmount_lock);
> > 
> > Ditto.  If you expect to be able to find *all* vfsmounts over given sb,
> > this locking is simply wrong.
> 
> I don't understand.  All accesses to mnt_instance/s_mounts are protected
> by vfsmount_lock.  What else is needed?
  I guess Al meant that sb_prepare_remount_readonly() from the next patch
could race with new mountpoint being added to the list and the check is
thus still unreliable? Quickly checking the locking seems to confirm that
since e.g. clone_mnt() does not hold s_umount semaphore (only
namespace_sem?) and do_remount_sb() has it the other way around...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] vfs: keep list of mounts for each superblock
  2011-08-04  9:46       ` Jan Kara
@ 2011-08-04  9:58         ` Miklos Szeredi
  0 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-04  9:58 UTC (permalink / raw)
  To: Jan Kara
  Cc: Al Viro, Miklos Szeredi, linux-fsdevel, linux-kernel, hch, akpm,
	toshi.okajima

On Thu, 2011-08-04 at 11:46 +0200, Jan Kara wrote:
> On Thu 04-08-11 11:10:54, Miklos Szeredi wrote:
> > On Wed, 2011-08-03 at 23:17 +0100, Al Viro wrote:
> > > On Wed, Aug 03, 2011 at 12:48:39PM +0200, Miklos Szeredi wrote:
> > > > @@ -696,6 +696,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
> > > >  	mnt->mnt_sb = root->d_sb;
> > > >  	mnt->mnt_mountpoint = mnt->mnt_root;
> > > >  	mnt->mnt_parent = mnt;
> > > > +
> > > > +	br_write_lock(vfsmount_lock);
> > > > +	list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> > > > +	br_write_unlock(vfsmount_lock);
> > > 
> > > Racy.
> > > 
> > > > @@ -745,6 +750,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
> > > >  			if (!list_empty(&old->mnt_expire))
> > > >  				list_add(&mnt->mnt_expire, &old->mnt_expire);
> > > >  		}
> > > > +
> > > > +		br_write_lock(vfsmount_lock);
> > > > +		list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
> > > > +		br_write_unlock(vfsmount_lock);
> > > 
> > > Ditto.  If you expect to be able to find *all* vfsmounts over given sb,
> > > this locking is simply wrong.
> > 
> > I don't understand.  All accesses to mnt_instance/s_mounts are protected
> > by vfsmount_lock.  What else is needed?

>   I guess Al meant that sb_prepare_remount_readonly() from the next patch
> could race with new mountpoint being added to the list and the check is
> thus still unreliable?

If sb_prepare_remount_readonly() is successfull, it sets
->s_readonly_remount, indicating that remounting is in progress, which
will cause mnt_want_write() to return -EROFS for any mount regardless
whether they were added before or after sb_prepare_remount_readonly().

So it doesn't matter if the clone or new mount races with remount.

The drawback of this approach is that transient EROFS errors may be
returned even if the filesystem remount fails for some reason and so the
filesystem will not become read-only.  But this, I think, is an
acceptable compromise.

>  Quickly checking the locking seems to confirm that
> since e.g. clone_mnt() does not hold s_umount semaphore (only
> namespace_sem?) and do_remount_sb() has it the other way around...

Yeah, and I think I came to the conclusion that that kind of exclusion
is not realistically doable.

Thanks,
Miklos


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 3/4] vfs: protect remounting superblock read-only
       [not found] <1314191791-29218-1-git-send-email-miklos@szeredi.hu>
@ 2011-08-24 13:22 ` Miklos Szeredi
  0 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2011-08-24 13:22 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, jack, hch, akpm, toshi.okajima, mszeredi

From: Miklos Szeredi <mszeredi@suse.cz>

Currently remouting superblock read-only is racy in a major way.

With the per mount read-only infrastructure it is now possible to
prevent most races, which this patch attempts.

Before starting the remount read-only, iterate through all mounts
belonging to the superblock and if none of them have any pending
writes, set sb->s_readonly_remount.  This indicates that remount is in
progress and no further write requests are allowed.  If the remount
succeeds set MS_RDONLY and reset s_readonly_remount.

If the remounting is unsuccessful just reset s_readonly_remount.
This can result in transient EROFS errors, despite the fact the
remount failed.  Unfortunately hodling off writes is difficult as
remount itself may touch the filesystem (e.g. through load_nls())
which would deadlock.

Small races still remain because delayed writes due to nlink going to
zero but inode still having a reference are not dealt with by this
patch.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/internal.h      |    1 +
 fs/namespace.c     |   40 +++++++++++++++++++++++++++++++++++++++-
 fs/super.c         |   22 ++++++++++++++++++----
 include/linux/fs.h |    3 +++
 4 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index fe327c2..f925271 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -74,6 +74,7 @@ extern int finish_automount(struct vfsmount *, struct path *);
 
 extern void mnt_make_longterm(struct vfsmount *);
 extern void mnt_make_shortterm(struct vfsmount *);
+extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index c369c32..263201c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -311,6 +311,15 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
 #endif
 }
 
+static int mnt_is_readonly(struct vfsmount *mnt)
+{
+	if (mnt->mnt_sb->s_readonly_remount)
+		return 1;
+	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
+	smp_rmb();
+	return __mnt_is_readonly(mnt);
+}
+
 /*
  * Most r/o checks on a fs are for operations that take
  * discrete amounts of time, like a write() or unlink().
@@ -349,7 +358,7 @@ int mnt_want_write(struct vfsmount *mnt)
 	 * MNT_WRITE_HOLD is cleared.
 	 */
 	smp_rmb();
-	if (__mnt_is_readonly(mnt)) {
+	if (mnt_is_readonly(mnt)) {
 		mnt_dec_writers(mnt);
 		ret = -EROFS;
 		goto out;
@@ -466,6 +475,35 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	br_write_unlock(vfsmount_lock);
 }
 
+int sb_prepare_remount_readonly(struct super_block *sb)
+{
+	struct vfsmount *mnt;
+	int err = 0;
+
+	br_write_lock(vfsmount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (!(mnt->mnt_flags & MNT_READONLY)) {
+			mnt->mnt_flags |= MNT_WRITE_HOLD;
+			smp_mb();
+			if (mnt_get_writers(mnt) > 0) {
+				err = -EBUSY;
+				break;
+			}
+		}
+	}
+	if (!err) {
+		sb->s_readonly_remount = 1;
+		smp_wmb();
+	}
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_flags & MNT_WRITE_HOLD)
+			mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	}
+	br_write_unlock(vfsmount_lock);
+
+	return err;
+}
+
 static void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
diff --git a/fs/super.c b/fs/super.c
index e8aad34..94c8ace 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -721,23 +721,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if (remount_ro) {
-		if (force)
+		if (force) {
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
-			return -EBUSY;
+		} else {
+			retval = sb_prepare_remount_readonly(sb);
+			if (retval)
+				return retval;
+
+			retval = -EBUSY;
+			if (!fs_may_remount_ro(sb))
+				goto cancel_readonly;
+		}
 	}
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
 		if (retval) {
 			if(!force)
-				return retval;
+				goto cancel_readonly;
 			/* If forced remount, go ahead despite any errors */
 			WARN(1, "forced remount of a %s fs returned %i\n",
 			     sb->s_type->name, retval);
 		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	/* Needs to be ordered wrt mnt_is_readonly() */
+	smp_wmb();
+	sb->s_readonly_remount = 0;
 
 	/*
 	 * Some filesystems modify their metadata via some other path than the
@@ -750,6 +760,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (remount_ro && sb->s_bdev)
 		invalidate_bdev(sb->s_bdev);
 	return 0;
+
+cancel_readonly:
+	sb->s_readonly_remount = 0;
+	return retval;
 }
 
 static void do_emergency_remount(struct work_struct *work)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2a67e6..e820478 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1464,6 +1464,9 @@ struct super_block {
 	int cleancache_poolid;
 
 	struct shrinker s_shrink;	/* per-sb shrinker handle */
+
+	/* Being remounted read-only */
+	int s_readonly_remount;
 };
 
 /* superblock cache pruning functions */
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/4] vfs: protect remounting superblock read-only
  2010-10-26 14:16 [PATCH 0/4] read-only remount fixes Miklos Szeredi
@ 2010-10-26 14:16 ` Miklos Szeredi
  0 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2010-10-26 14:16 UTC (permalink / raw)
  To: viro; +Cc: dave, akpm, linux-fsdevel, linux-kernel

[-- Attachment #1: vfs-protect-sb-remount.patch --]
[-- Type: text/plain, Size: 5473 bytes --]

From: Miklos Szeredi <mszeredi@suse.cz>

Currently remouting superblock read-only is racy in a major way.

With the per mount read-only infrastructure it is now possible to
prevent most races, which this patch attempts.

Before starting the remount read-only, iterate through all mounts
belonging to the superblock and if none of them have any pending
writes, set sb->s_readonly_remount.  This indicates that remount is in
progress and no further write requests are allowed.  If the remount
succeeds set MS_RDONLY and reset s_readonly_remount.

If the remounting is unsuccessful just reset s_readonly_remount.
This can result in transient EROFS errors, despite the fact the
remount failed.  Unfortunately hodling off writes is difficult as
remount itself may touch the filesystem (e.g. through load_nls())
which would deadlock.

Small races still remain because delayed writes due to nlink going to
zero but inode still having a reference are not dealt with by this
patch.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/internal.h      |    1 +
 fs/namespace.c     |   40 +++++++++++++++++++++++++++++++++++++++-
 fs/super.c         |   22 ++++++++++++++++++----
 include/linux/fs.h |    3 +++
 4 files changed, 61 insertions(+), 5 deletions(-)

Index: linux-2.6/fs/internal.h
===================================================================
--- linux-2.6.orig/fs/internal.h	2010-10-26 14:26:57.000000000 +0200
+++ linux-2.6/fs/internal.h	2010-10-26 14:27:03.000000000 +0200
@@ -69,6 +69,7 @@ extern void mnt_set_mountpoint(struct vf
 extern void release_mounts(struct list_head *);
 extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c	2010-10-26 14:27:02.000000000 +0200
+++ linux-2.6/fs/namespace.c	2010-10-26 14:27:03.000000000 +0200
@@ -251,6 +251,15 @@ static unsigned int count_mnt_writers(st
 #endif
 }
 
+static int mnt_is_readonly(struct vfsmount *mnt)
+{
+	if (mnt->mnt_sb->s_readonly_remount)
+		return 1;
+	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
+	smp_rmb();
+	return __mnt_is_readonly(mnt);
+}
+
 /*
  * Most r/o checks on a fs are for operations that take
  * discrete amounts of time, like a write() or unlink().
@@ -289,7 +298,7 @@ int mnt_want_write(struct vfsmount *mnt)
 	 * MNT_WRITE_HOLD is cleared.
 	 */
 	smp_rmb();
-	if (__mnt_is_readonly(mnt)) {
+	if (mnt_is_readonly(mnt)) {
 		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
@@ -406,6 +415,35 @@ static void __mnt_unmake_readonly(struct
 	br_write_unlock(vfsmount_lock);
 }
 
+int sb_prepare_remount_readonly(struct super_block *sb)
+{
+	struct vfsmount *mnt;
+	int err = 0;
+
+	br_write_lock(vfsmount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (!(mnt->mnt_flags & MNT_READONLY)) {
+			mnt->mnt_flags |= MNT_WRITE_HOLD;
+			smp_mb();
+			if (count_mnt_writers(mnt) > 0) {
+				err = -EBUSY;
+				break;
+			}
+		}
+	}
+	if (!err) {
+		sb->s_readonly_remount = 1;
+		smp_wmb();
+	}
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_flags & MNT_WRITE_HOLD)
+			mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	}
+	br_write_unlock(vfsmount_lock);
+
+	return err;
+}
+
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c	2010-10-26 14:27:02.000000000 +0200
+++ linux-2.6/fs/super.c	2010-10-26 14:27:03.000000000 +0200
@@ -573,19 +573,29 @@ int do_remount_sb(struct super_block *sb
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if (remount_ro) {
-		if (force)
+		if (force) {
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
-			return -EBUSY;
+		} else {
+			retval = sb_prepare_remount_readonly(sb);
+			if (retval)
+				return retval;
+
+			retval = -EBUSY;
+			if (!fs_may_remount_ro(sb))
+				goto cancel_readonly;
+		}
 	}
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
 		/* If forced remount, go ahead despite any errors */
 		if (retval && !force)
-			return retval;
+			goto cancel_readonly;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	/* Needs to be ordered wrt mnt_is_readonly() */
+	smp_wmb();
+	sb->s_readonly_remount = 0;
 
 	/*
 	 * Some filesystems modify their metadata via some other path than the
@@ -598,6 +608,10 @@ int do_remount_sb(struct super_block *sb
 	if (remount_ro && sb->s_bdev)
 		invalidate_bdev(sb->s_bdev);
 	return 0;
+
+cancel_readonly:
+	sb->s_readonly_remount = 0;
+	return retval;
 }
 
 static void do_emergency_remount(struct work_struct *work)
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2010-10-26 14:27:02.000000000 +0200
+++ linux-2.6/include/linux/fs.h	2010-10-26 16:06:30.000000000 +0200
@@ -1388,6 +1388,9 @@ struct super_block {
 	 * generic_show_options()
 	 */
 	char __rcu *s_options;
+
+	/* Being remounted read-only */
+	int s_readonly_remount;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);

-- 

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2011-08-24 13:22 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-03 10:48 [PATCH v5 0/4] read-only remount race fix Miklos Szeredi
2011-08-03 10:48 ` [PATCH 1/4] vfs: ignore error on forced remount Miklos Szeredi
2011-08-03 10:49   ` Christoph Hellwig
2011-08-03 16:15     ` Miklos Szeredi
2011-08-03 10:48 ` [PATCH 2/4] vfs: keep list of mounts for each superblock Miklos Szeredi
2011-08-03 21:53   ` Christoph Hellwig
2011-08-03 22:17   ` Al Viro
2011-08-04  9:10     ` Miklos Szeredi
2011-08-04  9:46       ` Jan Kara
2011-08-04  9:58         ` Miklos Szeredi
2011-08-03 10:48 ` [PATCH 3/4] vfs: protect remounting superblock read-only Miklos Szeredi
2011-08-03 10:48 ` [PATCH 4/4] vfs: fs_may_remount_ro: turn unnecessary check into a WARN_ON Miklos Szeredi
     [not found] <1314191791-29218-1-git-send-email-miklos@szeredi.hu>
2011-08-24 13:22 ` [PATCH 3/4] vfs: protect remounting superblock read-only Miklos Szeredi
  -- strict thread matches above, loose matches on Subject: below --
2010-10-26 14:16 [PATCH 0/4] read-only remount fixes Miklos Szeredi
2010-10-26 14:16 ` [PATCH 3/4] vfs: protect remounting superblock read-only Miklos Szeredi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.