All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/22] filesystem helpers for custom 'struct file's
@ 2007-02-09 22:53 Dave Hansen
  2007-02-09 22:53 ` [PATCH 02/22] r/o bind mounts: add vfsmount writer counts Dave Hansen
                   ` (21 more replies)
  0 siblings, 22 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen


Some filesystems forgo the vfs and may_open() and create their
own 'struct file's.

This patch creates a couple of helper functions which can be
used by these filesystems, and will provide a unified place
which the r/o bind mount code may patch.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/file_table.c      |   36 ++++++++++++++++++++++++++++++++++++
 lxc-dave/fs/hugetlbfs/inode.c |   22 +++++++++-------------
 lxc-dave/include/linux/file.h |    8 ++++++++
 lxc-dave/mm/shmem.c           |    7 ++-----
 lxc-dave/mm/tiny-shmem.c      |   24 +++++++++---------------
 lxc-dave/net/socket.c         |   18 +++++++++---------
 6 files changed, 73 insertions(+), 42 deletions(-)

diff -puN fs/file_table.c~01-24-filesystem-helpers-for-custom-struct-file-s fs/file_table.c
--- lxc/fs/file_table.c~01-24-filesystem-helpers-for-custom-struct-file-s	2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/fs/file_table.c	2007-02-09 14:26:46.000000000 -0800
@@ -140,6 +140,42 @@ fail:
 
 EXPORT_SYMBOL(get_empty_filp);
 
+struct file *alloc_file(struct vfsmount *mnt,  struct dentry *dentry,
+		mode_t mode, const struct file_operations *fop)
+{
+	struct file *file;
+
+	file = get_empty_filp();
+	if (!file)
+		return NULL;
+
+	init_file(file, mnt, dentry, mode, fop);
+	return file;
+}
+
+EXPORT_SYMBOL(alloc_file);
+
+/*
+ * Note: This is a crappy interface.  It is here to make
+ * merging with the existing users of get_empty_filp()
+ * who have complex failure logic easier.  All users
+ * of this should be moving to alloc_file().
+ */
+int init_file(struct file *file, struct vfsmount *mnt,
+	   struct dentry *dentry, mode_t mode,
+	   const struct file_operations *fop)
+{
+	int error = 0;
+	file->f_vfsmnt = mntget(mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = dentry->d_inode->i_mapping;
+	file->f_mode = mode;
+	file->f_op = fop;
+	return error;
+}
+
+EXPORT_SYMBOL(init_file);
+
 void fastcall fput(struct file *file)
 {
 	if (atomic_dec_and_test(&file->f_count))
diff -puN fs/hugetlbfs/inode.c~01-24-filesystem-helpers-for-custom-struct-file-s fs/hugetlbfs/inode.c
--- lxc/fs/hugetlbfs/inode.c~01-24-filesystem-helpers-for-custom-struct-file-s	2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/fs/hugetlbfs/inode.c	2007-02-09 14:26:46.000000000 -0800
@@ -756,16 +756,11 @@ struct file *hugetlb_zero_setup(size_t s
 	if (!dentry)
 		goto out_shm_unlock;
 
-	error = -ENFILE;
-	file = get_empty_filp();
-	if (!file)
-		goto out_dentry;
-
 	error = -ENOSPC;
 	inode = hugetlbfs_get_inode(root->d_sb, current->fsuid,
 				current->fsgid, S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
-		goto out_file;
+		goto out_dentry;
 
 	error = -ENOMEM;
 	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
@@ -774,17 +769,18 @@ struct file *hugetlb_zero_setup(size_t s
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
-	file->f_path.mnt = mntget(hugetlbfs_vfsmount);
-	file->f_path.dentry = dentry;
-	file->f_mapping = inode->i_mapping;
-	file->f_op = &hugetlbfs_file_operations;
-	file->f_mode = FMODE_WRITE | FMODE_READ;
+
+	error = -ENFILE;
+	file = alloc_file(hugetlbfs_vfsmount, dentry,
+			FMODE_WRITE | FMODE_READ,
+			&hugetlbfs_file_operations);
+	if (!file)
+		goto out_dentry;	/* inode is already attached */
+
 	return file;
 
 out_inode:
 	iput(inode);
-out_file:
-	put_filp(file);
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
diff -puN include/linux/file.h~01-24-filesystem-helpers-for-custom-struct-file-s include/linux/file.h
--- lxc/include/linux/file.h~01-24-filesystem-helpers-for-custom-struct-file-s	2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/include/linux/file.h	2007-02-09 14:26:46.000000000 -0800
@@ -62,6 +62,14 @@ extern struct kmem_cache *filp_cachep;
 extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
 
+struct file_operations;
+struct vfsmount;
+struct dentry;
+extern int init_file(struct file *, struct vfsmount *, struct dentry *dentry,
+		mode_t mode, const struct file_operations *fop);
+extern struct file *alloc_file(struct vfsmount *, struct dentry *dentry,
+		mode_t mode, const struct file_operations *fop);
+
 static inline void fput_light(struct file *file, int fput_needed)
 {
 	if (unlikely(fput_needed))
diff -puN mm/shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s mm/shmem.c
--- lxc/mm/shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s	2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/mm/shmem.c	2007-02-09 14:26:46.000000000 -0800
@@ -2497,11 +2497,8 @@ struct file *shmem_file_setup(char *name
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
-	file->f_path.mnt = mntget(shm_mnt);
-	file->f_path.dentry = dentry;
-	file->f_mapping = inode->i_mapping;
-	file->f_op = &shmem_file_operations;
-	file->f_mode = FMODE_WRITE | FMODE_READ;
+	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+			&shmem_file_operations);
 	return file;
 
 close_file:
diff -puN mm/tiny-shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s mm/tiny-shmem.c
--- lxc/mm/tiny-shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s	2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/mm/tiny-shmem.c	2007-02-09 14:26:46.000000000 -0800
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name
 	if (!dentry)
 		goto put_memory;
 
-	error = -ENFILE;
-	file = get_empty_filp();
-	if (!file)
-		goto put_dentry;
-
 	error = -ENOSPC;
 	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
-		goto close_file;
+		goto put_dentry;
 
 	d_instantiate(dentry, inode);
-	inode->i_nlink = 0;	/* It is unlinked */
+	error = -ENFILE;
+	file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+			&ramfs_file_operations);
+	if (!file)
+		goto put_dentry;	/* inode is already attached */
 
-	file->f_path.mnt = mntget(shm_mnt);
-	file->f_path.dentry = dentry;
-	file->f_mapping = inode->i_mapping;
-	file->f_op = &ramfs_file_operations;
-	file->f_mode = FMODE_WRITE | FMODE_READ;
+	inode->i_nlink = 0;	/* It is unlinked */
 
 	/* notify everyone as to the change of file size */
 	error = do_truncate(dentry, size, 0, file);
diff -puN net/socket.c~01-24-filesystem-helpers-for-custom-struct-file-s net/socket.c
--- lxc/net/socket.c~01-24-filesystem-helpers-for-custom-struct-file-s	2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/net/socket.c	2007-02-09 14:26:46.000000000 -0800
@@ -355,6 +355,7 @@ static int sock_alloc_fd(struct file **f
 
 static int sock_attach_fd(struct socket *sock, struct file *file)
 {
+	struct dentry *dentry;
 	struct qstr this;
 	char name[32];
 
@@ -362,24 +363,22 @@ static int sock_attach_fd(struct socket 
 	this.name = name;
 	this.hash = 0;
 
-	file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
-	if (unlikely(!file->f_path.dentry))
+	dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
+	if (unlikely(!dentry))
 		return -ENOMEM;
 
-	file->f_path.dentry->d_op = &sockfs_dentry_operations;
+	dentry->d_op = &sockfs_dentry_operations;
 	/*
 	 * We dont want to push this dentry into global dentry hash table.
 	 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
 	 * This permits a working /proc/$pid/fd/XXX on sockets
 	 */
-	file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED;
-	d_instantiate(file->f_path.dentry, SOCK_INODE(sock));
-	file->f_path.mnt = mntget(sock_mnt);
-	file->f_mapping = file->f_path.dentry->d_inode->i_mapping;
-
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, SOCK_INODE(sock));
+	init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
+		&socket_file_ops);
+	SOCK_INODE(sock)->i_fop = &socket_file_ops;
 	sock->file = file;
-	file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
-	file->f_mode = FMODE_READ | FMODE_WRITE;
 	file->f_flags = O_RDWR;
 	file->f_pos = 0;
 	file->private_data = sock;
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 02/22] r/o bind mounts: add vfsmount writer counts
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 23:41   ` Eric Dumazet
  2007-02-09 22:53 ` [PATCH 03/22] record when sb_writer_count elevated for inode Dave Hansen
                   ` (20 subsequent siblings)
  21 siblings, 1 reply; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen


This patch actually adds the mount and superblock writer
counts, and the mnt_want/drop_write() functions that use
them.

Before these can become useful, we must first cover each
place in the VFS where writes are performed with a
want/drop pair.  When that is complete, we can actually
introduce code that will safely check the counts before
allowing r/w<->r/o transitions to occur.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namespace.c        |   53 +++++++++++++++++++++++++++++++++++++++++
 lxc-dave/fs/super.c            |   18 ++++++++++---
 lxc-dave/include/linux/fs.h    |    2 +
 lxc-dave/include/linux/mount.h |   21 ++++++++++++++++
 4 files changed, 90 insertions(+), 4 deletions(-)

diff -puN fs/namespace.c~03-24-add-vfsmount-writer-count fs/namespace.c
--- lxc/fs/namespace.c~03-24-add-vfsmount-writer-count	2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/fs/namespace.c	2007-02-09 14:26:47.000000000 -0800
@@ -58,6 +58,7 @@ struct vfsmount *alloc_vfsmnt(const char
 	if (mnt) {
 		mnt->mnt_user_ns = get_user_ns(current->nsproxy->user_ns);
 		atomic_set(&mnt->mnt_count, 1);
+		mnt->mnt_writers = 0;
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -78,6 +79,56 @@ struct vfsmount *alloc_vfsmnt(const char
 	return mnt;
 }
 
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	WARN_ON(__mnt_is_readonly(mnt));
+
+	/*
+	 * This flag set is actually redundant with what
+	 * happens in do_remount(), but since we do this
+	 * under the lock, anyone attempting to get a write
+	 * on it after this will fail.
+	 */
+	spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+	if (!mnt->mnt_writers)
+		mnt->mnt_flags |= MNT_READONLY;
+	else
+		ret = -EBUSY;
+	spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+	return ret;
+}
+
+/* Take a write count on @mnt: returns 0, or -EROFS if @mnt is read-only. */
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+	/* tested under the lock that mnt_make_readonly() sets the flag under */
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	/* count every success: mnt_drop_write() decrements unconditionally */
+	mnt->mnt_sb->s_writers++;
+	mnt->mnt_writers++;
+out:
+	spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+	mnt->mnt_sb->s_writers--;
+	mnt->mnt_writers--;
+	spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
@@ -1415,6 +1466,8 @@ long do_mount(char *dev_name, char *dir_
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
 
 	/* Separate the per-mountpoint flags */
+	if (flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
 	if (flags & MS_NOSUID)
 		mnt_flags |= MNT_NOSUID;
 	if (flags & MS_NODEV)
diff -puN fs/super.c~03-24-add-vfsmount-writer-count fs/super.c
--- lxc/fs/super.c~03-24-add-vfsmount-writer-count	2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/fs/super.c	2007-02-09 14:26:47.000000000 -0800
@@ -93,6 +93,8 @@ static struct super_block *alloc_super(s
 		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		s->s_writers = 0;
+		spin_lock_init(&s->s_mnt_writers_lock);
 	}
 out:
 	return s;
@@ -576,6 +578,11 @@ static void mark_files_ro(struct super_b
 	file_list_unlock();
 }
 
+static int sb_remount_ro(struct super_block *sb)
+{
+	return fs_may_remount_ro(sb) ? 0 : -EBUSY; /* nonzero means "may" */
+}
+
 /**
  *	do_remount_sb - asks filesystem to change mount options.
  *	@sb:	superblock in question
@@ -587,7 +594,8 @@ static void mark_files_ro(struct super_b
  */
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
-	int retval;
+	int retval = 0;
+	int sb_started_ro = (sb->s_flags & MS_RDONLY);
 	
 #ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
@@ -600,11 +608,13 @@ int do_remount_sb(struct super_block *sb
 
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+	if ((flags & MS_RDONLY) && !sb_started_ro) {
 		if (force)
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
-			return -EBUSY;
+		else
+			retval = sb_remount_ro(sb);
+		if (retval)
+			return retval;
 	}
 
 	if (sb->s_op->remount_fs) {
diff -puN include/linux/fs.h~03-24-add-vfsmount-writer-count include/linux/fs.h
--- lxc/include/linux/fs.h~03-24-add-vfsmount-writer-count	2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/include/linux/fs.h	2007-02-09 14:26:47.000000000 -0800
@@ -972,6 +972,8 @@ struct super_block {
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_files;
+	int			s_writers;	/* number of files open for write */
+	spinlock_t		s_mnt_writers_lock; /* taken when mounts change rw state */
 
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
diff -puN include/linux/mount.h~03-24-add-vfsmount-writer-count include/linux/mount.h
--- lxc/include/linux/mount.h~03-24-add-vfsmount-writer-count	2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/include/linux/mount.h	2007-02-09 14:26:47.000000000 -0800
@@ -29,6 +29,7 @@ struct user_namespace;
 #define MNT_NOATIME	0x08
 #define MNT_NODIRATIME	0x10
 #define MNT_RELATIME	0x20
+#define MNT_READONLY	0x40 /* does the user want this to be r/o? */
 
 #define MNT_SHRINKABLE	0x100
 
@@ -56,6 +57,7 @@ struct vfsmount {
 	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
+	int mnt_writers;		/* nr files open for write */
 	/*
 	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
 	 * to let these frequently modified fields in a separate cache line
@@ -72,7 +74,26 @@ static inline struct vfsmount *mntget(st
 		atomic_inc(&mnt->mnt_count);
 	return mnt;
 }
+/*
+ * This is temporary for now.  We also don't want to check
+ * the SB in here because it is already checked in other
+ * code paths.  We'll have a better way to do this by
+ * the end of this series.
+ */
+static inline int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return mnt->mnt_flags & MNT_READONLY;
+}
+
+static inline void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+	WARN_ON(!__mnt_is_readonly(mnt));
+	mnt->mnt_flags &= ~MNT_READONLY;
+}
 
+extern int mnt_make_readonly(struct vfsmount *mnt);
+extern int mnt_want_write(struct vfsmount *mnt);
+extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mntput_no_expire(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
 extern void mnt_unpin(struct vfsmount *mnt);
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 03/22] record when sb_writer_count elevated for inode
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
  2007-02-09 22:53 ` [PATCH 02/22] r/o bind mounts: add vfsmount writer counts Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 04/22] elevate writer count for chown and friends Dave Hansen
                   ` (19 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen


There are a number of filesystems that do iput()s without first
having messed with i_nlink.  In order to keep from accidentally
decrementing the superblock writer count for these, we record
when the count is bumped up, so that we can properly balance
it.

I first tried to do this by assuming that, for each dec_nlink() to
zero, there was exactly one call to iput_final().  But, there are
a number of cases where this isn't true, especially in error handling
code.  Even if all of the filesystems were fixed up, it would be simple
to reintroduce new bugs imbalancing the mnt writer count.  This patch
trades that possibility for the chance that we will miss an i_nlink--,
and not bump the sb writer count.

I like the idea of screwing up writing out a single inode better than
screwing up a global superblock count imbalance that will affect
all inodes on the superblock.

Also, since this is the first non-trivial use of the inc/drop_nlink()
functions, add some kernel docs for them.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/inode.c         |    7 +++++
 lxc-dave/fs/libfs.c         |    1 
 lxc-dave/include/linux/fs.h |   58 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)

diff -puN fs/inode.c~04-24-record-when-sb-writer-count-elevated-for-inode fs/inode.c
--- lxc/fs/inode.c~04-24-record-when-sb-writer-count-elevated-for-inode	2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/fs/inode.c	2007-02-09 14:26:48.000000000 -0800
@@ -1097,10 +1097,17 @@ static inline void iput_final(struct ino
 {
 	const struct super_operations *op = inode->i_sb->s_op;
 	void (*drop)(struct inode *) = generic_drop_inode;
+	int must_drop_sb_write = (inode->i_state & I_AWAITING_FINAL_IPUT);
+	struct super_block *sb = inode->i_sb;
 
 	if (op && op->drop_inode)
 		drop = op->drop_inode;
 	drop(inode);
+	if (must_drop_sb_write) {
+		spin_lock(&sb->s_mnt_writers_lock);
+		sb->s_writers--;
+		spin_unlock(&sb->s_mnt_writers_lock);
+	}
 }
 
 /**
diff -puN fs/libfs.c~04-24-record-when-sb-writer-count-elevated-for-inode fs/libfs.c
--- lxc/fs/libfs.c~04-24-record-when-sb-writer-count-elevated-for-inode	2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/fs/libfs.c	2007-02-09 14:26:48.000000000 -0800
@@ -388,6 +388,7 @@ int simple_fill_super(struct super_block
 	 * because the root inode is 1, the files array must not contain an
 	 * entry at index 1
 	 */
+	inode->i_state |= I_AWAITING_FINAL_IPUT;
 	inode->i_ino = 1;
 	inode->i_mode = S_IFDIR | 0755;
 	inode->i_uid = inode->i_gid = 0;
diff -puN include/linux/fs.h~04-24-record-when-sb-writer-count-elevated-for-inode include/linux/fs.h
--- lxc/include/linux/fs.h~04-24-record-when-sb-writer-count-elevated-for-inode	2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/include/linux/fs.h	2007-02-09 14:26:48.000000000 -0800
@@ -1230,6 +1230,7 @@ struct super_operations {
 #define I_CLEAR			32
 #define I_NEW			64
 #define I_WILL_FREE		128
+#define I_AWAITING_FINAL_IPUT		256
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -1244,6 +1245,14 @@ static inline void mark_inode_dirty_sync
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+/**
+ * inc_nlink - directly increment an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  Currently,
+ * it is only here for parity with drop_nlink().
+ */
 static inline void inc_nlink(struct inode *inode)
 {
 	inode->i_nlink++;
@@ -1255,14 +1264,63 @@ static inline void inode_inc_link_count(
 	mark_inode_dirty(inode);
 }
 
+/**
+ * check_nlink - check an inode's status after direct
+ * 		 i_nlink modification.
+ * @inode: inode
+ *
+ * Filesystems that must set i_nlink directly (most notably
+ * clustered ones) call this afterwards so the VFS knows the
+ * inode is going away.  The sb writer count is taken at most
+ * once per inode, because iput_final() drops it only once;
+ * repeated calls with i_nlink == 0 (drop_nlink() followed by
+ * clear_nlink(), say) must not take it again.
+ */
+static inline void check_nlink(struct inode *inode)
+{
+	if (inode->i_nlink || (inode->i_state & I_AWAITING_FINAL_IPUT))
+		return;
+
+	inode->i_state |= I_AWAITING_FINAL_IPUT;
+	spin_lock(&inode->i_sb->s_mnt_writers_lock);
+	inode->i_sb->s_writers++;
+	spin_unlock(&inode->i_sb->s_mnt_writers_lock);
+}
+
+/**
+ * drop_nlink - directly drop an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  In cases
+ * where we are attempting to track writes to the
+ * filesystem, a decrement to zero means an imminent
+ * write when the file is truncated and actually unlinked
+ * on the filesystem.
+ */
 static inline void drop_nlink(struct inode *inode)
 {
 	inode->i_nlink--;
+	check_nlink(inode);
 }
 
+/**
+ * clear_nlink - directly zero an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  See
+ * drop_nlink() for why we care about i_nlink hitting zero.
+ *
+ * Note that we could do the i_state flag directly in here,
+ * but we call check_nlink() to keep the number of places
+ * where the flag is set to exactly one.  The compiler
+ * should get rid of the superfluous i_nlink check.
+ */
 static inline void clear_nlink(struct inode *inode)
 {
 	inode->i_nlink = 0;
+	check_nlink(inode);
 }
 
 static inline void inode_dec_link_count(struct inode *inode)
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 04/22] elevate writer count for chown and friends
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
  2007-02-09 22:53 ` [PATCH 02/22] r/o bind mounts: add vfsmount writer counts Dave Hansen
  2007-02-09 22:53 ` [PATCH 03/22] record when sb_writer_count elevated for inode Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 05/22] elevate mnt writers for callers of vfs_mkdir() Dave Hansen
                   ` (18 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



chown, chmod, etc. don't call permission() in the same way
that the normal "open for write" calls do.  They still
write to the filesystem, so bump the write count during
these operations.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/open.c |   37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff -puN fs/open.c~06-24-elevate-writer-count-for-chown-and-friends fs/open.c
--- lxc/fs/open.c~06-24-elevate-writer-count-for-chown-and-friends	2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/fs/open.c	2007-02-09 14:26:48.000000000 -0800
@@ -511,9 +511,12 @@ asmlinkage long sys_fchmod(unsigned int 
 	err = -EROFS;
 	if (IS_RDONLY(inode))
 		goto out_putf;
+	err = mnt_want_write(file->f_vfsmnt);
+	if (err)
+		goto out_putf;
 	err = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out_putf;
+		goto out_drop_write;
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
@@ -522,6 +525,8 @@ asmlinkage long sys_fchmod(unsigned int 
 	err = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
 
+out_drop_write:
+	mnt_drop_write(file->f_vfsmnt);
 out_putf:
 	fput(file);
 out:
@@ -541,13 +546,16 @@ asmlinkage long sys_fchmodat(int dfd, co
 		goto out;
 	inode = nd.dentry->d_inode;
 
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto dput_and_out;
 	error = -EROFS;
 	if (IS_RDONLY(inode))
-		goto dput_and_out;
+		goto out_drop_write;
 
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto dput_and_out;
+		goto out_drop_write;
 
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
@@ -557,6 +565,8 @@ asmlinkage long sys_fchmodat(int dfd, co
 	error = notify_change(nd.dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
 
+out_drop_write:
+	mnt_drop_write(nd.mnt);
 dput_and_out:
 	path_release(&nd);
 out:
@@ -582,7 +592,7 @@ static int chown_common(struct dentry * 
 	error = -EROFS;
 	if (IS_RDONLY(inode))
 		goto out;
-	error = -EPERM;
+ 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto out;
 	newattrs.ia_valid =  ATTR_CTIME;
@@ -611,7 +621,12 @@ asmlinkage long sys_chown(const char __u
 	error = user_path_walk(filename, &nd);
 	if (error)
 		goto out;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto out_release;
 	error = chown_common(nd.dentry, user, group);
+	mnt_drop_write(nd.mnt);
+out_release:
 	path_release(&nd);
 out:
 	return error;
@@ -631,7 +646,12 @@ asmlinkage long sys_fchownat(int dfd, co
 	error = __user_walk_fd(dfd, filename, follow, &nd);
 	if (error)
 		goto out;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto out_release;
 	error = chown_common(nd.dentry, user, group);
+	mnt_drop_write(nd.mnt);
+out_release:
 	path_release(&nd);
 out:
 	return error;
@@ -645,7 +665,12 @@ asmlinkage long sys_lchown(const char __
 	error = user_path_walk_link(filename, &nd);
 	if (error)
 		goto out;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto out_release;
 	error = chown_common(nd.dentry, user, group);
+	mnt_drop_write(nd.mnt);
+out_release:
 	path_release(&nd);
 out:
 	return error;
@@ -662,9 +686,14 @@ asmlinkage long sys_fchown(unsigned int 
 	if (!file)
 		goto out;
 
+	error = mnt_want_write(file->f_vfsmnt);
+	if (error)
+		goto out_fput;
 	dentry = file->f_path.dentry;
 	audit_inode(NULL, dentry->d_inode);
 	error = chown_common(dentry, user, group);
+	mnt_drop_write(file->f_vfsmnt);
+out_fput:
 	fput(file);
 out:
 	return error;
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 05/22] elevate mnt writers for callers of vfs_mkdir()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (2 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 04/22] elevate writer count for chown and friends Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 06/22] elevate write count during entire ncp_ioctl() Dave Hansen
                   ` (17 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen


elevate mnt writers for callers of vfs_mkdir()

Pretty self-explanatory.  Fits in with the rest of the series.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c            |    5 +++++
 lxc-dave/fs/nfsd/nfs4recover.c |    4 ++++
 2 files changed, 9 insertions(+)

diff -puN fs/namei.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir fs/namei.c
--- lxc/fs/namei.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir	2007-02-09 14:26:49.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:49.000000000 -0800
@@ -1963,7 +1963,12 @@ asmlinkage long sys_mkdirat(int dfd, con
 
 	if (!IS_POSIXACL(nd.dentry->d_inode))
 		mode &= ~current->fs->umask;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto out_dput;
 	error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
+	mnt_drop_write(nd.mnt);
+out_dput:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&nd.dentry->d_inode->i_mutex);
diff -puN fs/nfsd/nfs4recover.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir fs/nfsd/nfs4recover.c
--- lxc/fs/nfsd/nfs4recover.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir	2007-02-09 14:26:49.000000000 -0800
+++ lxc-dave/fs/nfsd/nfs4recover.c	2007-02-09 14:26:49.000000000 -0800
@@ -156,7 +156,11 @@ nfsd4_create_clid_dir(struct nfs4_client
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
+	status = mnt_want_write(rec_dir.mnt);
+	if (status)
+		goto out_put;
 	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_dir.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 06/22] elevate write count during entire ncp_ioctl()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (3 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 05/22] elevate mnt writers for callers of vfs_mkdir() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 07/22] elevate write count for link and symlink calls Dave Hansen
                   ` (16 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



Some ioctls need write access, but others don't.  Make a helper
function to decide when write access is needed, and take it.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/ncpfs/ioctl.c |   55 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff -puN fs/ncpfs/ioctl.c~08-24-elevate-write-count-during-entire-ncp-ioctl fs/ncpfs/ioctl.c
--- lxc/fs/ncpfs/ioctl.c~08-24-elevate-write-count-during-entire-ncp-ioctl	2007-02-09 14:26:50.000000000 -0800
+++ lxc-dave/fs/ncpfs/ioctl.c	2007-02-09 14:26:50.000000000 -0800
@@ -14,6 +14,7 @@
 #include <linux/ioctl.h>
 #include <linux/time.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
@@ -260,7 +261,7 @@ ncp_get_charsets(struct ncp_server* serv
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
 	      unsigned int cmd, unsigned long arg)
 {
 	struct ncp_server *server = NCP_SERVER(inode);
@@ -821,6 +822,58 @@ outrel:			
 	return -EINVAL;
 }
 
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+	switch (cmd) {
+	case NCP_IOC_GET_FS_INFO:
+	case NCP_IOC_GET_FS_INFO_V2:
+	case NCP_IOC_NCPREQUEST:
+	case NCP_IOC_SETDENTRYTTL:
+	case NCP_IOC_SIGN_INIT:
+	case NCP_IOC_LOCKUNLOCK:
+	case NCP_IOC_SET_SIGN_WANTED:
+		return 1;
+	case NCP_IOC_GETOBJECTNAME:
+	case NCP_IOC_SETOBJECTNAME:
+	case NCP_IOC_GETPRIVATEDATA:
+	case NCP_IOC_SETPRIVATEDATA:
+	case NCP_IOC_SETCHARSETS:
+	case NCP_IOC_GETCHARSETS:
+	case NCP_IOC_CONN_LOGGED_IN:
+	case NCP_IOC_GETDENTRYTTL:
+	case NCP_IOC_GETMOUNTUID2:
+	case NCP_IOC_SIGN_WANTED:
+	case NCP_IOC_GETROOT:
+	case NCP_IOC_SETROOT:
+		return 0;
+	default:
+		/* unknown ioctl command: warn and assume it writes */
+		WARN_ON(1);
+	}
+	return 1;
+}
+
+int ncp_ioctl(struct inode *inode, struct file *filp,
+	      unsigned int cmd, unsigned long arg)
+{
+	/* classify once: an unknown cmd WARNs on every classification */
+	int need_write = ncp_ioctl_need_write(cmd);
+	int ret;
+
+	if (need_write) {
+		/*
+		 * Inside the ioctl, failures caused by file_permission()
+		 * are reported as -EACCES, so report the same error here.
+		 */
+		if (mnt_want_write(filp->f_vfsmnt))
+			return -EACCES;
+	}
+	ret = __ncp_ioctl(inode, filp, cmd, arg);
+	if (need_write)
+		mnt_drop_write(filp->f_vfsmnt);
+	return ret;
+}
+
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 08/22] elevate mount count for extended attributes
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (5 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 07/22] elevate write count for link and symlink calls Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 09/22] mount_is_safe(): add comment Dave Hansen
                   ` (14 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



This basically audits the callers of xattr_permission(), which
calls permission() and can perform writes to the filesystem.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/nfsd/nfs4proc.c |    7 ++++++-
 lxc-dave/fs/xattr.c         |   14 ++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff -puN fs/nfsd/nfs4proc.c~10-24-elevate-mount-count-for-extended-attributes fs/nfsd/nfs4proc.c
--- lxc/fs/nfsd/nfs4proc.c~10-24-elevate-mount-count-for-extended-attributes	2007-02-09 14:26:51.000000000 -0800
+++ lxc-dave/fs/nfsd/nfs4proc.c	2007-02-09 14:26:51.000000000 -0800
@@ -626,14 +626,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
 			return status;
 		}
 	}
+	status = mnt_want_write(cstate->current_fh.fh_export->ex_mnt);
+	if (status)
+		return status;
 	status = nfs_ok;
 	if (setattr->sa_acl != NULL)
 		status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
 					    setattr->sa_acl);
 	if (status)
-		return status;
+		goto out;
 	status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
 				0, (time_t)0);
+out:
+	mnt_drop_write(cstate->current_fh.fh_export->ex_mnt);
 	return status;
 }
 
diff -puN fs/xattr.c~10-24-elevate-mount-count-for-extended-attributes fs/xattr.c
--- lxc/fs/xattr.c~10-24-elevate-mount-count-for-extended-attributes	2007-02-09 14:26:51.000000000 -0800
+++ lxc-dave/fs/xattr.c	2007-02-09 14:26:51.000000000 -0800
@@ -12,6 +12,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/xattr.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -237,7 +238,11 @@ sys_setxattr(char __user *path, char __u
 	error = user_path_walk(path, &nd);
 	if (error)
 		return error;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		return error;
 	error = setxattr(nd.dentry, name, value, size, flags);
+	mnt_drop_write(nd.mnt);
 	path_release(&nd);
 	return error;
 }
@@ -252,7 +257,11 @@ sys_lsetxattr(char __user *path, char __
 	error = user_path_walk_link(path, &nd);
 	if (error)
 		return error;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		return error;
 	error = setxattr(nd.dentry, name, value, size, flags);
+	mnt_drop_write(nd.mnt);
 	path_release(&nd);
 	return error;
 }
@@ -268,9 +277,14 @@ sys_fsetxattr(int fd, char __user *name,
 	f = fget(fd);
 	if (!f)
 		return error;
+	error = mnt_want_write(f->f_vfsmnt);
+	if (error)
+		goto out_fput;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry->d_inode);
 	error = setxattr(dentry, name, value, size, flags);
+	mnt_drop_write(f->f_vfsmnt);
+out_fput:
 	fput(f);
 	return error;
 }
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 07/22] elevate write count for link and symlink calls
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (4 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 06/22] elevate write count during entire ncp_ioctl() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 08/22] elevate mount count for extended attributes Dave Hansen
                   ` (15 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen




Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c |   10 ++++++++++
 1 file changed, 10 insertions(+)

diff -puN fs/namei.c~09-24-elevate-write-count-for-link-and-symlink-calls fs/namei.c
--- lxc/fs/namei.c~09-24-elevate-write-count-for-link-and-symlink-calls	2007-02-09 14:26:50.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:50.000000000 -0800
@@ -2236,7 +2236,12 @@ asmlinkage long sys_symlinkat(const char
 	if (IS_ERR(dentry))
 		goto out_unlock;
 
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto out_dput;
 	error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
+	mnt_drop_write(nd.mnt);
+out_dput:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&nd.dentry->d_inode->i_mutex);
@@ -2331,7 +2336,12 @@ asmlinkage long sys_linkat(int olddfd, c
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
 		goto out_unlock;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto out_dput;
 	error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+	mnt_drop_write(nd.mnt);
+out_dput:
 	dput(new_dentry);
 out_unlock:
 	mutex_unlock(&nd.dentry->d_inode->i_mutex);
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 09/22] mount_is_safe(): add comment
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (6 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 08/22] elevate mount count for extended attributes Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 10/22] unix_find_other() elevate write count for touch_atime() Dave Hansen
                   ` (13 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



This area of code is currently #ifdef'd out, so add a comment
for the time when it is actually used.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namespace.c |    4 ++++
 1 file changed, 4 insertions(+)

diff -puN fs/namespace.c~11-24-mount-is-safe-add-comment fs/namespace.c
--- lxc/fs/namespace.c~11-24-mount-is-safe-add-comment	2007-02-09 14:26:52.000000000 -0800
+++ lxc-dave/fs/namespace.c	2007-02-09 14:26:52.000000000 -0800
@@ -744,6 +744,10 @@ static int mount_is_safe(struct nameidat
 		if (current->uid != nd->dentry->d_inode->i_uid)
 			return -EPERM;
 	}
+	/*
+	 * We will eventually check for the mnt->writer_count here,
+	 * but since the code is not used now, skip it - Dave Hansen
+	 */
 	if (vfs_permission(nd, MAY_WRITE))
 		return -EPERM;
 	return 0;
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 10/22] unix_find_other() elevate write count for touch_atime()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (7 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 09/22] mount_is_safe(): add comment Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 12/22] elevate write count files are open()ed Dave Hansen
                   ` (12 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen




Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/net/unix/af_unix.c |   16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff -puN net/unix/af_unix.c~12-24-unix-find-other-elevate-write-count-for-touch-atime net/unix/af_unix.c
--- lxc/net/unix/af_unix.c~12-24-unix-find-other-elevate-write-count-for-touch-atime	2007-02-09 14:26:52.000000000 -0800
+++ lxc-dave/net/unix/af_unix.c	2007-02-09 14:26:52.000000000 -0800
@@ -703,21 +703,27 @@ static struct sock *unix_find_other(stru
 		err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
 		if (err)
 			goto fail;
+
+		err = mnt_want_write(nd.mnt);
+		if (err)
+			goto put_path_fail;
+
 		err = vfs_permission(&nd, MAY_WRITE);
 		if (err)
-			goto put_fail;
+			goto mnt_drop_write_fail;
 
 		err = -ECONNREFUSED;
 		if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
-			goto put_fail;
+			goto mnt_drop_write_fail;
 		u=unix_find_socket_byinode(nd.dentry->d_inode);
 		if (!u)
-			goto put_fail;
+			goto mnt_drop_write_fail;
 
 		if (u->sk_type == type)
 			touch_atime(nd.mnt, nd.dentry);
 
 		path_release(&nd);
+		mnt_drop_write(nd.mnt);
 
 		err=-EPROTOTYPE;
 		if (u->sk_type != type) {
@@ -737,7 +743,9 @@ static struct sock *unix_find_other(stru
 	}
 	return u;
 
-put_fail:
+mnt_drop_write_fail:
+	mnt_drop_write(nd.mnt);
+put_path_fail:
 	path_release(&nd);
 fail:
 	*error=err;
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 11/22] elevate write count over calls to vfs_rename()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (9 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 12/22] elevate write count files are open()ed Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 13/22] elevate writer count for do_sys_truncate() Dave Hansen
                   ` (10 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



This does create a little helper in the NFS code to
make an if() a little bit less ugly.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c    |    4 ++++
 lxc-dave/fs/nfsd/vfs.c |   23 +++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

diff -puN fs/namei.c~13-24-elevate-write-count-over-calls-to-vfs-rename fs/namei.c
--- lxc/fs/namei.c~13-24-elevate-write-count-over-calls-to-vfs-rename	2007-02-09 14:26:53.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:53.000000000 -0800
@@ -2567,8 +2567,12 @@ static int do_rename(int olddfd, const c
 	if (new_dentry == trap)
 		goto exit5;
 
+	error = mnt_want_write(oldnd.mnt);
+	if (error)
+		goto exit5;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 				   new_dir->d_inode, new_dentry);
+	mnt_drop_write(oldnd.mnt);
 exit5:
 	dput(new_dentry);
 exit4:
diff -puN fs/nfsd/vfs.c~13-24-elevate-write-count-over-calls-to-vfs-rename fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~13-24-elevate-write-count-over-calls-to-vfs-rename	2007-02-09 14:26:53.000000000 -0800
+++ lxc-dave/fs/nfsd/vfs.c	2007-02-09 14:26:53.000000000 -0800
@@ -1555,6 +1555,14 @@ out_nfserr:
 	goto out_unlock;
 }
 
+static inline int svc_msnfs(struct svc_fh *ffhp)
+{
+#ifdef MSNFS
+	return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);	/* nonzero iff the export has NFSEXP_MSNFS set */
+#else /* !MSNFS */
+	return 0;	/* MSNFS not defined at build time: always false */
+#endif /* MSNFS */
+}
 /*
  * Rename a file
  * N.B. After this call _both_ ffhp and tfhp need an fh_put
@@ -1616,13 +1624,20 @@ nfsd_rename(struct svc_rqst *rqstp, stru
 	if (ndentry == trap)
 		goto out_dput_new;
 
-#ifdef MSNFS
-	if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+	if (svc_msnfs(ffhp) &&
 		((atomic_read(&odentry->d_count) > 1)
 		 || (atomic_read(&ndentry->d_count) > 1))) {
 			host_err = -EPERM;
-	} else
-#endif
+			goto out_dput_new;
+	}
+
+	host_err = -EXDEV;
+	if (ffhp->fh_export->ex_mnt != tfhp->fh_export->ex_mnt)
+		goto out_dput_new;
+	host_err = mnt_want_write(ffhp->fh_export->ex_mnt);
+	if (host_err)
+		goto out_dput_new;
+
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
 		host_err = nfsd_sync_dir(tdentry);
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 12/22] elevate write count files are open()ed
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (8 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 10/22] unix_find_other() elevate write count for touch_atime() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-13  5:11   ` Andrew Morton
  2007-02-09 22:53 ` [PATCH 11/22] elevate write count over calls to vfs_rename() Dave Hansen
                   ` (11 subsequent siblings)
  21 siblings, 1 reply; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



This is the first really tricky patch in the series.  It
elevates the writer count on a mount each time a
non-special file is opened for write.

This is not completely apparent in the patch because the
two if() conditions in may_open() above the
mnt_want_write() call are, combined, equivalent to
special_file().

There is also an elevated count around the vfs_create()
call in open_namei().  The count needs to be kept elevated
all the way into the may_open() call.  Otherwise, when the
write is dropped, a rw->ro transition could occur.  This
would lead to having rw access on the newly created file,
while the vfsmount is ro.  That is bad.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/file_table.c |    5 ++++-
 lxc-dave/fs/namei.c      |   22 ++++++++++++++++++----
 lxc-dave/ipc/mqueue.c    |    3 +++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff -puN fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/file_table.c
--- lxc/fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
+++ lxc-dave/fs/file_table.c	2007-02-09 14:26:54.000000000 -0800
@@ -209,8 +209,11 @@ void fastcall __fput(struct file *file)
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
-	if (file->f_mode & FMODE_WRITE)
+	if (file->f_mode & FMODE_WRITE) {
 		put_write_access(inode);
+		if(!special_file(inode->i_mode))
+			mnt_drop_write(mnt);
+	}
 	put_pid(file->f_owner.pid);
 	put_user_ns(file->f_owner.user_ns);
 	file_kill(file);
diff -puN fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/namei.c
--- lxc/fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:54.000000000 -0800
@@ -1548,8 +1548,17 @@ int may_open(struct nameidata *nd, int a
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
-	} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
-		return -EROFS;
+	} else if (flag & FMODE_WRITE) {
+		/*
+		 * effectively: !special_file()
+		 * balanced by __fput()
+		 */
+		error = mnt_want_write(nd->mnt);
+		if (error)
+			return error;
+		if (IS_RDONLY(inode))
+			return -EROFS;
+	}
 	/*
 	 * An append-only file must be opened in append mode for writing.
 	 */
@@ -1688,14 +1697,17 @@ do_last:
 	}
 
 	if (IS_ERR(nd->intent.open.file)) {
-		mutex_unlock(&dir->d_inode->i_mutex);
 		error = PTR_ERR(nd->intent.open.file);
-		goto exit_dput;
+		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
+		error = mnt_want_write(nd->mnt);
+		if (error)
+			goto exit_mutex_unlock;
 		error = open_namei_create(nd, &path, flag, mode);
+		mnt_drop_write(nd->mnt);
 		if (error)
 			goto exit;
 		return 0;
@@ -1733,6 +1745,8 @@ ok:
 		goto exit;
 	return 0;
 
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	dput_path(&path, nd);
 exit:
diff -puN ipc/mqueue.c~14-24-tricky-elevate-write-count-files-are-open-ed ipc/mqueue.c
--- lxc/ipc/mqueue.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
+++ lxc-dave/ipc/mqueue.c	2007-02-09 14:26:54.000000000 -0800
@@ -687,6 +687,9 @@ asmlinkage long sys_mq_open(const char _
 				goto out;
 			filp = do_open(dentry, oflag);
 		} else {
+			error = mnt_want_write(mqueue_mnt);
+			if (error)
+				goto out;
 			filp = do_create(mqueue_mnt->mnt_root, dentry,
 						oflag, mode, u_attr);
 		}
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 13/22] elevate writer count for do_sys_truncate()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (10 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 11/22] elevate write count over calls to vfs_rename() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 14/22] elevate write count for do_utimes() Dave Hansen
                   ` (9 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen




Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/open.c |   16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff -puN fs/open.c~15-24-elevate-writer-count-for-do-sys-truncate fs/open.c
--- lxc/fs/open.c~15-24-elevate-writer-count-for-do-sys-truncate	2007-02-09 14:26:55.000000000 -0800
+++ lxc-dave/fs/open.c	2007-02-09 14:26:55.000000000 -0800
@@ -241,28 +241,32 @@ static long do_sys_truncate(const char _
 	if (!S_ISREG(inode->i_mode))
 		goto dput_and_out;
 
-	error = vfs_permission(&nd, MAY_WRITE);
+	error = mnt_want_write(nd.mnt);
 	if (error)
 		goto dput_and_out;
 
+	error = vfs_permission(&nd, MAY_WRITE);
+	if (error)
+		goto mnt_drop_write_and_out;
+
 	error = -EROFS;
 	if (IS_RDONLY(inode))
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	/*
 	 * Make sure that there are no leases.
 	 */
 	error = break_lease(inode, FMODE_WRITE);
 	if (error)
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	error = get_write_access(inode);
 	if (error)
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error) {
@@ -271,6 +275,8 @@ static long do_sys_truncate(const char _
 	}
 	put_write_access(inode);
 
+mnt_drop_write_and_out:
+	mnt_drop_write(nd.mnt);
 dput_and_out:
 	path_release(&nd);
 out:
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 14/22] elevate write count for do_utimes()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (11 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 13/22] elevate writer count for do_sys_truncate() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 15/22] elevate write count for do_sys_utime() and touch_atime() Dave Hansen
                   ` (8 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen




Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/utimes.c |   13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff -puN fs/utimes.c~16-24-elevate-write-count-for-do-utimes fs/utimes.c
--- lxc/fs/utimes.c~16-24-elevate-write-count-for-do-utimes	2007-02-09 14:26:55.000000000 -0800
+++ lxc-dave/fs/utimes.c	2007-02-09 14:26:55.000000000 -0800
@@ -58,16 +58,19 @@ static long do_utimes_nsec(int dfd, char
 		goto out;
 	inode = nd.dentry->d_inode;
 
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto dput_and_out;
 	error = -EROFS;
 	if (IS_RDONLY(inode))
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	/* Don't worry, the checks are done in inode_change_ok() */
 	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
 	if (times) {
 		error = -EPERM;
                 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-                        goto dput_and_out;
+			goto mnt_drop_write_and_out;
 
 		newattrs.ia_atime = times[0];
 		newattrs.ia_mtime = times[1];
@@ -75,15 +78,17 @@ static long do_utimes_nsec(int dfd, char
 	} else {
 		error = -EACCES;
                 if (IS_IMMUTABLE(inode))
-                        goto dput_and_out;
+			goto mnt_drop_write_and_out;
 
 		if (current->fsuid != inode->i_uid &&
 		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
-			goto dput_and_out;
+			goto mnt_drop_write_and_out;
 	}
 	mutex_lock(&inode->i_mutex);
 	error = notify_change(nd.dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+	mnt_drop_write(nd.mnt);
 dput_and_out:
 	path_release(&nd);
 out:
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 15/22] elevate write count for do_sys_utime() and touch_atime()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (12 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 14/22] elevate write count for do_utimes() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 16/22] sys_mknodat(): elevate write count for vfs_mknod/create() Dave Hansen
                   ` (7 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen




Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/inode.c |   20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff -puN fs/inode.c~17-24-elevate-write-count-for-do-sys-utime-and-touch-atime fs/inode.c
--- lxc/fs/inode.c~17-24-elevate-write-count-for-do-sys-utime-and-touch-atime	2007-02-09 14:26:56.000000000 -0800
+++ lxc-dave/fs/inode.c	2007-02-09 14:26:56.000000000 -0800
@@ -1170,22 +1170,23 @@ void touch_atime(struct vfsmount *mnt, s
 	struct inode *inode = dentry->d_inode;
 	struct timespec now;
 
-	if (inode->i_flags & S_NOATIME)
+	if (mnt && mnt_want_write(mnt))
 		return;
+	if (inode->i_flags & S_NOATIME)
+		goto out;
 	if (IS_NOATIME(inode))
-		return;
+		goto out;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-		return;
+		goto out;
 
 	/*
 	 * We may have a NULL vfsmount when coming from NFSD
 	 */
 	if (mnt) {
 		if (mnt->mnt_flags & MNT_NOATIME)
-			return;
+			goto out;
 		if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-			return;
-
+			goto out;
 		if (mnt->mnt_flags & MNT_RELATIME) {
 			/*
 			 * With relative atime, only update atime if the
@@ -1196,16 +1197,19 @@ void touch_atime(struct vfsmount *mnt, s
 						&inode->i_atime) < 0 &&
 			    timespec_compare(&inode->i_ctime,
 						&inode->i_atime) < 0)
-				return;
+				goto out;
 		}
 	}
 
 	now = current_fs_time(inode->i_sb);
 	if (timespec_equal(&inode->i_atime, &now))
-		return;
+		goto out;
 
 	inode->i_atime = now;
 	mark_inode_dirty_sync(inode);
+out:
+	if (mnt)
+		mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL(touch_atime);
 
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 16/22] sys_mknodat(): elevate write count for vfs_mknod/create()
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (13 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 15/22] elevate write count for do_sys_utime() and touch_atime() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 17/22] elevate mnt writers for vfs_unlink() callers Dave Hansen
                   ` (6 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



This takes care of all of the direct callers of vfs_mknod().
Since a few of these cases also handle normal file creation
as well, this also covers some calls to vfs_create().

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c         |   12 ++++++++++++
 lxc-dave/fs/nfsd/vfs.c      |    4 ++++
 lxc-dave/net/unix/af_unix.c |    4 ++++
 3 files changed, 20 insertions(+)

diff -puN fs/namei.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create fs/namei.c
--- lxc/fs/namei.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create	2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:57.000000000 -0800
@@ -1903,14 +1903,26 @@ asmlinkage long sys_mknodat(int dfd, con
 	if (!IS_ERR(dentry)) {
 		switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
+			error = mnt_want_write(nd.mnt);
+			if (error)
+				break;
 			error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd);
+			mnt_drop_write(nd.mnt);
 			break;
 		case S_IFCHR: case S_IFBLK:
+			error = mnt_want_write(nd.mnt);
+			if (error)
+				break;
 			error = vfs_mknod(nd.dentry->d_inode,dentry,mode,
 					new_decode_dev(dev));
+			mnt_drop_write(nd.mnt);
 			break;
 		case S_IFIFO: case S_IFSOCK:
+			error = mnt_want_write(nd.mnt);
+			if (error)
+				break;
 			error = vfs_mknod(nd.dentry->d_inode,dentry,mode,0);
+			mnt_drop_write(nd.mnt);
 			break;
 		case S_IFDIR:
 			error = -EPERM;
diff -puN fs/nfsd/vfs.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create	2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/fs/nfsd/vfs.c	2007-02-09 14:26:57.000000000 -0800
@@ -664,6 +664,9 @@ nfsd_open(struct svc_rqst *rqstp, struct
 	/* Disallow write access to files with the append-only bit set
 	 * or any access when mandatory locking enabled
 	 */
+	err = mnt_want_write(fhp->fh_export->ex_mnt);
+	if (err)
+		goto out_nfserr;
 	err = nfserr_perm;
 	if (IS_APPEND(inode) && (access & MAY_WRITE))
 		goto out;
@@ -1199,6 +1202,7 @@ nfsd_create(struct svc_rqst *rqstp, stru
 	        printk("nfsd: bad file type %o in nfsd_create\n", type);
 		host_err = -EINVAL;
 	}
+	mnt_drop_write(fhp->fh_export->ex_mnt);
 	if (host_err < 0)
 		goto out_nfserr;
 
diff -puN net/unix/af_unix.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create net/unix/af_unix.c
--- lxc/net/unix/af_unix.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create	2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/net/unix/af_unix.c	2007-02-09 14:26:57.000000000 -0800
@@ -816,7 +816,11 @@ static int unix_bind(struct socket *sock
 		 */
 		mode = S_IFSOCK |
 		       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
+		err = mnt_want_write(nd.mnt);
+		if (err)
+			goto out_mknod_dput;
 		err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
+		mnt_drop_write(nd.mnt);
 		if (err)
 			goto out_mknod_dput;
 		mutex_unlock(&nd.dentry->d_inode->i_mutex);
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 17/22] elevate mnt writers for vfs_unlink() callers
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (14 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 16/22] sys_mknodat(): elevate write count for vfs_mknod/create() Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 18/22] do_rmdir(): elevate write count Dave Hansen
                   ` (5 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen




Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c   |    4 ++++
 lxc-dave/ipc/mqueue.c |    5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff -puN fs/namei.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers fs/namei.c
--- lxc/fs/namei.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers	2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:57.000000000 -0800
@@ -2181,7 +2181,11 @@ static long do_unlinkat(int dfd, const c
 		inode = dentry->d_inode;
 		if (inode)
 			atomic_inc(&inode->i_count);
+		error = mnt_want_write(nd.mnt);
+		if (error)
+			goto exit2;
 		error = vfs_unlink(nd.dentry->d_inode, dentry);
+		mnt_drop_write(nd.mnt);
 	exit2:
 		dput(dentry);
 	}
diff -puN ipc/mqueue.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers ipc/mqueue.c
--- lxc/ipc/mqueue.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers	2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/ipc/mqueue.c	2007-02-09 14:26:57.000000000 -0800
@@ -749,8 +749,11 @@ asmlinkage long sys_mq_unlink(const char
 	inode = dentry->d_inode;
 	if (inode)
 		atomic_inc(&inode->i_count);
-
+	err = mnt_want_write(mqueue_mnt);
+	if (err)
+		goto out_err;
 	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+	mnt_drop_write(mqueue_mnt);
 out_err:
 	dput(dentry);
 
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 18/22] do_rmdir(): elevate write count
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (15 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 17/22] elevate mnt writers for vfs_unlink() callers Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 19/22] elevate writer count for custom struct_file Dave Hansen
                   ` (4 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



Elevate the write count during the vfs_rmdir() call.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c |    5 +++++
 1 file changed, 5 insertions(+)

diff -puN fs/namei.c~20-24-do-rmdir-elevate-write-count fs/namei.c
--- lxc/fs/namei.c~20-24-do-rmdir-elevate-write-count	2007-02-09 14:26:58.000000000 -0800
+++ lxc-dave/fs/namei.c	2007-02-09 14:26:58.000000000 -0800
@@ -2101,7 +2101,12 @@ static long do_rmdir(int dfd, const char
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit2;
+	error = mnt_want_write(nd.mnt);
+	if (error)
+		goto exit3;
 	error = vfs_rmdir(nd.dentry->d_inode, dentry);
+	mnt_drop_write(nd.mnt);
+exit3:
 	dput(dentry);
 exit2:
 	mutex_unlock(&nd.dentry->d_inode->i_mutex);
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 20/22] [PATCH] gfs: check nlink count
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (17 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 19/22] elevate writer count for custom struct_file Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 21/22] honor r/w changes at do_remount() time Dave Hansen
                   ` (2 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen

---

 lxc-dave/fs/gfs2/inode.c |    1 +
 1 file changed, 1 insertion(+)

diff -puN fs/gfs2/inode.c~gfs-check-nlink-count fs/gfs2/inode.c
--- lxc/fs/gfs2/inode.c~gfs-check-nlink-count	2007-02-09 14:26:59.000000000 -0800
+++ lxc-dave/fs/gfs2/inode.c	2007-02-09 14:26:59.000000000 -0800
@@ -169,6 +169,7 @@ static int gfs2_dinode_in(struct gfs2_in
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
+	check_nlink(&ip->i_inode);
 	di->di_size = be64_to_cpu(str->di_size);
 	i_size_write(&ip->i_inode, di->di_size);
 	di->di_blocks = be64_to_cpu(str->di_blocks);
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 19/22] elevate writer count for custom struct_file
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (16 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 18/22] do_rmdir(): elevate write count Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 22:53 ` [PATCH 20/22] [PATCH] gfs: check nlink count Dave Hansen
                   ` (3 subsequent siblings)
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



Some filesystems forego the use of normal vfs calls to create
struct files.  Make sure that these users elevate the mnt writer
count.  These probably don't have any real meaning because there
is no real backing store for these mounts, but it is here for
consistency.


Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/file_table.c |    4 ++++
 1 file changed, 4 insertions(+)

diff -puN fs/file_table.c~22-24-elevate-writer-count-for-custom-struct-file fs/file_table.c
--- lxc/fs/file_table.c~22-24-elevate-writer-count-for-custom-struct-file	2007-02-09 14:26:59.000000000 -0800
+++ lxc-dave/fs/file_table.c	2007-02-09 14:26:59.000000000 -0800
@@ -171,6 +171,10 @@ int init_file(struct file *file, struct 
 	file->f_mapping = dentry->d_inode->i_mapping;
 	file->f_mode = mode;
 	file->f_op = fop;
+	if (mode & FMODE_WRITE) {
+		error = mnt_want_write(mnt);
+		WARN_ON(error);
+	}
 	return error;
 }
 
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 21/22] honor r/w changes at do_remount() time
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (18 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 20/22] [PATCH] gfs: check nlink count Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 23:22   ` Andrew Morton
  2007-02-09 22:53 ` [PATCH 22/22] kill open files traverse on remount ro Dave Hansen
  2007-02-09 23:18 ` [PATCH 01/22] filesystem helpers for custom 'struct file's Andrew Morton
  21 siblings, 1 reply; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen



Originally from: Herbert Poetzl <herbert@13thfloor.at>

This is the core of the read-only bind mount patch set.

Note that this does _not_ add a "ro" option directly to
the bind mount operation.  If you require such a mount,
you must first do the bind, then follow it up with a
'mount -o remount,ro' operation.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namespace.c |   24 ++++++++++++++++++++++--
 lxc-dave/fs/open.c      |    2 +-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff -puN fs/namespace.c~23-24-honor-r-w-changes-at-do-remount-time fs/namespace.c
--- lxc/fs/namespace.c~23-24-honor-r-w-changes-at-do-remount-time	2007-02-09 14:27:00.000000000 -0800
+++ lxc-dave/fs/namespace.c	2007-02-09 14:27:00.000000000 -0800
@@ -443,7 +443,7 @@ static int show_vfsmnt(struct seq_file *
 	seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
 	seq_putc(m, ' ');
 	mangle(m, mnt->mnt_sb->s_type->name);
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
 	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 		if (mnt->mnt_sb->s_flags & fs_infop->flag)
 			seq_puts(m, fs_infop->str);
@@ -1017,6 +1017,23 @@ out:
 	return err;
 }
 
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+	int error = 0;
+	int readonly_request = 0;
+
+	if (ms_flags & MS_RDONLY)
+		readonly_request = 1;
+	if (readonly_request == __mnt_is_readonly(mnt))
+		return 0;
+
+	if (readonly_request)
+		error = mnt_make_readonly(mnt);
+	else
+		__mnt_unmake_readonly(mnt);
+	return error;
+}
+
 /*
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
@@ -1038,7 +1055,10 @@ static int do_remount(struct nameidata *
 		return -EINVAL;
 
 	down_write(&sb->s_umount);
-	err = do_remount_sb(sb, flags, data, 0);
+	if (flags & MS_BIND)
+		err = change_mount_flags(nd->mnt, flags);
+	else
+		err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
 		nd->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
diff -puN fs/open.c~23-24-honor-r-w-changes-at-do-remount-time fs/open.c
--- lxc/fs/open.c~23-24-honor-r-w-changes-at-do-remount-time	2007-02-09 14:27:00.000000000 -0800
+++ lxc-dave/fs/open.c	2007-02-09 14:27:00.000000000 -0800
@@ -401,7 +401,7 @@ asmlinkage long sys_faccessat(int dfd, c
 	   special_file(nd.dentry->d_inode->i_mode))
 		goto out_path_release;
 
-	if(IS_RDONLY(nd.dentry->d_inode))
+	if(__mnt_is_readonly(nd.mnt) || IS_RDONLY(nd.dentry->d_inode))
 		res = -EROFS;
 
 out_path_release:
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 22/22] kill open files traverse on remount ro
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (19 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 21/22] honor r/w changes at do_remount() time Dave Hansen
@ 2007-02-09 22:53 ` Dave Hansen
  2007-02-09 23:18 ` [PATCH 01/22] filesystem helpers for custom 'struct file's Andrew Morton
  21 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-09 22:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, hch, Dave Hansen


Now that we have the sb writer count, and all of the
writers marked with mnt_want_write(), we don't need to
go looking at all of the individual open files.

Kill the open files walk, and use the sb writer count.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/file_table.c    |   25 -------------------------
 lxc-dave/fs/super.c         |   13 ++++++++++++-
 lxc-dave/include/linux/fs.h |    2 --
 3 files changed, 12 insertions(+), 28 deletions(-)

diff -puN fs/file_table.c~24-24-kill-open-files-traverse-on-remount-ro fs/file_table.c
--- lxc/fs/file_table.c~24-24-kill-open-files-traverse-on-remount-ro	2007-02-09 14:27:01.000000000 -0800
+++ lxc-dave/fs/file_table.c	2007-02-09 14:27:01.000000000 -0800
@@ -308,31 +308,6 @@ void file_kill(struct file *file)
 	}
 }
 
-int fs_may_remount_ro(struct super_block *sb)
-{
-	struct list_head *p;
-
-	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *file = list_entry(p, struct file, f_u.fu_list);
-		struct inode *inode = file->f_path.dentry->d_inode;
-
-		/* File with pending delete? */
-		if (inode->i_nlink == 0)
-			goto too_bad;
-
-		/* Writeable file? */
-		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
-			goto too_bad;
-	}
-	file_list_unlock();
-	return 1; /* Tis' cool bro. */
-too_bad:
-	file_list_unlock();
-	return 0;
-}
-
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
diff -puN fs/super.c~24-24-kill-open-files-traverse-on-remount-ro fs/super.c
--- lxc/fs/super.c~24-24-kill-open-files-traverse-on-remount-ro	2007-02-09 14:27:01.000000000 -0800
+++ lxc-dave/fs/super.c	2007-02-09 14:27:01.000000000 -0800
@@ -580,7 +580,18 @@ static void mark_files_ro(struct super_b
 
 static int sb_remount_ro(struct super_block *sb)
 {
-	return fs_may_remount_ro(sb);
+	int ret = 0;
+
+	/*
+	 * The r/o flag actually gets set
+	 * by the caller.
+	 */
+	spin_lock(&sb->s_mnt_writers_lock);
+	if (sb->s_writers)
+		ret = -EBUSY;
+	spin_unlock(&sb->s_mnt_writers_lock);
+
+	return ret;
 }
 
 /**
diff -puN include/linux/fs.h~24-24-kill-open-files-traverse-on-remount-ro include/linux/fs.h
--- lxc/include/linux/fs.h~24-24-kill-open-files-traverse-on-remount-ro	2007-02-09 14:27:01.000000000 -0800
+++ lxc-dave/include/linux/fs.h	2007-02-09 14:27:01.000000000 -0800
@@ -1657,8 +1657,6 @@ extern const struct file_operations read
 extern const struct file_operations write_fifo_fops;
 extern const struct file_operations rdwr_fifo_fops;
 
-extern int fs_may_remount_ro(struct super_block *);
-
 #ifdef CONFIG_BLOCK
 /*
  * return READ, READA, or WRITE
_

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 01/22] filesystem helpers for custom 'struct file's
  2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
                   ` (20 preceding siblings ...)
  2007-02-09 22:53 ` [PATCH 22/22] kill open files traverse on remount ro Dave Hansen
@ 2007-02-09 23:18 ` Andrew Morton
  21 siblings, 0 replies; 33+ messages in thread
From: Andrew Morton @ 2007-02-09 23:18 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, hch

On Fri, 09 Feb 2007 14:53:29 -0800
Dave Hansen <hansendc@us.ibm.com> wrote:

> +/*
> + * Note: This is a crappy interface.  It is here to make
> + * merging with the existing users of get_empty_filp()
> + * who have complex failure logic easier.  All users
> + * of this should be moving to alloc_file().
> + */
> +int init_file(struct file *file, struct vfsmount *mnt,
> +	   struct dentry *dentry, mode_t mode,
> +	   const struct file_operations *fop)

crappy name too ;)  At least two filesystems have defined their own
static-scope init_file() and so they'll explode if they somehow manage
to include file.h.

I guess we can cross that bridge when we fall off it, but sometime it might be
prudent to do s/init_file/configfs_init_file/ and ditto sysfs_init_file.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 21/22] honor r/w changes at do_remount() time
  2007-02-09 22:53 ` [PATCH 21/22] honor r/w changes at do_remount() time Dave Hansen
@ 2007-02-09 23:22   ` Andrew Morton
  2007-02-10  0:00     ` Dave Hansen
                       ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Andrew Morton @ 2007-02-09 23:22 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, hch

On Fri, 09 Feb 2007 14:53:44 -0800
Dave Hansen <hansendc@us.ibm.com> wrote:

> This is the core of the read-only bind mount patch set.

Who wants read-only bind mounts, and for what reason?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 02/22] r/o bind mounts: add vfsmount writer counts
  2007-02-09 22:53 ` [PATCH 02/22] r/o bind mounts: add vfsmount writer counts Dave Hansen
@ 2007-02-09 23:41   ` Eric Dumazet
  2007-02-10  0:10     ` Dave Hansen
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2007-02-09 23:41 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, akpm, hch

Dave Hansen a écrit :

> @@ -56,6 +57,7 @@ struct vfsmount {
>  	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
> +	int mnt_writers;		/* nr files open for write */
>  	/*
>  	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
>  	 * to let these frequently modified fields in a separate cache line
> @@ -72,7 +74,26 @@ static inline struct vfsmount *mntget(st
>  		atomic_inc(&mnt->mnt_count);
>  	return mnt;

Dave, please read again this comment in struct vfsmount definition.

If I understand your infrastructure, mnt_writers is going to be frequently 
modified, so it should be placed at the end of struct vfsmount, in the same 
cache line than mnt_count.

Thank you
Eric


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 21/22] honor r/w changes at do_remount() time
  2007-02-09 23:22   ` Andrew Morton
@ 2007-02-10  0:00     ` Dave Hansen
  2007-02-10  0:29     ` Anton Altaparmakov
  2007-02-10  9:54     ` Jan Engelhardt
  2 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-10  0:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch, Dave Hansen

On Fri, 2007-02-09 at 15:22 -0800, Andrew Morton wrote:
> On Fri, 09 Feb 2007 14:53:44 -0800
> Dave Hansen <hansendc@us.ibm.com> wrote:
> 
> > This is the core of the read-only bind mount patch set.
> 
> Who wants read-only bind mounts, and for what reason?

The original desire came out of the linux-vserver project.  It allows a
sysadmin to share directories between many vservers/containers and keep
those containers from writing to it, even though the users in that
vserver may have "root" privileges.

This also has the advantage of cleaning up the somewhat hackish "look
for writable-open-files during remount/ro operations".  It should also
allow us to separate the concepts of the user wanting a filesystem to be
r/o and the filesystem _itself_ being r/o because of a r/o device or
some kind of corruption.

-- Dave


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 02/22] r/o bind mounts: add vfsmount writer counts
  2007-02-09 23:41   ` Eric Dumazet
@ 2007-02-10  0:10     ` Dave Hansen
  0 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-10  0:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel, akpm, hch, David C. Hansen [imap]

On Sat, 2007-02-10 at 00:41 +0100, Eric Dumazet wrote:
> Dave, please read again this comment in struct vfsmount definition.
> 
> If I understand your infrastructure, mnt_writers is going to be frequently
> modified, so it should be placed at the end of struct vfsmount, in the same
> cache line than mnt_count.

That's an excellent point, thanks for catching it.  Here's an updated
patch.

-- Dave

This patch actually adds the mount and superblock writer
counts, and the mnt_want/drop_write() functions that use
them.

Before these can become useful, we must first cover each
place in the VFS where writes are performed with a
want/drop pair.  When that is complete, we can actually
introduce code that will safely check the counts before
allowing r/w<->r/o transitions to occur.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namespace.c        |   53 +++++++++++++++++++++++++++++++++++++++++
 lxc-dave/fs/super.c            |   18 ++++++++++---
 lxc-dave/include/linux/fs.h    |    2 +
 lxc-dave/include/linux/mount.h |   28 +++++++++++++++++++--
 4 files changed, 94 insertions(+), 7 deletions(-)

diff -puN fs/namespace.c~03-24-add-vfsmount-writer-count fs/namespace.c
--- lxc/fs/namespace.c~03-24-add-vfsmount-writer-count	2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/fs/namespace.c	2007-02-09 16:04:40.000000000 -0800
@@ -58,6 +58,7 @@ struct vfsmount *alloc_vfsmnt(const char
 	if (mnt) {
 		mnt->mnt_user_ns = get_user_ns(current->nsproxy->user_ns);
 		atomic_set(&mnt->mnt_count, 1);
+		mnt->mnt_writers = 0;
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -78,6 +79,56 @@ struct vfsmount *alloc_vfsmnt(const char
 	return mnt;
 }
 
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	WARN_ON(__mnt_is_readonly(mnt));
+
+	/*
+	 * This flag set is actually redundant with what
+	 * happens in do_remount(), but since we do this
+	 * under the lock, anyone attempting to get a write
+	 * on it after this will fail.
+	 */
+	spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+	if (!mnt->mnt_writers)
+		mnt->mnt_flags |= MNT_READONLY;
+	else
+		ret = -EBUSY;
+	spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+	return ret;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+	if (mnt->mnt_writers)
+		goto out;
+
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	mnt->mnt_sb->s_writers++;
+	mnt->mnt_writers++;
+out:
+	spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+	mnt->mnt_sb->s_writers--;
+	mnt->mnt_writers--;
+	spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
@@ -1415,6 +1466,8 @@ long do_mount(char *dev_name, char *dir_
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
 
 	/* Separate the per-mountpoint flags */
+	if (flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
 	if (flags & MS_NOSUID)
 		mnt_flags |= MNT_NOSUID;
 	if (flags & MS_NODEV)
diff -puN fs/super.c~03-24-add-vfsmount-writer-count fs/super.c
--- lxc/fs/super.c~03-24-add-vfsmount-writer-count	2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/fs/super.c	2007-02-09 16:04:40.000000000 -0800
@@ -93,6 +93,8 @@ static struct super_block *alloc_super(s
 		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		s->s_writers = 0;
+		spin_lock_init(&s->s_mnt_writers_lock);
 	}
 out:
 	return s;
@@ -576,6 +578,11 @@ static void mark_files_ro(struct super_b
 	file_list_unlock();
 }
 
+static int sb_remount_ro(struct super_block *sb)
+{
+	return fs_may_remount_ro(sb);
+}
+
 /**
  *	do_remount_sb - asks filesystem to change mount options.
  *	@sb:	superblock in question
@@ -587,7 +594,8 @@ static void mark_files_ro(struct super_b
  */
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
-	int retval;
+	int retval = 0;
+	int sb_started_ro = (sb->s_flags & MS_RDONLY);
 	
 #ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
@@ -600,11 +608,13 @@ int do_remount_sb(struct super_block *sb
 
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+	if ((flags & MS_RDONLY) && !sb_started_ro) {
 		if (force)
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
-			return -EBUSY;
+		else
+			retval = sb_remount_ro(sb);
+		if (retval)
+			return retval;
 	}
 
 	if (sb->s_op->remount_fs) {
diff -puN include/linux/fs.h~03-24-add-vfsmount-writer-count include/linux/fs.h
--- lxc/include/linux/fs.h~03-24-add-vfsmount-writer-count	2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/include/linux/fs.h	2007-02-09 16:04:40.000000000 -0800
@@ -972,6 +972,8 @@ struct super_block {
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_files;
+	int			s_writers;	/* number of files open for write */
+	spinlock_t		s_mnt_writers_lock; /* taken when mounts change rw state */
 
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
diff -puN include/linux/mount.h~03-24-add-vfsmount-writer-count include/linux/mount.h
--- lxc/include/linux/mount.h~03-24-add-vfsmount-writer-count	2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/include/linux/mount.h	2007-02-09 16:07:28.000000000 -0800
@@ -29,6 +29,7 @@ struct user_namespace;
 #define MNT_NOATIME	0x08
 #define MNT_NODIRATIME	0x10
 #define MNT_RELATIME	0x20
+#define MNT_READONLY	0x40 /* does the user want this to be r/o? */
 
 #define MNT_SHRINKABLE	0x100
 
@@ -57,12 +58,14 @@ struct vfsmount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
 	/*
-	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
-	 * to let these frequently modified fields in a separate cache line
-	 * (so that reads of mnt_flags wont ping-pong on SMP machines)
+	 * We put mnt_count, mnt_expiry_mark, and mnt_writers at the end of
+	 * struct vfsmount to let these frequently modified fields in a
+	 * separate cache line (so that reads of mnt_flags wont ping-pong
+	 * on SMP machines)
 	 */
 	atomic_t mnt_count;
 	int mnt_expiry_mark;		/* true if marked for expiry */
+	int mnt_writers;		/* nr files open for write */
 	int mnt_pinned;
 };
 
@@ -72,7 +75,26 @@ static inline struct vfsmount *mntget(st
 		atomic_inc(&mnt->mnt_count);
 	return mnt;
 }
+/*
+ * This is temporary for now.  We also don't want to check
+ * the SB in because it is already checked in other
+ * code paths.  We'll have a better way to do this in
+ * the end of this series
+ */
+static inline int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return mnt->mnt_flags & MNT_READONLY;
+}
+
+static inline void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+	WARN_ON(!__mnt_is_readonly(mnt));
+	mnt->mnt_flags &= ~MNT_READONLY;
+}
 
+extern int mnt_make_readonly(struct vfsmount *mnt);
+extern int mnt_want_write(struct vfsmount *mnt);
+extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mntput_no_expire(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
 extern void mnt_unpin(struct vfsmount *mnt);
_



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 21/22] honor r/w changes at do_remount() time
  2007-02-09 23:22   ` Andrew Morton
  2007-02-10  0:00     ` Dave Hansen
@ 2007-02-10  0:29     ` Anton Altaparmakov
  2007-02-10  9:54     ` Jan Engelhardt
  2 siblings, 0 replies; 33+ messages in thread
From: Anton Altaparmakov @ 2007-02-10  0:29 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Dave Hansen, linux-kernel, hch

On 9 Feb 2007, at 23:22, Andrew Morton wrote:
> On Fri, 09 Feb 2007 14:53:44 -0800
> Dave Hansen <hansendc@us.ibm.com> wrote:
>
>> This is the core of the read-only bind mount patch set.
>
> Who wants read-only bind mounts, and for what reason?

On our local mirror server (mirrors just under 3TiB worth of stuff)  
we hold all data on r/w mounted storage in a private location in the  
file tree.  (Note the server runs Solaris 10 not Linux or the  
following would not be possible at present...)

We then bind mount (i.e. loopback mount on Solaris) various  
directories from inside the private paths to various other locations  
so for example we create /export/ftp/pub/* where "*" are directories  
we want to export via FTP and we do all of those as read-only bind  
mounts.  This gives us that little bit of extra confidence that no- 
one from the outside can cause any writes to happen to our mirrored  
data.  We do similar for NFS by creating lots of read-only bind  
mounts in /* that again point into the private locations.

It would be nice if the Linux box that we have that is a copy/backup  
of the Solaris box could do the same rather than have all the bind  
mounts be read-write because we need the storage in the private  
locations to be writable.

Best regards,

	Anton

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 21/22] honor r/w changes at do_remount() time
  2007-02-09 23:22   ` Andrew Morton
  2007-02-10  0:00     ` Dave Hansen
  2007-02-10  0:29     ` Anton Altaparmakov
@ 2007-02-10  9:54     ` Jan Engelhardt
  2 siblings, 0 replies; 33+ messages in thread
From: Jan Engelhardt @ 2007-02-10  9:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Dave Hansen, linux-kernel, hch


On Feb 9 2007 15:22, Andrew Morton wrote:
>On Fri, 09 Feb 2007 14:53:44 -0800
>Dave Hansen <hansendc@us.ibm.com> wrote:
>
>> This is the core of the read-only bind mount patch set.
>
>Who wants read-only bind mounts, and for what reason?

And another case could be, that some application modifies ~/.xyz, but the user
(with root's help) does not want that:

  mount --bind -r ~/.xyz ~/.xyz

chmoding out the w bits does not always work, as programs might tamper with the
permissions of ~/.xyz itself... so a ro mount seems to be best.


Jan
-- 
ft: http://freshmeat.net/p/chaostables/

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/22] elevate write count files are open()ed
  2007-02-09 22:53 ` [PATCH 12/22] elevate write count files are open()ed Dave Hansen
@ 2007-02-13  5:11   ` Andrew Morton
  2007-02-13 16:58     ` Dave Hansen
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew Morton @ 2007-02-13  5:11 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, hch

On Fri, 09 Feb 2007 14:53:37 -0800 Dave Hansen <hansendc@us.ibm.com> wrote:

> diff -puN fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/file_table.c
> --- lxc/fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
> +++ lxc-dave/fs/file_table.c	2007-02-09 14:26:54.000000000 -0800
> @@ -209,8 +209,11 @@ void fastcall __fput(struct file *file)
>  	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
>  		cdev_put(inode->i_cdev);
>  	fops_put(file->f_op);
> -	if (file->f_mode & FMODE_WRITE)
> +	if (file->f_mode & FMODE_WRITE) {
>  		put_write_access(inode);
> +		if(!special_file(inode->i_mode))
> +			mnt_drop_write(mnt);
> +	}
>  	put_pid(file->f_owner.pid);
>  	put_user_ns(file->f_owner.user_ns);
>  	file_kill(file);
> diff -puN fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/namei.c
> --- lxc/fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
> +++ lxc-dave/fs/namei.c	2007-02-09 14:26:54.000000000 -0800
> @@ -1548,8 +1548,17 @@ int may_open(struct nameidata *nd, int a
>  			return -EACCES;
>  
>  		flag &= ~O_TRUNC;
> -	} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
> -		return -EROFS;
> +	} else if (flag & FMODE_WRITE) {
> +		/*
> +		 * effectively: !special_file()
> +		 * balanced by __fput()
> +		 */
> +		error = mnt_want_write(nd->mnt);
> +		if (error)
> +			return error;
> +		if (IS_RDONLY(inode))
> +			return -EROFS;
> +	}

yipes.  A new mount-wide spin_lock/unlock for each for-writing open() and close().
Can we have a microbenchmark on this please?

Are you sure that fget_light() and fput_light() don't accidentally bypass this
new logic?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/22] elevate write count files are open()ed
  2007-02-13  5:11   ` Andrew Morton
@ 2007-02-13 16:58     ` Dave Hansen
  2007-02-13 17:58       ` Andrew Morton
  0 siblings, 1 reply; 33+ messages in thread
From: Dave Hansen @ 2007-02-13 16:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch

On Mon, 2007-02-12 at 21:11 -0800, Andrew Morton wrote:
> On Fri, 09 Feb 2007 14:53:37 -0800 Dave Hansen <hansendc@us.ibm.com> wrote:
> 
> > diff -puN fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/file_table.c
> > --- lxc/fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
> > +++ lxc-dave/fs/file_table.c	2007-02-09 14:26:54.000000000 -0800
> > @@ -209,8 +209,11 @@ void fastcall __fput(struct file *file)
> >  	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
> >  		cdev_put(inode->i_cdev);
> >  	fops_put(file->f_op);
> > -	if (file->f_mode & FMODE_WRITE)
> > +	if (file->f_mode & FMODE_WRITE) {
> >  		put_write_access(inode);
> > +		if(!special_file(inode->i_mode))
> > +			mnt_drop_write(mnt);
> > +	}
> >  	put_pid(file->f_owner.pid);
> >  	put_user_ns(file->f_owner.user_ns);
> >  	file_kill(file);
> > diff -puN fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/namei.c
> > --- lxc/fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed	2007-02-09 14:26:54.000000000 -0800
> > +++ lxc-dave/fs/namei.c	2007-02-09 14:26:54.000000000 -0800
> > @@ -1548,8 +1548,17 @@ int may_open(struct nameidata *nd, int a
> >  			return -EACCES;
> >  
> >  		flag &= ~O_TRUNC;
> > -	} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
> > -		return -EROFS;
> > +	} else if (flag & FMODE_WRITE) {
> > +		/*
> > +		 * effectively: !special_file()
> > +		 * balanced by __fput()
> > +		 */
> > +		error = mnt_want_write(nd->mnt);
> > +		if (error)
> > +			return error;
> > +		if (IS_RDONLY(inode))
> > +			return -EROFS;
> > +	}
> 
> yipes.  A new mount-wide spin_lock/unlock for each for-writing open() and close().
> Can we have a microbenchmark on this please?

Yeah, I'll schedule some dbench time on a NUMA machine.

> Are you sure that fget_light() and fput_light() don't accidentally bypass this
> new logic?

Pretty sure.  My code actually surrounds all of the permission() checks
in the VFS.  To even use fget, you had to get a fd at some point, and to
do that you have to go through open, where both the mount and normal
filesystem checks are.

Is there something particular you had in mind?

-- Dave


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/22] elevate write count files are open()ed
  2007-02-13 16:58     ` Dave Hansen
@ 2007-02-13 17:58       ` Andrew Morton
  2007-02-14  0:17         ` Dave Hansen
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew Morton @ 2007-02-13 17:58 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, hch

> On Tue, 13 Feb 2007 08:58:16 -0800 Dave Hansen <hansendc@us.ibm.com> wrote:
> > yipes.  A new mount-wide spin_lock/unlock for each for-writing open() and close().
> > Can we have a microbenchmark on this please?
> 
> Yeah, I'll schedule some dbench time on a NUMA machine.

dbench doesn't do open() a lot.  To assess the worst-case we'd need one
process per cpu camping in an open/close loop.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/22] elevate write count files are open()ed
  2007-02-13 17:58       ` Andrew Morton
@ 2007-02-14  0:17         ` Dave Hansen
  0 siblings, 0 replies; 33+ messages in thread
From: Dave Hansen @ 2007-02-14  0:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch

On Tue, 2007-02-13 at 09:58 -0800, Andrew Morton wrote:
> > On Tue, 13 Feb 2007 08:58:16 -0800 Dave Hansen <hansendc@us.ibm.com> wrote:
> > > yipes.  A new mount-wide spin_lock/unlock for each for-writing open() and close().
> > > Can we have a microbenchmark on this please?
> > 
> > Yeah, I'll schedule some dbench time on a NUMA machine.
> 
> dbench doesn't do open() a lot.  To assess the worst-case we'd need one
> process per cpu camping in an open/close loop.

This is definitely a worst-case scenario.  A 32-way x86_64 NUMA machine
(with a pretty crappy interconnect) with a process-per-cpu all beating
on the same filesystem.  

no patch:
real: 30.111s
user: 0.031s
 sys: 2.685s

r/o bind mount patch:
real: 48.359s
user: 0.146s
 sys: 47.984s

It definitely makes a huge difference in system time, although not a
fatal one.  Christoph, what do you think?  Back to caching the
superblock flag in the mount?

#!/bin/bash
# go.sh - build openbench and run one CPU-pinned instance per processor,
# five times over, saving each instance's timing output to its own file.
#
# bash is required: the C-style "for ((...))" loops below are not part of
# POSIX sh.
name=$(uname -r)
# Mount a ramfs on /mnt/ram unless /proc/mounts already lists that path.
grep -q /mnt/ram /proc/mounts || mount -t ramfs ram /mnt/ram
# There is nothing to measure if the benchmark binary failed to build.
make openbench || exit 1
nr_cpus=$(grep -c ^processor /proc/cpuinfo)
for ((run = 0; run < 5; run++)); do
        dir="$name.run.$run"
        mkdir -p "$dir"
        for ((i = 0; i < nr_cpus; i++)); do
                # Instance $i is pinned to CPU $i and runs in the background.
                { time taskset -c "$i" ./openbench $((1 << 16)) & } \
                        > "$dir/openbench.time.$i" 2>&1
        done
        # Let every background instance finish before the next run starts.
        wait
        echo "run $run done"
done

// openbench.c
#include <stdio.h>
#include <stdlib.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * openbench: open-for-write / write / close microbenchmark.
 *
 * Usage: openbench <loops>
 *
 * Repeatedly opens (creating on first use) a per-process file named
 * /mnt/ram/openbench.<pid> for writing, writes three bytes to it and
 * closes it again, <loops> times.  The file is unlinked at the end.
 *
 * Returns 0 on success and non-zero on a usage error or when open()
 * or write() fails.
 */
int main(int argc, char **argv)
{
        static const char payload[] = "foo";
        const size_t payload_len = sizeof(payload) - 1;
        char buf[100];
        char *end;
        long loops;
        long i;
        int ret;
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <loops>\n", argv[0]);
                return EXIT_FAILURE;
        }

        /* Reject non-numeric or negative loop counts instead of guessing. */
        loops = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || loops < 0) {
                fprintf(stderr, "invalid loop count: %s\n", argv[1]);
                return EXIT_FAILURE;
        }

        /* One file per process so the instances do not share an inode. */
        snprintf(buf, sizeof(buf), "/mnt/ram/openbench.%d", (int)getpid());

        for (i = 0; i < loops; i++) {
                /* O_CREAT requires an explicit mode argument. */
                fd = open(buf, O_WRONLY | O_CREAT, 0644);
                if (fd < 0) {
                        perror("open error");
                        return EXIT_FAILURE;
                }
                if (write(fd, payload, payload_len) != (ssize_t)payload_len) {
                        perror("write error");
                        close(fd);
                        return EXIT_FAILURE;
                }
                close(fd);
        }

        /* With zero loops the file was never created; unlink reports that. */
        ret = unlink(buf);
        if (ret)
                perror("unlink error");

        return EXIT_SUCCESS;
}

-- Dave


^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2007-02-14  0:17 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-09 22:53 [PATCH 01/22] filesystem helpers for custom 'struct file's Dave Hansen
2007-02-09 22:53 ` [PATCH 02/22] r/o bind mounts: add vfsmount writer counts Dave Hansen
2007-02-09 23:41   ` Eric Dumazet
2007-02-10  0:10     ` Dave Hansen
2007-02-09 22:53 ` [PATCH 03/22] record when sb_writer_count elevated for inode Dave Hansen
2007-02-09 22:53 ` [PATCH 04/22] elevate writer count for chown and friends Dave Hansen
2007-02-09 22:53 ` [PATCH 05/22] elevate mnt writers for callers of vfs_mkdir() Dave Hansen
2007-02-09 22:53 ` [PATCH 06/22] elevate write count during entire ncp_ioctl() Dave Hansen
2007-02-09 22:53 ` [PATCH 07/22] elevate write count for link and symlink calls Dave Hansen
2007-02-09 22:53 ` [PATCH 08/22] elevate mount count for extended attributes Dave Hansen
2007-02-09 22:53 ` [PATCH 09/22] mount_is_safe(): add comment Dave Hansen
2007-02-09 22:53 ` [PATCH 10/22] unix_find_other() elevate write count for touch_atime() Dave Hansen
2007-02-09 22:53 ` [PATCH 12/22] elevate write count files are open()ed Dave Hansen
2007-02-13  5:11   ` Andrew Morton
2007-02-13 16:58     ` Dave Hansen
2007-02-13 17:58       ` Andrew Morton
2007-02-14  0:17         ` Dave Hansen
2007-02-09 22:53 ` [PATCH 11/22] elevate write count over calls to vfs_rename() Dave Hansen
2007-02-09 22:53 ` [PATCH 13/22] elevate writer count for do_sys_truncate() Dave Hansen
2007-02-09 22:53 ` [PATCH 14/22] elevate write count for do_utimes() Dave Hansen
2007-02-09 22:53 ` [PATCH 15/22] elevate write count for do_sys_utime() and touch_atime() Dave Hansen
2007-02-09 22:53 ` [PATCH 16/22] sys_mknodat(): elevate write count for vfs_mknod/create() Dave Hansen
2007-02-09 22:53 ` [PATCH 17/22] elevate mnt writers for vfs_unlink() callers Dave Hansen
2007-02-09 22:53 ` [PATCH 18/22] do_rmdir(): elevate write count Dave Hansen
2007-02-09 22:53 ` [PATCH 19/22] elevate writer count for custom struct_file Dave Hansen
2007-02-09 22:53 ` [PATCH 20/22] [PATCH] gfs: check nlink count Dave Hansen
2007-02-09 22:53 ` [PATCH 21/22] honor r/w changes at do_remount() time Dave Hansen
2007-02-09 23:22   ` Andrew Morton
2007-02-10  0:00     ` Dave Hansen
2007-02-10  0:29     ` Anton Altaparmakov
2007-02-10  9:54     ` Jan Engelhardt
2007-02-09 22:53 ` [PATCH 22/22] kill open files traverse on remount ro Dave Hansen
2007-02-09 23:18 ` [PATCH 01/22] filesystem helpers for custom 'struct file's Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.