* [PATCH] ext4: fix how i_version is modified and turn it on by default V2
@ 2012-05-15 14:33 ` Josef Bacik
0 siblings, 0 replies; 20+ messages in thread
From: Josef Bacik @ 2012-05-15 14:33 UTC (permalink / raw)
To: linux-ext4-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, bfields-uC3wQj2KruNg9hUCZPvPmw,
adilger-m1MBpc4rdrD3fQ9qLvQP4Q, tytso-3s7WtUTddSA,
jack-AlSwsSmVLrQ
This makes MS_I_VERSION be turned on by default. Ext4 had been
unconditionally doing i_version++ in a few cases anway so the mount option
was kind of silly. This patch also removes the update in mark_inode_dirty
and makes all of the cases where we update ctime also do inode_inc_iversion.
file_update_time takes care of the write case and all the places where we
update iversion are protected by the i_mutex so there should be no extra
i_lock overhead in the normal non-exported fs case. Thanks,
Signed-off-by: Josef Bacik <josef-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
V1->V2: introduce ext4_inode_update_time helper to update ctime/time and
i_version to make the code a little easier and make it less fragile.
fs/ext4/acl.c | 2 +-
fs/ext4/ext4.h | 16 ++++++++++++++++
fs/ext4/extents.c | 13 ++++---------
fs/ext4/indirect.c | 2 +-
fs/ext4/inode.c | 3 ---
fs/ext4/ioctl.c | 4 ++--
fs/ext4/namei.c | 24 ++++++++++--------------
fs/ext4/super.c | 6 +++---
fs/ext4/xattr.c | 2 +-
9 files changed, 38 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index a5c29bb..c445d3a 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -202,7 +202,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
if (error < 0)
return error;
else {
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
ext4_mark_inode_dirty(handle, inode);
if (error == 0)
acl = NULL;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004f..9ff4bdb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1276,6 +1276,22 @@ static inline struct timespec ext4_current_time(struct inode *inode)
current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
}
+enum time_update {
+ MTIME = 1,
+ CTIME = 2,
+};
+
+static inline void ext4_inode_update_time(struct inode *inode, int flags)
+{
+ inode_inc_iversion(inode);
+
+ if (flags & MTIME)
+ inode->i_mtime = ext4_current_time(inode);
+ if (flags & CTIME)
+ inode->i_ctime = ext4_current_time(inode);
+}
+
+
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c2..5c2b4cf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4249,7 +4249,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
}
@@ -4257,13 +4257,8 @@ out_stop:
static void ext4_falloc_update_inode(struct inode *inode,
int mode, loff_t new_size, int update_ctime)
{
- struct timespec now;
-
- if (update_ctime) {
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
- }
+ if (update_ctime)
+ ext4_inode_update_time(inode, CTIME);
/*
* Update only when preallocation was requested beyond
* the file size.
@@ -4905,7 +4900,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
out:
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
return err;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..a63a4ad 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1476,7 +1476,7 @@ do_indirects:
out_unlock:
up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
/*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82f..d11b4e5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4314,9 +4314,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
- inode_inc_iversion(inode);
-
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eee255..a162faf 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -120,7 +120,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ei->i_flags = flags;
ext4_set_inode_flags(inode);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -168,7 +168,7 @@ flags_out:
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f48..c8417e6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1316,9 +1316,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_inode_update_time(dir, CTIME | MTIME);
ext4_update_dx_flag(dir);
- dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, dir, bh);
@@ -1668,7 +1667,6 @@ static int ext4_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
- dir->i_version++;
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, dir, bh);
if (unlikely(err)) {
@@ -2158,14 +2156,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_warning(inode->i_sb,
"empty directory has too many links (%d)",
inode->i_nlink);
- inode->i_version++;
clear_nlink(inode);
/* There's no need to set i_disksize: the fact that i_nlink is
* zero will ensure that the right thing happens during any
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
+ ext4_inode_update_time(dir, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_dec_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -2218,13 +2216,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
- dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+ ext4_inode_update_time(dir, MTIME | CTIME);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
ext4_mark_inode_dirty(handle, inode);
retval = 0;
@@ -2363,7 +2361,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
ext4_inc_count(handle, inode);
ihold(inode);
@@ -2470,9 +2468,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
EXT4_FEATURE_INCOMPAT_FILETYPE))
new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- new_dir->i_ctime = new_dir->i_mtime =
- ext4_current_time(new_dir);
+ ext4_inode_update_time(new_dir, CTIME | MTIME);
ext4_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
@@ -2488,7 +2484,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = ext4_current_time(old_inode);
+ ext4_inode_update_time(old_inode, CTIME);
ext4_mark_inode_dirty(handle, old_inode);
/*
@@ -2521,9 +2517,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode) {
ext4_dec_count(handle, new_inode);
- new_inode->i_ctime = ext4_current_time(new_inode);
+ ext4_inode_update_time(new_inode, CTIME);
}
- old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
+ ext4_inode_update_time(old_dir, CTIME | MTIME);
ext4_update_dx_flag(old_dir);
if (dir_bh) {
PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61f..98d6863 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1813,8 +1813,7 @@ set_qf_format:
"Ignoring deprecated bh option");
break;
case Opt_i_version:
- set_opt(sb, I_VERSION);
- sb->s_flags |= MS_I_VERSION;
+ /* On by default now */
break;
case Opt_nodelalloc:
clear_opt(sb, DELALLOC);
@@ -3132,6 +3131,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_orig;
}
sb->s_fs_info = sbi;
+ sb->s_flags |= MS_I_VERSION;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT4_DEF_RESUID;
sbi->s_resgid = EXT4_DEF_RESGID;
@@ -4831,7 +4831,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle))
goto out;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d8..e8bcc6f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1048,7 +1048,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
if (!value)
ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
--
1.7.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH] ext4: fix how i_version is modified and turn it on by default V2
@ 2012-05-15 14:33 ` Josef Bacik
0 siblings, 0 replies; 20+ messages in thread
From: Josef Bacik @ 2012-05-15 14:33 UTC (permalink / raw)
To: linux-ext4, linux-nfs, bfields, adilger, tytso, jack
This makes MS_I_VERSION be turned on by default. Ext4 had been
unconditionally doing i_version++ in a few cases anway so the mount option
was kind of silly. This patch also removes the update in mark_inode_dirty
and makes all of the cases where we update ctime also do inode_inc_iversion.
file_update_time takes care of the write case and all the places where we
update iversion are protected by the i_mutex so there should be no extra
i_lock overhead in the normal non-exported fs case. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
---
V1->V2: introduce ext4_inode_update_time helper to update ctime/time and
i_version to make the code a little easier and make it less fragile.
fs/ext4/acl.c | 2 +-
fs/ext4/ext4.h | 16 ++++++++++++++++
fs/ext4/extents.c | 13 ++++---------
fs/ext4/indirect.c | 2 +-
fs/ext4/inode.c | 3 ---
fs/ext4/ioctl.c | 4 ++--
fs/ext4/namei.c | 24 ++++++++++--------------
fs/ext4/super.c | 6 +++---
fs/ext4/xattr.c | 2 +-
9 files changed, 38 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index a5c29bb..c445d3a 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -202,7 +202,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
if (error < 0)
return error;
else {
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
ext4_mark_inode_dirty(handle, inode);
if (error == 0)
acl = NULL;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004f..9ff4bdb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1276,6 +1276,22 @@ static inline struct timespec ext4_current_time(struct inode *inode)
current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
}
+enum time_update {
+ MTIME = 1,
+ CTIME = 2,
+};
+
+static inline void ext4_inode_update_time(struct inode *inode, int flags)
+{
+ inode_inc_iversion(inode);
+
+ if (flags & MTIME)
+ inode->i_mtime = ext4_current_time(inode);
+ if (flags & CTIME)
+ inode->i_ctime = ext4_current_time(inode);
+}
+
+
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c2..5c2b4cf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4249,7 +4249,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
}
@@ -4257,13 +4257,8 @@ out_stop:
static void ext4_falloc_update_inode(struct inode *inode,
int mode, loff_t new_size, int update_ctime)
{
- struct timespec now;
-
- if (update_ctime) {
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
- }
+ if (update_ctime)
+ ext4_inode_update_time(inode, CTIME);
/*
* Update only when preallocation was requested beyond
* the file size.
@@ -4905,7 +4900,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
out:
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
return err;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..a63a4ad 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1476,7 +1476,7 @@ do_indirects:
out_unlock:
up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
/*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82f..d11b4e5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4314,9 +4314,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
- inode_inc_iversion(inode);
-
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eee255..a162faf 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -120,7 +120,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ei->i_flags = flags;
ext4_set_inode_flags(inode);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -168,7 +168,7 @@ flags_out:
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f48..c8417e6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1316,9 +1316,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_inode_update_time(dir, CTIME | MTIME);
ext4_update_dx_flag(dir);
- dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, dir, bh);
@@ -1668,7 +1667,6 @@ static int ext4_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
- dir->i_version++;
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, dir, bh);
if (unlikely(err)) {
@@ -2158,14 +2156,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_warning(inode->i_sb,
"empty directory has too many links (%d)",
inode->i_nlink);
- inode->i_version++;
clear_nlink(inode);
/* There's no need to set i_disksize: the fact that i_nlink is
* zero will ensure that the right thing happens during any
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
+ ext4_inode_update_time(dir, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_dec_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -2218,13 +2216,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
- dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+ ext4_inode_update_time(dir, MTIME | CTIME);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
ext4_mark_inode_dirty(handle, inode);
retval = 0;
@@ -2363,7 +2361,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
ext4_inc_count(handle, inode);
ihold(inode);
@@ -2470,9 +2468,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
EXT4_FEATURE_INCOMPAT_FILETYPE))
new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- new_dir->i_ctime = new_dir->i_mtime =
- ext4_current_time(new_dir);
+ ext4_inode_update_time(new_dir, CTIME | MTIME);
ext4_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
@@ -2488,7 +2484,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = ext4_current_time(old_inode);
+ ext4_inode_update_time(old_inode, CTIME);
ext4_mark_inode_dirty(handle, old_inode);
/*
@@ -2521,9 +2517,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode) {
ext4_dec_count(handle, new_inode);
- new_inode->i_ctime = ext4_current_time(new_inode);
+ ext4_inode_update_time(new_inode, CTIME);
}
- old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
+ ext4_inode_update_time(old_dir, CTIME | MTIME);
ext4_update_dx_flag(old_dir);
if (dir_bh) {
PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61f..98d6863 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1813,8 +1813,7 @@ set_qf_format:
"Ignoring deprecated bh option");
break;
case Opt_i_version:
- set_opt(sb, I_VERSION);
- sb->s_flags |= MS_I_VERSION;
+ /* On by default now */
break;
case Opt_nodelalloc:
clear_opt(sb, DELALLOC);
@@ -3132,6 +3131,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_orig;
}
sb->s_fs_info = sbi;
+ sb->s_flags |= MS_I_VERSION;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT4_DEF_RESUID;
sbi->s_resgid = EXT4_DEF_RESGID;
@@ -4831,7 +4831,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle))
goto out;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ ext4_inode_update_time(inode, CTIME | MTIME);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d8..e8bcc6f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1048,7 +1048,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = ext4_current_time(inode);
+ ext4_inode_update_time(inode, CTIME);
if (!value)
ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
--
1.7.7.6
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 14:33 ` Josef Bacik
(?)
@ 2012-05-15 15:18 ` Jan Kara
-1 siblings, 0 replies; 20+ messages in thread
From: Jan Kara @ 2012-05-15 15:18 UTC (permalink / raw)
To: Josef Bacik; +Cc: linux-ext4, linux-nfs, bfields, adilger, tytso, jack
On Tue 15-05-12 10:33:16, Josef Bacik wrote:
> This makes MS_I_VERSION be turned on by default. Ext4 had been
> unconditionally doing i_version++ in a few cases anway so the mount option
> was kind of silly. This patch also removes the update in mark_inode_dirty
> and makes all of the cases where we update ctime also do inode_inc_iversion.
> file_update_time takes care of the write case and all the places where we
> update iversion are protected by the i_mutex so there should be no extra
> i_lock overhead in the normal non-exported fs case. Thanks,
>
> Signed-off-by: Josef Bacik <josef@redhat.com>
Looks good. You can add
Reviewed-by: Jan Kara <jack@suse.cz>
provided Bruce won't find some serious performance issue with having
i_version enabled by default...
Honza
> ---
> V1->V2: introduce ext4_inode_update_time helper to update ctime/time and
> i_version to make the code a little easier and make it less fragile.
>
> fs/ext4/acl.c | 2 +-
> fs/ext4/ext4.h | 16 ++++++++++++++++
> fs/ext4/extents.c | 13 ++++---------
> fs/ext4/indirect.c | 2 +-
> fs/ext4/inode.c | 3 ---
> fs/ext4/ioctl.c | 4 ++--
> fs/ext4/namei.c | 24 ++++++++++--------------
> fs/ext4/super.c | 6 +++---
> fs/ext4/xattr.c | 2 +-
> 9 files changed, 38 insertions(+), 34 deletions(-)
>
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index a5c29bb..c445d3a 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -202,7 +202,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
> if (error < 0)
> return error;
> else {
> - inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
> ext4_mark_inode_dirty(handle, inode);
> if (error == 0)
> acl = NULL;
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 513004f..9ff4bdb 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1276,6 +1276,22 @@ static inline struct timespec ext4_current_time(struct inode *inode)
> current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
> }
>
> +enum time_update {
> + MTIME = 1,
> + CTIME = 2,
> +};
> +
> +static inline void ext4_inode_update_time(struct inode *inode, int flags)
> +{
> + inode_inc_iversion(inode);
> +
> + if (flags & MTIME)
> + inode->i_mtime = ext4_current_time(inode);
> + if (flags & CTIME)
> + inode->i_ctime = ext4_current_time(inode);
> +}
> +
> +
> static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
> {
> return ino == EXT4_ROOT_INO ||
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 74f23c2..5c2b4cf 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4249,7 +4249,7 @@ out_stop:
> if (inode->i_nlink)
> ext4_orphan_del(handle, inode);
>
> - inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME | MTIME);
> ext4_mark_inode_dirty(handle, inode);
> ext4_journal_stop(handle);
> }
> @@ -4257,13 +4257,8 @@ out_stop:
> static void ext4_falloc_update_inode(struct inode *inode,
> int mode, loff_t new_size, int update_ctime)
> {
> - struct timespec now;
> -
> - if (update_ctime) {
> - now = current_fs_time(inode->i_sb);
> - if (!timespec_equal(&inode->i_ctime, &now))
> - inode->i_ctime = now;
> - }
> + if (update_ctime)
> + ext4_inode_update_time(inode, CTIME);
> /*
> * Update only when preallocation was requested beyond
> * the file size.
> @@ -4905,7 +4900,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
>
> out:
> ext4_orphan_del(handle, inode);
> - inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME | MTIME);
> ext4_mark_inode_dirty(handle, inode);
> ext4_journal_stop(handle);
> return err;
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 830e1b2..a63a4ad 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -1476,7 +1476,7 @@ do_indirects:
>
> out_unlock:
> up_write(&ei->i_data_sem);
> - inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME | MTIME);
> ext4_mark_inode_dirty(handle, inode);
>
> /*
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index feaa82f..d11b4e5 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4314,9 +4314,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
> {
> int err = 0;
>
> - if (test_opt(inode->i_sb, I_VERSION))
> - inode_inc_iversion(inode);
> -
> /* the do_update_inode consumes one bh->b_count */
> get_bh(iloc->bh);
>
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index 6eee255..a162faf 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -120,7 +120,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> ei->i_flags = flags;
>
> ext4_set_inode_flags(inode);
> - inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
>
> err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> flags_err:
> @@ -168,7 +168,7 @@ flags_out:
> }
> err = ext4_reserve_inode_write(handle, inode, &iloc);
> if (err == 0) {
> - inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
> inode->i_generation = generation;
> err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> }
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 2043f48..c8417e6 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -1316,9 +1316,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
> * happen is that the times are slightly out of date
> * and/or different from the directory change time.
> */
> - dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
> + ext4_inode_update_time(dir, CTIME | MTIME);
> ext4_update_dx_flag(dir);
> - dir->i_version++;
> ext4_mark_inode_dirty(handle, dir);
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> err = ext4_handle_dirty_metadata(handle, dir, bh);
> @@ -1668,7 +1667,6 @@ static int ext4_delete_entry(handle_t *handle,
> blocksize);
> else
> de->inode = 0;
> - dir->i_version++;
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> err = ext4_handle_dirty_metadata(handle, dir, bh);
> if (unlikely(err)) {
> @@ -2158,14 +2156,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
> ext4_warning(inode->i_sb,
> "empty directory has too many links (%d)",
> inode->i_nlink);
> - inode->i_version++;
> clear_nlink(inode);
> /* There's no need to set i_disksize: the fact that i_nlink is
> * zero will ensure that the right thing happens during any
> * recovery. */
> inode->i_size = 0;
> ext4_orphan_add(handle, inode);
> - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
> + ext4_inode_update_time(dir, CTIME | MTIME);
> ext4_mark_inode_dirty(handle, inode);
> ext4_dec_count(handle, dir);
> ext4_update_dx_flag(dir);
> @@ -2218,13 +2216,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
> retval = ext4_delete_entry(handle, dir, de, bh);
> if (retval)
> goto end_unlink;
> - dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
> + ext4_inode_update_time(dir, MTIME | CTIME);
> ext4_update_dx_flag(dir);
> ext4_mark_inode_dirty(handle, dir);
> drop_nlink(inode);
> if (!inode->i_nlink)
> ext4_orphan_add(handle, inode);
> - inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
> ext4_mark_inode_dirty(handle, inode);
> retval = 0;
>
> @@ -2363,7 +2361,7 @@ retry:
> if (IS_DIRSYNC(dir))
> ext4_handle_sync(handle);
>
> - inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
> ext4_inc_count(handle, inode);
> ihold(inode);
>
> @@ -2470,9 +2468,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
> if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
> EXT4_FEATURE_INCOMPAT_FILETYPE))
> new_de->file_type = old_de->file_type;
> - new_dir->i_version++;
> - new_dir->i_ctime = new_dir->i_mtime =
> - ext4_current_time(new_dir);
> + ext4_inode_update_time(new_dir, CTIME | MTIME);
> ext4_mark_inode_dirty(handle, new_dir);
> BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
> retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
> @@ -2488,7 +2484,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
> * Like most other Unix systems, set the ctime for inodes on a
> * rename.
> */
> - old_inode->i_ctime = ext4_current_time(old_inode);
> + ext4_inode_update_time(old_inode, CTIME);
> ext4_mark_inode_dirty(handle, old_inode);
>
> /*
> @@ -2521,9 +2517,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
>
> if (new_inode) {
> ext4_dec_count(handle, new_inode);
> - new_inode->i_ctime = ext4_current_time(new_inode);
> + ext4_inode_update_time(new_inode, CTIME);
> }
> - old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
> + ext4_inode_update_time(old_dir, CTIME | MTIME);
> ext4_update_dx_flag(old_dir);
> if (dir_bh) {
> PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 502c61f..98d6863 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1813,8 +1813,7 @@ set_qf_format:
> "Ignoring deprecated bh option");
> break;
> case Opt_i_version:
> - set_opt(sb, I_VERSION);
> - sb->s_flags |= MS_I_VERSION;
> + /* On by default now */
> break;
> case Opt_nodelalloc:
> clear_opt(sb, DELALLOC);
> @@ -3132,6 +3131,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> goto out_free_orig;
> }
> sb->s_fs_info = sbi;
> + sb->s_flags |= MS_I_VERSION;
> sbi->s_mount_opt = 0;
> sbi->s_resuid = EXT4_DEF_RESUID;
> sbi->s_resgid = EXT4_DEF_RESGID;
> @@ -4831,7 +4831,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
> handle = ext4_journal_start(inode, 1);
> if (IS_ERR(handle))
> goto out;
> - inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> + ext4_inode_update_time(inode, CTIME | MTIME);
> ext4_mark_inode_dirty(handle, inode);
> ext4_journal_stop(handle);
>
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 93a00d8..e8bcc6f 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1048,7 +1048,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
> }
> if (!error) {
> ext4_xattr_update_super_block(handle, inode->i_sb);
> - inode->i_ctime = ext4_current_time(inode);
> + ext4_inode_update_time(inode, CTIME);
> if (!value)
> ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
> error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
> --
> 1.7.7.6
>
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 14:33 ` Josef Bacik
(?)
(?)
@ 2012-05-15 17:53 ` Josef Bacik
[not found] ` <20120515175308.GB1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2012-05-16 1:36 ` Ted Ts'o
-1 siblings, 2 replies; 20+ messages in thread
From: Josef Bacik @ 2012-05-15 17:53 UTC (permalink / raw)
To: Josef Bacik; +Cc: linux-ext4, linux-nfs, bfields, adilger, tytso, jack
On Tue, May 15, 2012 at 10:33:16AM -0400, Josef Bacik wrote:
> This makes MS_I_VERSION be turned on by default. Ext4 had been
> unconditionally doing i_version++ in a few cases anway so the mount option
> was kind of silly. This patch also removes the update in mark_inode_dirty
> and makes all of the cases where we update ctime also do inode_inc_iversion.
> file_update_time takes care of the write case and all the places where we
> update iversion are protected by the i_mutex so there should be no extra
> i_lock overhead in the normal non-exported fs case. Thanks,
>
Ok did some basic benchmarking with dd, I ran
dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
3 times with the patch and without the patch. With the worst case scenario
there is about a 40% longer run time, going from on average 12 seconds to 17
seconds. With the other two runs they are the same runtime with the 1 megabyte
blocks. So the question is, do we care about this worst case since any sane
application developer isn't going to do writes that small? Thanks,
Josef
^ permalink raw reply [flat|nested] 20+ messages in thread
[parent not found: <20120515175308.GB1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 17:53 ` Josef Bacik
@ 2012-05-15 18:17 ` Boaz Harrosh
2012-05-16 1:36 ` Ted Ts'o
1 sibling, 0 replies; 20+ messages in thread
From: Boaz Harrosh @ 2012-05-15 18:17 UTC (permalink / raw)
To: Josef Bacik
Cc: linux-ext4-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, bfields-uC3wQj2KruNg9hUCZPvPmw,
adilger-m1MBpc4rdrD3fQ9qLvQP4Q, tytso-3s7WtUTddSA,
jack-AlSwsSmVLrQ
On 05/15/2012 08:53 PM, Josef Bacik wrote:
> On Tue, May 15, 2012 at 10:33:16AM -0400, Josef Bacik wrote:
>> This makes MS_I_VERSION be turned on by default. Ext4 had been
>> unconditionally doing i_version++ in a few cases anway so the mount option
>> was kind of silly. This patch also removes the update in mark_inode_dirty
>> and makes all of the cases where we update ctime also do inode_inc_ב.
>> file_update_time takes care of the write case and all the places where we
>> update iversion are protected by the i_mutex so there should be no extra
>> i_lock overhead in the normal non-exported fs case. Thanks,
>>
>
> Ok did some basic benchmarking with dd, I ran
>
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
>
> 3 times with the patch and without the patch. With the worst case scenario
> there is about a 40% longer run time, going from on average 12 seconds to 17
> seconds.
do you mean that the "with the patch" is 40% slower then "without the patch"
in the same "bs=1 count=10485760 test" ?
Then count me clueless. Do you understand this difference?
The way I read your patch the inode is copied and written to disk exactly the
same number of times, as before. Only that now i_version is also updated
together with ctime and/or mtime. What is the fundamental difference then?
Is it just that i_version++ in-memory operation?
> With the other two runs they are the same runtime with the 1 megabyte
> blocks. So the question is, do we care about this worst case since any sane
> application developer isn't going to do writes that small? Thanks,
> Josef
Thanks
Boaz
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
@ 2012-05-15 18:17 ` Boaz Harrosh
0 siblings, 0 replies; 20+ messages in thread
From: Boaz Harrosh @ 2012-05-15 18:17 UTC (permalink / raw)
To: Josef Bacik; +Cc: linux-ext4, linux-nfs, bfields, adilger, tytso, jack
On 05/15/2012 08:53 PM, Josef Bacik wrote:
> On Tue, May 15, 2012 at 10:33:16AM -0400, Josef Bacik wrote:
>> This makes MS_I_VERSION be turned on by default. Ext4 had been
>> unconditionally doing i_version++ in a few cases anway so the mount option
>> was kind of silly. This patch also removes the update in mark_inode_dirty
>> and makes all of the cases where we update ctime also do inode_inc_ב.
>> file_update_time takes care of the write case and all the places where we
>> update iversion are protected by the i_mutex so there should be no extra
>> i_lock overhead in the normal non-exported fs case. Thanks,
>>
>
> Ok did some basic benchmarking with dd, I ran
>
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
>
> 3 times with the patch and without the patch. With the worst case scenario
> there is about a 40% longer run time, going from on average 12 seconds to 17
> seconds.
do you mean that the "with the patch" is 40% slower then "without the patch"
in the same "bs=1 count=10485760 test" ?
Then count me clueless. Do you understand this difference?
The way I read your patch the inode is copied and written to disk exactly the
same number of times, as before. Only that now i_version is also updated
together with ctime and/or mtime. What is the fundamental difference then?
Is it just that i_version++ in-memory operation?
> With the other two runs they are the same runtime with the 1 megabyte
> blocks. So the question is, do we care about this worst case since any sane
> application developer isn't going to do writes that small? Thanks,
> Josef
Thanks
Boaz
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 18:17 ` Boaz Harrosh
@ 2012-05-15 18:29 ` Josef Bacik
-1 siblings, 0 replies; 20+ messages in thread
From: Josef Bacik @ 2012-05-15 18:29 UTC (permalink / raw)
To: Boaz Harrosh
Cc: Josef Bacik, linux-ext4, linux-nfs, bfields, adilger, tytso, jack
On Tue, May 15, 2012 at 09:17:16PM +0300, Boaz Harrosh wrote:
> On 05/15/2012 08:53 PM, Josef Bacik wrote:
>
> > On Tue, May 15, 2012 at 10:33:16AM -0400, Josef Bacik wrote:
> >> This makes MS_I_VERSION be turned on by default. Ext4 had been
> >> unconditionally doing i_version++ in a few cases anway so the mount option
> >> was kind of silly. This patch also removes the update in mark_inode_dirty
> >> and makes all of the cases where we update ctime also do inode_inc_ב.
> >> file_update_time takes care of the write case and all the places where we
> >> update iversion are protected by the i_mutex so there should be no extra
> >> i_lock overhead in the normal non-exported fs case. Thanks,
> >>
> >
> > Ok did some basic benchmarking with dd, I ran
> >
> > dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
> > dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
> > dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
> >
> > 3 times with the patch and without the patch. With the worst case scenario
> > there is about a 40% longer run time, going from on average 12 seconds to 17
> > seconds.
>
>
> do you mean that the "with the patch" is 40% slower then "without the patch"
> in the same "bs=1 count=10485760 test" ?
>
> Then count me clueless. Do you understand this difference?
>
> The way I read your patch the inode is copied and written to disk exactly the
> same number of times, as before. Only that now i_version is also updated
> together with ctime and/or mtime. What is the fundamental difference then?
> Is it just that i_version++ in-memory operation?
>
Yeah but for every write() call we have to do this in-memory operation (via
file_update_time()), so in the worst case where you are doing 1 byte writes
where you would notice this sort of overhead you get a 40% overhead just from
spin_lock(&inode->i_lock);
inode->i_version++;
spin_unlock(&inode->i_lock);
and then the subsequent mark_inode_dirty() call. What is likely saving us here
is that the 1 byte writes are happening fast enough that the normal ctime/mtime
operations aren't triggering a mark_inode_dirty() since it appears to happen
wihtin the same time, but now that we're doing the i_version++ we are calling
mark_inode_dirty() more than before. I'd have to profile it to verify that's
what is actually happening, but that means turning on ftrace and parsing a bunch
of output and man that sounds like more time than I want to waste on this. The
question is do we care that the worst case is so awful since the worst case is
likely to not show up often if at all? If we do then I guess the next step is
to add a fs flag for i_version so that people who are going to be exporting file
systems can get this without having to use a special option all the time.
Thanks,
Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
@ 2012-05-15 18:29 ` Josef Bacik
0 siblings, 0 replies; 20+ messages in thread
From: Josef Bacik @ 2012-05-15 18:29 UTC (permalink / raw)
To: Boaz Harrosh
Cc: Josef Bacik, linux-ext4, linux-nfs, bfields, adilger, tytso, jack
On Tue, May 15, 2012 at 09:17:16PM +0300, Boaz Harrosh wrote:
> On 05/15/2012 08:53 PM, Josef Bacik wrote:
>
> > On Tue, May 15, 2012 at 10:33:16AM -0400, Josef Bacik wrote:
> >> This makes MS_I_VERSION be turned on by default. Ext4 had been
> >> unconditionally doing i_version++ in a few cases anway so the mount option
> >> was kind of silly. This patch also removes the update in mark_inode_dirty
> >> and makes all of the cases where we update ctime also do inode_inc_ב.
> >> file_update_time takes care of the write case and all the places where we
> >> update iversion are protected by the i_mutex so there should be no extra
> >> i_lock overhead in the normal non-exported fs case. Thanks,
> >>
> >
> > Ok did some basic benchmarking with dd, I ran
> >
> > dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
> > dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
> > dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
> >
> > 3 times with the patch and without the patch. With the worst case scenario
> > there is about a 40% longer run time, going from on average 12 seconds to 17
> > seconds.
>
>
> do you mean that the "with the patch" is 40% slower then "without the patch"
> in the same "bs=1 count=10485760 test" ?
>
> Then count me clueless. Do you understand this difference?
>
> The way I read your patch the inode is copied and written to disk exactly the
> same number of times, as before. Only that now i_version is also updated
> together with ctime and/or mtime. What is the fundamental difference then?
> Is it just that i_version++ in-memory operation?
>
Yeah but for every write() call we have to do this in-memory operation (via
file_update_time()), so in the worst case where you are doing 1 byte writes
where you would notice this sort of overhead you get a 40% overhead just from
spin_lock(&inode->i_lock);
inode->i_version++;
spin_unlock(&inode->i_lock);
and then the subsequent mark_inode_dirty() call. What is likely saving us here
is that the 1 byte writes are happening fast enough that the normal ctime/mtime
operations aren't triggering a mark_inode_dirty() since it appears to happen
wihtin the same time, but now that we're doing the i_version++ we are calling
mark_inode_dirty() more than before. I'd have to profile it to verify that's
what is actually happening, but that means turning on ftrace and parsing a bunch
of output and man that sounds like more time than I want to waste on this. The
question is do we care that the worst case is so awful since the worst case is
likely to not show up often if at all? If we do then I guess the next step is
to add a fs flag for i_version so that people who are going to be exporting file
systems can get this without having to use a special option all the time.
Thanks,
Josef
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 18:29 ` Josef Bacik
(?)
@ 2012-05-15 19:55 ` Andreas Dilger
[not found] ` <AB91C817-DFEE-415F-8769-78831D72C6B7-m1MBpc4rdrD3fQ9qLvQP4Q@public.gmane.org>
-1 siblings, 1 reply; 20+ messages in thread
From: Andreas Dilger @ 2012-05-15 19:55 UTC (permalink / raw)
To: Josef Bacik; +Cc: Boaz Harrosh, linux-ext4, linux-nfs, bfields, tytso, jack
On 2012-05-15, at 12:29 PM, Josef Bacik wrote:
> On Tue, May 15, 2012 at 09:17:16PM +0300, Boaz Harrosh wrote:
>> On 05/15/2012 08:53 PM, Josef Bacik wrote:
>>> On Tue, May 15, 2012 at 10:33:16AM -0400, Josef Bacik wrote:
>>>> This makes MS_I_VERSION be turned on by default. Ext4 had been
>>>> unconditionally doing i_version++ in a few cases anway so the
>>>> mount option was kind of silly. This patch also removes the
>>>> update in mark_inode_dirty and makes all of the cases where we
>>>> update ctime also do inode_inc_version. file_update_time takes
>>>> care of the write case and all the places where we update
>>>> iversion are protected by the i_mutex so there should be no
>>>> extra i_lock overhead in the normal non-exported fs case.
>>>
>>> Ok did some basic benchmarking with dd, I ran
>>>
>>> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
>>> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
>>> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
>>>
>>> 3 times with the patch and without the patch. With the worst case
>>> scenario there is about a 40% longer run time, going from on average
>>> 12 seconds to 17 seconds. With the other two runs they are the same
>>> runtime with the 1 megabyte blocks.
Josef,
thanks for running these tests. This confirms that the problem
which has existed in the past with ext[34]_mark_inode_dirty()
still exists today, and for btrfs as well.
>> do you mean that the "with the patch" is 40% slower then "without the patch" in the same "bs=1 count=10485760 test" ?
>>
>> Then count me clueless. Do you understand this difference?
>>
>> The way I read your patch the inode is copied and written to disk
>> exactly the same number of times, as before. Only that now
>> i_version is also updated together with ctime and/or mtime. What
>> is the fundamental difference then?
>> Is it just that i_version++ in-memory operation?
>
> Yeah but for every write() call we have to do this in-memory operation (via file_update_time()), so in the worst case where you are doing 1
> byte writes where you would notice this sort of overhead you get a 40%
> overhead just from:
>
> spin_lock(&inode->i_lock);
> inode->i_version++;
> spin_unlock(&inode->i_lock);
>
> and then the subsequent mark_inode_dirty() call.
Right. That this is so noticeable (40% wallclock slowdown for
filesystem IO) means that the CPU usage is higher with the
patch than without, likely MUCH more than the 40% shown by the
wallclock time.
> What is likely saving us here is that the 1 byte writes are happening
> fast enough that the normal ctime/mtime operations aren't triggering
> a mark_inode_dirty() since it appears to happen within the same time,
> but now that we're doing the i_version++ we are calling
> mark_inode_dirty() more than before.
Right, and hence my objection to the "basic" version of this patch
that just turns on i_version updating for everyone.
> I'd have to profile it to verify that's what is actually happening,
> but that means turning on ftrace and parsing a bunch of output and
> man that sounds like more time than I want to waste on this. The
> question is do we care that the worst case is so awful since the
> worst case is likely to not show up often if at all?
Based on my experience, there are a LOT of badly written applications
in the world, and I don't think anyone would be happy with a 40%
slowdown, PLUS 100% CPU usage on the node while doing IO. I would
guess that the number of systems running badly-written applications
far exceeds the number of systems that are doing NFS exporting, so
I don't think this is a good tradeoff.
> If we do then I guess the next step is to add a fs flag for i_version
> so that people who are going to be exporting file systems can get
> this without having to use a special option all the time.
It should be fairly straight forward to have a flag set in the ext4
superblock (s_state flag?) that indicates that the filesystem has
been exported via NFS. There might be other optimizations that can
be done based on this (e.g. avoid some of the directory cookie
hijinx that are only needed if NFS has exported the filesystem and
needs to keep persistent cookies across reboots).
I think that the ext4_mark_inode_dirty() performance problem could
be at least partially fixed by deferring the copy of in-core inode
to on-disk inode to use a journal commit callback. This is far
more work than just setting a flag in the superblock, but it has
the potential to _improve_ performance rather than make it worse.
Cheers, Andreas
^ permalink raw reply [flat|nested] 20+ messages in thread
[parent not found: <20120515182914.GC1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 18:29 ` Josef Bacik
@ 2012-05-15 20:24 ` Boaz Harrosh
-1 siblings, 0 replies; 20+ messages in thread
From: Boaz Harrosh @ 2012-05-15 20:24 UTC (permalink / raw)
To: Josef Bacik
Cc: linux-ext4-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, bfields-uC3wQj2KruNg9hUCZPvPmw,
adilger-m1MBpc4rdrD3fQ9qLvQP4Q, tytso-3s7WtUTddSA,
jack-AlSwsSmVLrQ
On 05/15/2012 09:29 PM, Josef Bacik wrote:
> On Tue, May 15, 2012 at 09:17:16PM +0300, Boaz Harrosh wrote:
<snip>
>>
>> do you mean that the "with the patch" is 40% slower then "without the patch"
>> in the same "bs=1 count=10485760 test" ?
>>
>> Then count me clueless. Do you understand this difference?
>>
>> The way I read your patch the inode is copied and written to disk exactly the
>> same number of times, as before. Only that now i_version is also updated
>> together with ctime and/or mtime. What is the fundamental difference then?
>> Is it just that i_version++ in-memory operation?
>>
>
> Yeah but for every write() call we have to do this in-memory operation (via
> file_update_time()), so in the worst case where you are doing 1 byte writes
> where you would notice this sort of overhead you get a 40% overhead just from
>
> spin_lock(&inode->i_lock);
> inode->i_version++;
> spin_unlock(&inode->i_lock);
>
We should be optimizing this we can take the spin_lock once and change
the i_version and ctime/mtime at the same locking cycle.
> and then the subsequent mark_inode_dirty() call. What is likely saving us here
> is that the 1 byte writes are happening fast enough that the normal ctime/mtime
> operations aren't triggering a mark_inode_dirty() since it appears to happen
> wihtin the same time,
Ha, OK that Jiffy resolution of ctime. Hence the core problem BTW, changes come
in and we can't differentiate them.
So you are saying that there is somewhere in code that does
if (old_ctime != cur_ctime)
mark_inode_dirty()
Isn't the mark_inode_dirty() called every-time regardless of the value set
at ctime ?
> but now that we're doing the i_version++ we are calling
> mark_inode_dirty() more than before.
Can we fix that. Can we mark the inode dirty at Jiffy resolution?
In the case of a Server crash the nfs_changed attr is expected to reset
back to old value until such time that some client did a COMMIT. Up to
COMMIT it might happen that on disk version is older than runtime.
But the Jiffy resolution is a real bummer believe me.
> I'd have to profile it to verify that's
> what is actually happening, but that means turning on ftrace and parsing a bunch
> of output and man that sounds like more time than I want to waste on this. The
> question is do we care that the worst case is so awful since the worst case is
> likely to not show up often if at all? If we do then I guess the next step is
> to add a fs flag for i_version so that people who are going to be exporting file
> systems can get this without having to use a special option all the time.
> Thanks,
>
Lets just think about it a bit, Can we make it that disk updates happen just
as often as today and no more. But in memory inquiry of i_version gives us
a much higher resolution.
What about that Al's proposal of incrementing only after an NFSD inquiry
> Josef
Thanks for looking into this. We all hate this for a long time but never
took the time to fix it.
Boaz
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
@ 2012-05-15 20:24 ` Boaz Harrosh
0 siblings, 0 replies; 20+ messages in thread
From: Boaz Harrosh @ 2012-05-15 20:24 UTC (permalink / raw)
To: Josef Bacik; +Cc: linux-ext4, linux-nfs, bfields, adilger, tytso, jack
On 05/15/2012 09:29 PM, Josef Bacik wrote:
> On Tue, May 15, 2012 at 09:17:16PM +0300, Boaz Harrosh wrote:
<snip>
>>
>> do you mean that the "with the patch" is 40% slower then "without the patch"
>> in the same "bs=1 count=10485760 test" ?
>>
>> Then count me clueless. Do you understand this difference?
>>
>> The way I read your patch the inode is copied and written to disk exactly the
>> same number of times, as before. Only that now i_version is also updated
>> together with ctime and/or mtime. What is the fundamental difference then?
>> Is it just that i_version++ in-memory operation?
>>
>
> Yeah but for every write() call we have to do this in-memory operation (via
> file_update_time()), so in the worst case where you are doing 1 byte writes
> where you would notice this sort of overhead you get a 40% overhead just from
>
> spin_lock(&inode->i_lock);
> inode->i_version++;
> spin_unlock(&inode->i_lock);
>
We should be optimizing this we can take the spin_lock once and change
the i_version and ctime/mtime at the same locking cycle.
> and then the subsequent mark_inode_dirty() call. What is likely saving us here
> is that the 1 byte writes are happening fast enough that the normal ctime/mtime
> operations aren't triggering a mark_inode_dirty() since it appears to happen
> wihtin the same time,
Ha, OK that Jiffy resolution of ctime. Hence the core problem BTW, changes come
in and we can't differentiate them.
So you are saying that there is somewhere in code that does
if (old_ctime != cur_ctime)
mark_inode_dirty()
Isn't the mark_inode_dirty() called every-time regardless of the value set
at ctime ?
> but now that we're doing the i_version++ we are calling
> mark_inode_dirty() more than before.
Can we fix that. Can we mark the inode dirty at Jiffy resolution?
In the case of a Server crash the nfs_changed attr is expected to reset
back to old value until such time that some client did a COMMIT. Up to
COMMIT it might happen that on disk version is older than runtime.
But the Jiffy resolution is a real bummer believe me.
> I'd have to profile it to verify that's
> what is actually happening, but that means turning on ftrace and parsing a bunch
> of output and man that sounds like more time than I want to waste on this. The
> question is do we care that the worst case is so awful since the worst case is
> likely to not show up often if at all? If we do then I guess the next step is
> to add a fs flag for i_version so that people who are going to be exporting file
> systems can get this without having to use a special option all the time.
> Thanks,
>
Lets just think about it a bit, Can we make it that disk updates happen just
as often as today and no more. But in memory inquiry of i_version gives us
a much higher resolution.
What about that Al's proposal of incrementing only after an NFSD inquiry
> Josef
Thanks for looking into this. We all hate this for a long time but never
took the time to fix it.
Boaz
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ext4: fix how i_version is modified and turn it on by default V2
2012-05-15 17:53 ` Josef Bacik
[not found] ` <20120515175308.GB1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2012-05-16 1:36 ` Ted Ts'o
1 sibling, 0 replies; 20+ messages in thread
From: Ted Ts'o @ 2012-05-16 1:36 UTC (permalink / raw)
To: Josef Bacik; +Cc: linux-ext4, linux-nfs, bfields, adilger, jack
On Tue, May 15, 2012 at 01:53:08PM -0400, Josef Bacik wrote:
>
> Ok did some basic benchmarking with dd, I ran
>
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1 count=10485760
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=1000
> dd if=/dev/zero of=/mnt/btrfs-test/file bs=1M count=5000
>
> 3 times with the patch and without the patch. With the worst case
> scenario there is about a 40% longer run time, going from on average
> 12 seconds to 17 seconds. With the other two runs they are the same
> runtime with the 1 megabyte blocks. So the question is, do we care
> about this worst case since any sane application developer isn't
> going to do writes that small?
Even if there's no runtime change, it's also useful to measure the CPU
utilization. If there's an increase in CPU utilization, then it can
show up in workloads and benchmarks which are sensitive to CPU
utilization as well as disk utilization, e.g., TPC-C/H.
But since it takes so long for performance teams to notice, they tend
to get very cranky when they observe regressions. So for changes like
this it's really important to measure any changes in CPU utilization,
especially on larger on SMP systems when there multiple processes
writing to the same file at high rates --- you know, like what an
Enterprise database might do to a table space file. :-)
- Ted
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2012-05-16 1:36 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-15 14:33 [PATCH] ext4: fix how i_version is modified and turn it on by default V2 Josef Bacik
2012-05-15 14:33 ` Josef Bacik
2012-05-15 15:18 ` Jan Kara
2012-05-15 17:53 ` Josef Bacik
[not found] ` <20120515175308.GB1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2012-05-15 18:17 ` Boaz Harrosh
2012-05-15 18:17 ` Boaz Harrosh
2012-05-15 18:29 ` Josef Bacik
2012-05-15 18:29 ` Josef Bacik
2012-05-15 19:55 ` Andreas Dilger
[not found] ` <AB91C817-DFEE-415F-8769-78831D72C6B7-m1MBpc4rdrD3fQ9qLvQP4Q@public.gmane.org>
2012-05-15 20:05 ` Josef Bacik
2012-05-15 20:05 ` Josef Bacik
[not found] ` <20120515200533.GD1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2012-05-15 21:00 ` J. Bruce Fields
2012-05-15 21:00 ` J. Bruce Fields
[not found] ` <20120515210029.GA11932-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2012-05-15 21:08 ` Josef Bacik
2012-05-15 21:08 ` Josef Bacik
2012-05-15 22:19 ` Andreas Dilger
2012-05-15 22:19 ` Andreas Dilger
[not found] ` <20120515182914.GC1907-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2012-05-15 20:24 ` Boaz Harrosh
2012-05-15 20:24 ` Boaz Harrosh
2012-05-16 1:36 ` Ted Ts'o
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.