* [PATCH v3 1/6] ext4: convert i_fc_lock to spinlock
2022-04-19 17:31 [PATCH v3 0/6] ext4: improve commit path performance for fast commit Harshad Shirwadkar
@ 2022-04-19 17:31 ` Harshad Shirwadkar
2022-04-27 15:58 ` Jan Kara
2022-04-19 17:31 ` [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait Harshad Shirwadkar
` (4 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Harshad Shirwadkar @ 2022-04-19 17:31 UTC (permalink / raw)
To: linux-ext4; +Cc: riteshh, jack, tytso, Harshad Shirwadkar
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Convert ext4_inode_info->i_fc_lock to spinlock to avoid sleeping
in invalid contexts.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
fs/ext4/ext4.h | 7 +++++--
fs/ext4/fast_commit.c | 22 ++++++++++------------
fs/ext4/super.c | 2 +-
3 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1d79012c5a5b..46ca0979e73b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1065,8 +1065,11 @@ struct ext4_inode_info {
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
- /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
- struct mutex i_fc_lock;
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 3d72565ec6e8..c278060a15bc 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -380,7 +380,7 @@ static int ext4_fc_track_template(
int ret;
tid = handle->h_transaction->t_tid;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
update = true;
} else {
@@ -388,7 +388,7 @@ static int ext4_fc_track_template(
ei->i_sync_tid = tid;
}
ret = __fc_track_fn(inode, args, update);
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
if (!enqueue)
return ret;
@@ -420,11 +420,11 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
struct dentry *dentry = dentry_update->dentry;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -437,7 +437,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
kmem_cache_free(ext4_fc_dentry_cachep, node);
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_NOMEM, NULL);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -ENOMEM;
}
memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
@@ -471,7 +471,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
spin_unlock(&sbi->s_fc_lock);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return 0;
}
@@ -611,10 +611,8 @@ static int __track_range(struct inode *inode, void *arg, bool update)
struct __track_range_args *__arg =
(struct __track_range_args *)arg;
- if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
- ext4_debug("Special inode %ld being modified\n", inode->i_ino);
+ if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb))
return -ECANCELED;
- }
oldstart = ei->i_fc_lblk_start;
@@ -906,15 +904,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
struct ext4_extent *ex;
int ret;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (ei->i_fc_lblk_len == 0) {
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
return 0;
}
old_blk_size = ei->i_fc_lblk_start;
new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
ei->i_fc_lblk_len = 0;
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ae98b07285d2..d6fc12782657 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1347,7 +1347,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
- mutex_init(&ei->i_fc_lock);
+ spin_lock_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
--
2.36.0.rc0.470.gd361397f0d-goog
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/6] ext4: convert i_fc_lock to spinlock
2022-04-19 17:31 ` [PATCH v3 1/6] ext4: convert i_fc_lock to spinlock Harshad Shirwadkar
@ 2022-04-27 15:58 ` Jan Kara
0 siblings, 0 replies; 12+ messages in thread
From: Jan Kara @ 2022-04-27 15:58 UTC (permalink / raw)
To: Harshad Shirwadkar; +Cc: linux-ext4, riteshh, jack, tytso
On Tue 19-04-22 10:31:38, Harshad Shirwadkar wrote:
> From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
>
> Convert ext4_inode_info->i_fc_lock to spinlock to avoid sleeping
> in invalid contexts.
>
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/ext4.h | 7 +++++--
> fs/ext4/fast_commit.c | 22 ++++++++++------------
> fs/ext4/super.c | 2 +-
> 3 files changed, 16 insertions(+), 15 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1d79012c5a5b..46ca0979e73b 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1065,8 +1065,11 @@ struct ext4_inode_info {
> /* Fast commit wait queue for this inode */
> wait_queue_head_t i_fc_wait;
>
> - /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
> - struct mutex i_fc_lock;
> + /*
> + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
> + * and inode's EXT4_FC_STATE_COMMITTING state bit.
> + */
> + spinlock_t i_fc_lock;
>
> /*
> * i_disksize keeps track of what the inode size is ON DISK, not
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index 3d72565ec6e8..c278060a15bc 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -380,7 +380,7 @@ static int ext4_fc_track_template(
> int ret;
>
> tid = handle->h_transaction->t_tid;
> - mutex_lock(&ei->i_fc_lock);
> + spin_lock(&ei->i_fc_lock);
> if (tid == ei->i_sync_tid) {
> update = true;
> } else {
> @@ -388,7 +388,7 @@ static int ext4_fc_track_template(
> ei->i_sync_tid = tid;
> }
> ret = __fc_track_fn(inode, args, update);
> - mutex_unlock(&ei->i_fc_lock);
> + spin_unlock(&ei->i_fc_lock);
>
> if (!enqueue)
> return ret;
> @@ -420,11 +420,11 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
> struct dentry *dentry = dentry_update->dentry;
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>
> - mutex_unlock(&ei->i_fc_lock);
> + spin_unlock(&ei->i_fc_lock);
> node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
> if (!node) {
> ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
> - mutex_lock(&ei->i_fc_lock);
> + spin_lock(&ei->i_fc_lock);
> return -ENOMEM;
> }
>
> @@ -437,7 +437,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
> kmem_cache_free(ext4_fc_dentry_cachep, node);
> ext4_fc_mark_ineligible(inode->i_sb,
> EXT4_FC_REASON_NOMEM, NULL);
> - mutex_lock(&ei->i_fc_lock);
> + spin_lock(&ei->i_fc_lock);
> return -ENOMEM;
> }
> memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
> @@ -471,7 +471,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
> list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
> }
> spin_unlock(&sbi->s_fc_lock);
> - mutex_lock(&ei->i_fc_lock);
> + spin_lock(&ei->i_fc_lock);
>
> return 0;
> }
> @@ -611,10 +611,8 @@ static int __track_range(struct inode *inode, void *arg, bool update)
> struct __track_range_args *__arg =
> (struct __track_range_args *)arg;
>
> - if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
> - ext4_debug("Special inode %ld being modified\n", inode->i_ino);
> + if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb))
> return -ECANCELED;
> - }
>
> oldstart = ei->i_fc_lblk_start;
>
> @@ -906,15 +904,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
> struct ext4_extent *ex;
> int ret;
>
> - mutex_lock(&ei->i_fc_lock);
> + spin_lock(&ei->i_fc_lock);
> if (ei->i_fc_lblk_len == 0) {
> - mutex_unlock(&ei->i_fc_lock);
> + spin_unlock(&ei->i_fc_lock);
> return 0;
> }
> old_blk_size = ei->i_fc_lblk_start;
> new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
> ei->i_fc_lblk_len = 0;
> - mutex_unlock(&ei->i_fc_lock);
> + spin_unlock(&ei->i_fc_lock);
>
> cur_lblk_off = old_blk_size;
> jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index ae98b07285d2..d6fc12782657 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1347,7 +1347,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
> atomic_set(&ei->i_unwritten, 0);
> INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
> ext4_fc_init_inode(&ei->vfs_inode);
> - mutex_init(&ei->i_fc_lock);
> + spin_lock_init(&ei->i_fc_lock);
> return &ei->vfs_inode;
> }
>
> --
> 2.36.0.rc0.470.gd361397f0d-goog
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait
2022-04-19 17:31 [PATCH v3 0/6] ext4: improve commit path performance for fast commit Harshad Shirwadkar
2022-04-19 17:31 ` [PATCH v3 1/6] ext4: convert i_fc_lock to spinlock Harshad Shirwadkar
@ 2022-04-19 17:31 ` Harshad Shirwadkar
2022-04-27 15:50 ` Jan Kara
2022-04-19 17:31 ` [PATCH v3 3/6] ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr Harshad Shirwadkar
` (3 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Harshad Shirwadkar @ 2022-04-19 17:31 UTC (permalink / raw)
To: linux-ext4; +Cc: riteshh, jack, tytso, Harshad Shirwadkar
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
If the inode that's being requested to track using ext4_fc_track_inode
is being committed, then wait until the inode finishes the
commit. Also, add calls to ext4_fc_track_inode at the right places.
With this patch, now calling ext4_reserve_inode_write() results in
inode being tracked for next fast commit. A subtle lock ordering
requirement with i_data_sem (which is documented in the code) requires
that ext4_fc_track_inode() be called before grabbing i_data_sem. So,
this patch also adds explicit ext4_fc_track_inode() calls in places
where i_data_sem grabbed.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
fs/ext4/fast_commit.c | 38 ++++++++++++++++++++++++++++++++++++++
fs/ext4/inline.c | 3 +++
fs/ext4/inode.c | 5 ++++-
3 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index c278060a15bc..55f4c5ddd8e5 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -574,8 +574,14 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
return 0;
}
+/*
+ * Track inode as part of the next fast commit. If the inode is being
+ * committed, this function will wait for the commit to finish.
+ */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
@@ -595,6 +601,38 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) ||
+ ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE) ||
+ !list_empty(&ei->i_fc_list))
+ return;
+
+ /*
+ * If we come here, we may sleep while waiting for the inode to
+ * commit. We shouldn't be holding i_data_sem in write mode when we go
+ * to sleep since the commit path needs to grab the lock while
+ * committing the inode.
+ */
+ WARN_ON(lockdep_is_held_type(&ei->i_data_sem, 1));
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(handle, inode, ret);
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c076262770d..5d824d528f1c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -586,6 +586,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
goto out;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_destroy_inline_data_nolock(handle, inode);
if (ret)
goto out;
@@ -686,6 +687,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
goto convert;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
EXT4_JTR_NONE);
if (ret)
@@ -967,6 +969,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
if (ret < 0)
goto out_release_page;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
EXT4_JTR_NONE);
if (ret)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 955dd978dccf..e88940251afd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -622,6 +622,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
*/
map->m_flags &= ~EXT4_MAP_FLAGS;
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -4058,7 +4059,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
/* If there are blocks to remove, do it */
if (stop_block > first_block) {
-
+ ext4_fc_track_inode(handle, inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
@@ -4214,6 +4215,7 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
+ ext4_fc_track_inode(handle, inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
@@ -5734,6 +5736,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
--
2.36.0.rc0.470.gd361397f0d-goog
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait
2022-04-19 17:31 ` [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait Harshad Shirwadkar
@ 2022-04-27 15:50 ` Jan Kara
2022-05-19 14:28 ` harshad shirwadkar
0 siblings, 1 reply; 12+ messages in thread
From: Jan Kara @ 2022-04-27 15:50 UTC (permalink / raw)
To: Harshad Shirwadkar; +Cc: linux-ext4, riteshh, jack, tytso
On Tue 19-04-22 10:31:39, Harshad Shirwadkar wrote:
> From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
>
> If the inode that's being requested to track using ext4_fc_track_inode
> is being committed, then wait until the inode finishes the
> commit. Also, add calls to ext4_fc_track_inode at the right places.
>
> With this patch, now calling ext4_reserve_inode_write() results in
> inode being tracked for next fast commit. A subtle lock ordering
> requirement with i_data_sem (which is documented in the code) requires
> that ext4_fc_track_inode() be called before grabbing i_data_sem. So,
> this patch also adds explicit ext4_fc_track_inode() calls in places
> where i_data_sem grabbed.
>
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> ---
> fs/ext4/fast_commit.c | 38 ++++++++++++++++++++++++++++++++++++++
> fs/ext4/inline.c | 3 +++
> fs/ext4/inode.c | 5 ++++-
> 3 files changed, 45 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index c278060a15bc..55f4c5ddd8e5 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> + /*
> + * If we come here, we may sleep while waiting for the inode to
> + * commit. We shouldn't be holding i_data_sem in write mode when we go
> + * to sleep since the commit path needs to grab the lock while
> + * committing the inode.
> + */
> + WARN_ON(lockdep_is_held_type(&ei->i_data_sem, 1));
Note that we can deadlock even if we had i_data_sem for reading because
another reader is not allowed to get the rwsem if there is writer waiting
for it. So we need to check even that case here.
> + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> +#if (BITS_PER_LONG < 64)
> + DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> + EXT4_STATE_FC_COMMITTING);
> + wq = bit_waitqueue(&ei->i_state_flags,
> + EXT4_STATE_FC_COMMITTING);
> +#else
> + DEFINE_WAIT_BIT(wait, &ei->i_flags,
> + EXT4_STATE_FC_COMMITTING);
> + wq = bit_waitqueue(&ei->i_flags,
> + EXT4_STATE_FC_COMMITTING);
> +#endif
> + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
> + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
> + schedule();
> + finish_wait(wq, &wait.wq_entry);
> + }
> +
> ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
> trace_ext4_fc_track_inode(handle, inode, ret);
As we discussed in the call we should tell lockdep that this is equivalent
to lock+unlock of let's say fc_committing_lock and the fastcommit code
setting / clearing EXT4_STATE_FC_COMMITTING is equivalent to lock / unlock
of fc_committing_lock. That way we get proper lockdep tracking of this
waiting primitive.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait
2022-04-27 15:50 ` Jan Kara
@ 2022-05-19 14:28 ` harshad shirwadkar
2022-05-19 16:11 ` Jan Kara
0 siblings, 1 reply; 12+ messages in thread
From: harshad shirwadkar @ 2022-05-19 14:28 UTC (permalink / raw)
To: Jan Kara; +Cc: Ext4 Developers List, Ritesh Harjani, Theodore Y. Ts'o
Thanks for the review. Some questions / comments below:
On Wed, 27 Apr 2022 at 08:50, Jan Kara <jack@suse.cz> wrote:
>
> On Tue 19-04-22 10:31:39, Harshad Shirwadkar wrote:
> > From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> >
> > If the inode that's being requested to track using ext4_fc_track_inode
> > is being committed, then wait until the inode finishes the
> > commit. Also, add calls to ext4_fc_track_inode at the right places.
> >
> > With this patch, now calling ext4_reserve_inode_write() results in
> > inode being tracked for next fast commit. A subtle lock ordering
> > requirement with i_data_sem (which is documented in the code) requires
> > that ext4_fc_track_inode() be called before grabbing i_data_sem. So,
> > this patch also adds explicit ext4_fc_track_inode() calls in places
> > where i_data_sem grabbed.
> >
> > Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> > ---
> > fs/ext4/fast_commit.c | 38 ++++++++++++++++++++++++++++++++++++++
> > fs/ext4/inline.c | 3 +++
> > fs/ext4/inode.c | 5 ++++-
> > 3 files changed, 45 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> > index c278060a15bc..55f4c5ddd8e5 100644
> > --- a/fs/ext4/fast_commit.c
> > +++ b/fs/ext4/fast_commit.c
> > + /*
> > + * If we come here, we may sleep while waiting for the inode to
> > + * commit. We shouldn't be holding i_data_sem in write mode when we go
> > + * to sleep since the commit path needs to grab the lock while
> > + * committing the inode.
> > + */
> > + WARN_ON(lockdep_is_held_type(&ei->i_data_sem, 1));
>
> Note that we can deadlock even if we had i_data_sem for reading because
> another reader is not allowed to get the rwsem if there is writer waiting
> for it. So we need to check even that case here.
I turned the above WARN_ON to check if data_sem is held in either read
or write mode and now I am seeing many other places where data_sem
gets grabbed in read mode before calling ext4_fc_track_inode(). We
either need to call ext4_fc_track_inode() before all
ext4_reserve_inode_write() in all those cases, or ensure that places
that acquire in data_sem in write mode, wait if there's an ongoing
commit and only then lock data_sem.
>
> > + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> > +#if (BITS_PER_LONG < 64)
> > + DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> > + EXT4_STATE_FC_COMMITTING);
> > + wq = bit_waitqueue(&ei->i_state_flags,
> > + EXT4_STATE_FC_COMMITTING);
> > +#else
> > + DEFINE_WAIT_BIT(wait, &ei->i_flags,
> > + EXT4_STATE_FC_COMMITTING);
> > + wq = bit_waitqueue(&ei->i_flags,
> > + EXT4_STATE_FC_COMMITTING);
> > +#endif
> > + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
> > + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
> > + schedule();
> > + finish_wait(wq, &wait.wq_entry);
> > + }
> > +
> > ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
> > trace_ext4_fc_track_inode(handle, inode, ret);
>
> As we discussed in the call we should tell lockdep that this is equivalent
> to lock+unlock of let's say fc_committing_lock and the fastcommit code
> setting / clearing EXT4_STATE_FC_COMMITTING is equivalent to lock / unlock
> of fc_committing_lock. That way we get proper lockdep tracking of this
> waiting primitive.
Sure, so you mean just adding __acquires() / __releases() annotations
in these places right?
- Harshad
>
> Honza
>
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait
2022-05-19 14:28 ` harshad shirwadkar
@ 2022-05-19 16:11 ` Jan Kara
0 siblings, 0 replies; 12+ messages in thread
From: Jan Kara @ 2022-05-19 16:11 UTC (permalink / raw)
To: harshad shirwadkar
Cc: Jan Kara, Ext4 Developers List, Ritesh Harjani, Theodore Y. Ts'o
On Thu 19-05-22 07:28:11, harshad shirwadkar wrote:
> On Wed, 27 Apr 2022 at 08:50, Jan Kara <jack@suse.cz> wrote:
> >
> > On Tue 19-04-22 10:31:39, Harshad Shirwadkar wrote:
> > > From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> > >
> > > If the inode that's being requested to track using ext4_fc_track_inode
> > > is being committed, then wait until the inode finishes the
> > > commit. Also, add calls to ext4_fc_track_inode at the right places.
> > >
> > > With this patch, now calling ext4_reserve_inode_write() results in
> > > inode being tracked for next fast commit. A subtle lock ordering
> > > requirement with i_data_sem (which is documented in the code) requires
> > > that ext4_fc_track_inode() be called before grabbing i_data_sem. So,
> > > this patch also adds explicit ext4_fc_track_inode() calls in places
> > > where i_data_sem grabbed.
> > >
> > > Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> > > ---
> > > fs/ext4/fast_commit.c | 38 ++++++++++++++++++++++++++++++++++++++
> > > fs/ext4/inline.c | 3 +++
> > > fs/ext4/inode.c | 5 ++++-
> > > 3 files changed, 45 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> > > index c278060a15bc..55f4c5ddd8e5 100644
> > > --- a/fs/ext4/fast_commit.c
> > > +++ b/fs/ext4/fast_commit.c
> > > + /*
> > > + * If we come here, we may sleep while waiting for the inode to
> > > + * commit. We shouldn't be holding i_data_sem in write mode when we go
> > > + * to sleep since the commit path needs to grab the lock while
> > > + * committing the inode.
> > > + */
> > > + WARN_ON(lockdep_is_held_type(&ei->i_data_sem, 1));
> >
> > Note that we can deadlock even if we had i_data_sem for reading because
> > another reader is not allowed to get the rwsem if there is writer waiting
> > for it. So we need to check even that case here.
> I turned the above WARN_ON to check if data_sem is held in either read
> or write mode and now I am seeing many other places where data_sem
> gets grabbed in read mode before calling ext4_fc_track_inode().
Hum, that's unpleasant. Which places BTW? I'd expect this mostly happens in
ext4_map_blocks() paths. Anywhere else?
> We either need to call ext4_fc_track_inode() before all
> ext4_reserve_inode_write() in all those cases, or ensure that places that
> acquire in data_sem in write mode, wait if there's an ongoing commit and
> only then lock data_sem.
Neither is particularly appealing I guess. As we discussed on the call,
probably using extent status tree for the fastcommit code might be a
cleaner option.
> > > + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> > > +#if (BITS_PER_LONG < 64)
> > > + DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> > > + EXT4_STATE_FC_COMMITTING);
> > > + wq = bit_waitqueue(&ei->i_state_flags,
> > > + EXT4_STATE_FC_COMMITTING);
> > > +#else
> > > + DEFINE_WAIT_BIT(wait, &ei->i_flags,
> > > + EXT4_STATE_FC_COMMITTING);
> > > + wq = bit_waitqueue(&ei->i_flags,
> > > + EXT4_STATE_FC_COMMITTING);
> > > +#endif
> > > + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
> > > + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
> > > + schedule();
> > > + finish_wait(wq, &wait.wq_entry);
> > > + }
> > > +
> > > ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
> > > trace_ext4_fc_track_inode(handle, inode, ret);
> >
> > As we discussed in the call we should tell lockdep that this is equivalent
> > to lock+unlock of let's say fc_committing_lock and the fastcommit code
> > setting / clearing EXT4_STATE_FC_COMMITTING is equivalent to lock / unlock
> > of fc_committing_lock. That way we get proper lockdep tracking of this
> > waiting primitive.
> Sure, so you mean just adding __acquires() / __releases() annotations
> in these places right?
No. __acquires() and __releases() are sparse annotations. Sparse does also
some lock checking but it is a static checker and is pretty trivial. Here you
need to instrument lockdep. We do similar thing in jbd2 to tell lockdep
that starting a transaction handle effectively behaves as a lock - see the
rwsem_acquire_read() and rwsem_release() in start_this_handle() and
stop_this_handle(), respectively.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 3/6] ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr
2022-04-19 17:31 [PATCH v3 0/6] ext4: improve commit path performance for fast commit Harshad Shirwadkar
2022-04-19 17:31 ` [PATCH v3 1/6] ext4: convert i_fc_lock to spinlock Harshad Shirwadkar
2022-04-19 17:31 ` [PATCH v3 2/6] ext4: for committing inode, make ext4_fc_track_inode wait Harshad Shirwadkar
@ 2022-04-19 17:31 ` Harshad Shirwadkar
2022-04-27 16:02 ` Jan Kara
2022-04-19 17:31 ` [PATCH v3 4/6] ext4: rework fast commit commit path Harshad Shirwadkar
` (2 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Harshad Shirwadkar @ 2022-04-19 17:31 UTC (permalink / raw)
To: linux-ext4; +Cc: riteshh, jack, tytso, Harshad Shirwadkar
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Mark inode dirty first and then grab i_data_sem in ext4_setattr().
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
fs/ext4/inode.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e88940251afd..6eae0804c6fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5455,11 +5455,12 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
inode->i_sb->s_blocksize_bits);
- down_write(&EXT4_I(inode)->i_data_sem);
- EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
if (!error)
error = rc;
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
--
2.36.0.rc0.470.gd361397f0d-goog
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v3 3/6] ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr
2022-04-19 17:31 ` [PATCH v3 3/6] ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr Harshad Shirwadkar
@ 2022-04-27 16:02 ` Jan Kara
0 siblings, 0 replies; 12+ messages in thread
From: Jan Kara @ 2022-04-27 16:02 UTC (permalink / raw)
To: Harshad Shirwadkar; +Cc: linux-ext4, riteshh, jack, tytso
On Tue 19-04-22 10:31:40, Harshad Shirwadkar wrote:
> From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
>
> Mark inode dirty first and then grab i_data_sem in ext4_setattr().
>
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index e88940251afd..6eae0804c6fd 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5455,11 +5455,12 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
> (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
> inode->i_sb->s_blocksize_bits);
>
> - down_write(&EXT4_I(inode)->i_data_sem);
> - EXT4_I(inode)->i_disksize = attr->ia_size;
> rc = ext4_mark_inode_dirty(handle, inode);
> if (!error)
> error = rc;
> + down_write(&EXT4_I(inode)->i_data_sem);
> + EXT4_I(inode)->i_disksize = attr->ia_size;
> +
Hum, this isn't going to fly because ext4_mark_inode_dirty() copies data
from ext4_inode_info to the on-disk buffer and thus new i_disksize will not
be stored on the disk after your change.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 4/6] ext4: rework fast commit commit path
2022-04-19 17:31 [PATCH v3 0/6] ext4: improve commit path performance for fast commit Harshad Shirwadkar
` (2 preceding siblings ...)
2022-04-19 17:31 ` [PATCH v3 3/6] ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr Harshad Shirwadkar
@ 2022-04-19 17:31 ` Harshad Shirwadkar
2022-04-19 17:31 ` [PATCH v3 5/6] ext4: drop i_fc_updates from inode fc info Harshad Shirwadkar
2022-04-19 17:31 ` [PATCH v3 6/6] ext4: update code documentation Harshad Shirwadkar
5 siblings, 0 replies; 12+ messages in thread
From: Harshad Shirwadkar @ 2022-04-19 17:31 UTC (permalink / raw)
To: linux-ext4; +Cc: riteshh, jack, tytso, Harshad Shirwadkar
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
This patch reworks fast commit's commit path to remove locking the
journal for the entire duration of a fast commit. Instead, we only lock
the journal while marking all the eligible inodes as "committing". This
allows handles to make progress in parallel with the fast commit.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/fast_commit.c | 71 ++++++++++++++++++++++++++++---------------
fs/jbd2/journal.c | 2 --
2 files changed, 46 insertions(+), 27 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 55f4c5ddd8e5..75f5abbf7c5d 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -287,20 +287,30 @@ void ext4_fc_del(struct inode *inode)
(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
return;
-restart:
spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
return;
}
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ list_del_init(&ei->i_fc_list);
/*
* Since this inode is getting removed, let's also remove all FC
@@ -323,8 +333,6 @@ void ext4_fc_del(struct inode *inode)
fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
kfree(fc_dentry->fcd_name.name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
- return;
}
/*
@@ -1013,19 +1021,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
spin_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
- while (atomic_read(&ei->i_fc_updates)) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&ei->i_fc_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&ei->i_fc_updates)) {
- spin_unlock(&sbi->s_fc_lock);
- schedule();
- spin_lock(&sbi->s_fc_lock);
- }
- finish_wait(&ei->i_fc_wait, &wait);
- }
spin_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(ei->jinode);
if (ret)
@@ -1138,6 +1133,16 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;
+ /* Lock the journal */
+ jbd2_journal_lock_updates(journal);
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
ret = ext4_fc_submit_inode_data_all(journal);
if (ret)
return ret;
@@ -1188,6 +1193,18 @@ static int ext4_fc_perform_commit(journal_t *journal)
ret = ext4_fc_write_inode(inode, &crc);
if (ret)
goto out;
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
+ smp_mb();
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
spin_lock(&sbi->s_fc_lock);
}
spin_unlock(&sbi->s_fc_lock);
@@ -1325,13 +1342,17 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
spin_lock(&sbi->s_fc_lock);
list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
i_fc_list) {
- list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
if (iter->i_sync_tid <= tid)
ext4_fc_reset_inode(&iter->vfs_inode);
- /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
smp_mb();
+ list_del_init(&iter->i_fc_list);
#if (BITS_PER_LONG < 64)
wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c2cf74b01ddb..06b885628b1c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -757,7 +757,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
}
journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
write_unlock(&journal->j_state_lock);
- jbd2_journal_lock_updates(journal);
return 0;
}
@@ -769,7 +768,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
*/
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
- jbd2_journal_unlock_updates(journal);
if (journal->j_fc_cleanup_callback)
journal->j_fc_cleanup_callback(journal, 0, tid);
write_lock(&journal->j_state_lock);
--
2.36.0.rc0.470.gd361397f0d-goog
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 5/6] ext4: drop i_fc_updates from inode fc info
2022-04-19 17:31 [PATCH v3 0/6] ext4: improve commit path performance for fast commit Harshad Shirwadkar
` (3 preceding siblings ...)
2022-04-19 17:31 ` [PATCH v3 4/6] ext4: rework fast commit commit path Harshad Shirwadkar
@ 2022-04-19 17:31 ` Harshad Shirwadkar
2022-04-19 17:31 ` [PATCH v3 6/6] ext4: update code documentation Harshad Shirwadkar
5 siblings, 0 replies; 12+ messages in thread
From: Harshad Shirwadkar @ 2022-04-19 17:31 UTC (permalink / raw)
To: linux-ext4; +Cc: riteshh, jack, tytso, Harshad Shirwadkar
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
The new logic introduced in this series does not require tracking number
of active handles open on an inode. So, drop it.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/ext4.h | 5 ----
fs/ext4/fast_commit.c | 70 -------------------------------------------
2 files changed, 75 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 46ca0979e73b..dd8d1623fbac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1059,9 +1059,6 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
- /* Number of ongoing updates on this inode */
- atomic_t i_fc_updates;
-
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
@@ -2930,8 +2927,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
-void ext4_fc_start_update(struct inode *inode);
-void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 75f5abbf7c5d..266c95ff0d74 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -201,76 +201,6 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
- atomic_set(&ei->i_fc_updates, 0);
-}
-
-/* This function must be called with sbi->s_fc_lock held. */
-static void ext4_fc_wait_committing_inode(struct inode *inode)
-__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
-{
- wait_queue_head_t *wq;
- struct ext4_inode_info *ei = EXT4_I(inode);
-
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
-}
-
-/*
- * Inform Ext4's fast about start of an inode update
- *
- * This function is called by the high level call VFS callbacks before
- * performing any inode update. This function blocks if there's an ongoing
- * fast commit on the inode in question.
- */
-void ext4_fc_start_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
- return;
-
-restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list))
- goto out;
-
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-out:
- atomic_inc(&ei->i_fc_updates);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
-}
-
-/*
- * Stop inode update and wake up waiting fast commits if any.
- */
-void ext4_fc_stop_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
- return;
-
- if (atomic_dec_and_test(&ei->i_fc_updates))
- wake_up_all(&ei->i_fc_wait);
}
/*
--
2.36.0.rc0.470.gd361397f0d-goog
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 6/6] ext4: update code documentation
2022-04-19 17:31 [PATCH v3 0/6] ext4: improve commit path performance for fast commit Harshad Shirwadkar
` (4 preceding siblings ...)
2022-04-19 17:31 ` [PATCH v3 5/6] ext4: drop i_fc_updates from inode fc info Harshad Shirwadkar
@ 2022-04-19 17:31 ` Harshad Shirwadkar
5 siblings, 0 replies; 12+ messages in thread
From: Harshad Shirwadkar @ 2022-04-19 17:31 UTC (permalink / raw)
To: linux-ext4; +Cc: riteshh, jack, tytso, Harshad Shirwadkar
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
This patch updates code documentation to reflect the commit path changes
made in this series.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/fast_commit.c | 39 ++++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 266c95ff0d74..1691cfd89954 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -49,14 +49,21 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the exsiting handles finish and no new handles can start.
+ * [2] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [3] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [4] Submit data buffers of all the committing inodes.
+ * [5] Wait for [4] to complete.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
* All the inode updates must call ext4_fc_start_update() before starting an
* update. If such an ongoing update is present, fast commit waits for it to
@@ -142,6 +149,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
@@ -156,13 +170,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop. Once we do that we can drop file
- * system locking during commit path.
+ * 1) Handle more ineligible cases.
*
- * 2) Handle more ineligible cases.
+ * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
+ * status tree. This would get rid of the need to call ext4_fc_track_inode()
+ * before acquiring i_data_sem. To do that we would need to ensure that
+ * modified extents from the extent status tree are not evicted from memory.
*/
#include <trace/events/ext4.h>
--
2.36.0.rc0.470.gd361397f0d-goog
^ permalink raw reply related [flat|nested] 12+ messages in thread