All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ritesh Harjani <riteshh@linux.ibm.com>
To: Harshad Shirwadkar <harshadshirwadkar@gmail.com>,
	linux-ext4@vger.kernel.org
Cc: tytso@mit.edu
Subject: Re: [PATCH v9 7/9] ext4: fast commit recovery path
Date: Fri, 9 Oct 2020 22:44:03 +0530	[thread overview]
Message-ID: <dcf720bd-8644-3001-75b4-d845a2495f72@linux.ibm.com> (raw)
In-Reply-To: <20200919005451.3899779-8-harshadshirwadkar@gmail.com>



On 9/19/20 6:24 AM, Harshad Shirwadkar wrote:
> This patch adds fast commit recovery path support for Ext4 file
> system. We add several helper functions that are similar in spirit to
> e2fsprogs journal recovery path handlers. Example of such functions
> include - a simple block allocator, idempotent block bitmap update
> function etc. Using these routines and the fast commit log in the fast
> commit area, the recovery path (ext4_fc_replay()) performs fast commit
> log recovery.
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> ---
>   fs/ext4/balloc.c            |   7 +-
>   fs/ext4/ext4.h              |  26 ++
>   fs/ext4/ext4_jbd2.c         |   2 +-
>   fs/ext4/extents.c           | 261 +++++++++++
>   fs/ext4/extents_status.c    |  24 +
>   fs/ext4/fast_commit.c       | 881 +++++++++++++++++++++++++++++++++++-
>   fs/ext4/fast_commit.h       |  40 ++
>   fs/ext4/ialloc.c            | 165 ++++++-
>   fs/ext4/inode.c             |  89 ++--
>   fs/ext4/ioctl.c             |   6 +-
>   fs/ext4/mballoc.c           | 208 ++++++++-
>   fs/ext4/namei.c             | 149 +++---
>   fs/ext4/super.c             |  21 +
>   include/trace/events/ext4.h |  56 ++-
>   14 files changed, 1804 insertions(+), 131 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 48c3df47748d..77108c99ae90 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -368,7 +368,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>   				      struct buffer_head *bh)
>   {
>   	ext4_fsblk_t	blk;
> -	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
> +	struct ext4_group_info *grp;
> +
> +	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
> +	grp = ext4_get_group_info(sb, block_group);
> 
>   	if (buffer_verified(bh))
>   		return 0;
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 27d48d166e5d..372a38292ed1 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1167,6 +1167,7 @@ struct ext4_inode_info {
>   #define EXT4_FC_COMMITTING		0x0010	/* File system underoing a fast
>   						 * commit.
>   						 */
> +#define EXT4_FC_REPLAY			0x0020	/* Fast commit replay ongoing */
> 
>   /*
>    * Misc. filesystem flags
> @@ -1658,6 +1659,10 @@ struct ext4_sb_info {
>   	struct buffer_head *s_fc_bh;
>   	struct ext4_fc_stats s_fc_stats;
>   	u64 s_fc_avg_commit_time;
> +#ifdef CONFIG_EXT4_DEBUG
> +	int s_fc_debug_max_replay;
> +#endif
> +	struct ext4_fc_replay_state s_fc_replay_state;
>   };
> 
>   static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> @@ -2700,6 +2705,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
>   			  struct dx_hash_info *hinfo);
> 
>   /* ialloc.c */
> +extern int ext4_mark_inode_used(struct super_block *sb, int ino);
>   extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
>   				      const struct qstr *qstr, __u32 goal,
>   				      uid_t *owner, __u32 i_flags,
> @@ -2741,6 +2747,8 @@ void ext4_fc_stop_ineligible(struct super_block *sb);
>   void ext4_fc_start_update(struct inode *inode);
>   void ext4_fc_stop_update(struct inode *inode);
>   void ext4_fc_del(struct inode *inode);
> +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
> +void ext4_fc_replay_cleanup(struct super_block *sb);
>   int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
>   int __init ext4_fc_init_dentry_cache(void);
> 
> @@ -2773,8 +2781,12 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
>   				ext4_fsblk_t block, unsigned long count);
>   extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
>   extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
> +extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
> +		       int len, int state);
> 
>   /* inode.c */
> +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
> +			 struct ext4_inode_info *ei);
>   int ext4_inode_is_fast_symlink(struct inode *inode);
>   struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
>   struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
> @@ -2821,6 +2833,8 @@ extern int  ext4_sync_inode(handle_t *, struct inode *);
>   extern void ext4_dirty_inode(struct inode *, int);
>   extern int ext4_change_inode_journal_flag(struct inode *, int);
>   extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> +extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
> +			  struct ext4_iloc *iloc);
>   extern int ext4_inode_attach_jinode(struct inode *inode);
>   extern int ext4_can_truncate(struct inode *inode);
>   extern int ext4_truncate(struct inode *);
> @@ -2854,12 +2868,15 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
>   /* ioctl.c */
>   extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
>   extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
> +extern void ext4_reset_inode_seed(struct inode *inode);
> 
>   /* migrate.c */
>   extern int ext4_ext_migrate(struct inode *);
>   extern int ext4_ind_migrate(struct inode *inode);
> 
>   /* namei.c */
> +extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
> +			     struct inode *inode);
>   extern int ext4_dirblock_csum_verify(struct inode *inode,
>   				     struct buffer_head *bh);
>   extern int ext4_orphan_add(handle_t *, struct inode *);
> @@ -3426,6 +3443,10 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
>   extern int ext4_ci_compare(const struct inode *parent,
>   			   const struct qstr *fname,
>   			   const struct qstr *entry, bool quick);
> +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
> +			 struct inode *inode);
> +extern int __ext4_link(struct inode *dir, struct inode *inode,
> +		       struct dentry *dentry);
> 
>   #define S_SHIFT 12
>   static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
> @@ -3526,6 +3547,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
>   extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
>   				       int check_cred, int restart_cred,
>   				       int revoke_cred);
> +extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
> +extern int ext4_ext_replay_set_iblocks(struct inode *inode);
> +extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
> +		int len, int unwritten, ext4_fsblk_t pblk);
> +extern int ext4_ext_clear_bb(struct inode *inode);
> 
> 
>   /* move_extent.c */
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index 760b9ee49dc0..0fd0c42a4f7d 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -100,7 +100,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
>   		return ERR_PTR(err);
> 
>   	journal = EXT4_SB(sb)->s_journal;
> -	if (!journal)
> +	if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return ext4_get_nojournal();
>   	return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
>   				   GFP_NOFS, type, line);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 8de236fedade..29945f1172fc 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -5804,3 +5804,264 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
> 
>   	return err ? err : mapped;
>   }
> +
> +/*
> + * Updates physical block address and unwritten status of extent starting at
> + * lblk start and of len. If such an extent doesn't exist, this function
> + * splits the extent tree appropriately to create an extent like this.
> + * This function is called in Ext4 fast commit replay path. Returns 0 on success
> + * and error on failure.
> + */
> +int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
> +		int len, int unwritten, ext4_fsblk_t pblk)
> +{
> +	struct ext4_ext_path *path = NULL, *ppath;
> +	struct ext4_extent *ex;
> +	int ret;
> +
> +	path = ext4_find_extent(inode, start, NULL, 0);
> +	if (!path)
> +		return -EINVAL;
> +	ex = path[path->p_depth].p_ext;
> +	if (!ex) {
> +		ret = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	if (le32_to_cpu(ex->ee_block) != start ||
> +		ext4_ext_get_actual_len(ex) != len) {
> +		/* We need to split this extent to match our extent first */
> +		ppath = path;
> +		down_write(&EXT4_I(inode)->i_data_sem);
> +		ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
> +		up_write(&EXT4_I(inode)->i_data_sem);
> +		if (ret)
> +			goto out;
> +		kfree(path);
> +		path = ext4_find_extent(inode, start, NULL, 0);
> +		if (IS_ERR(path))
> +			return -1;
> +		ppath = path;
> +		ex = path[path->p_depth].p_ext;
> +		WARN_ON(le32_to_cpu(ex->ee_block) != start);
> +		if (ext4_ext_get_actual_len(ex) != len) {
> +			down_write(&EXT4_I(inode)->i_data_sem);
> +			ret = ext4_force_split_extent_at(NULL, inode, &ppath,
> +							 start + len, 1);
> +			up_write(&EXT4_I(inode)->i_data_sem);
> +			if (ret)
> +				goto out;
> +			kfree(path);
> +			path = ext4_find_extent(inode, start, NULL, 0);
> +			if (IS_ERR(path))
> +				return -EINVAL;
> +			ex = path[path->p_depth].p_ext;
> +		}
> +	}
> +	if (unwritten)
> +		ext4_ext_mark_unwritten(ex);
> +	else
> +		ext4_ext_mark_initialized(ex);
> +	ext4_ext_store_pblock(ex, pblk);
> +	down_write(&EXT4_I(inode)->i_data_sem);
> +	ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
> +	up_write(&EXT4_I(inode)->i_data_sem);
> +out:
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +	ext4_mark_inode_dirty(NULL, inode);
> +	return ret;
> +}
> +
> +/* Try to shrink the extent tree */
> +void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
> +{
> +	struct ext4_ext_path *path = NULL;
> +	struct ext4_extent *ex;
> +	ext4_lblk_t old_cur, cur = 0;
> +
> +	while (cur < end) {
> +		path = ext4_find_extent(inode, cur, NULL, 0);
> +		if (IS_ERR(path))
> +			return;
> +		ex = path[path->p_depth].p_ext;
> +		if (!ex) {
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			ext4_mark_inode_dirty(NULL, inode);
> +			return;
> +		}
> +		old_cur = cur;
> +		cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
> +		if (cur <= old_cur)
> +			cur = old_cur + 1;
> +		ext4_ext_try_to_merge(NULL, inode, path, ex);
> +		down_write(&EXT4_I(inode)->i_data_sem);
> +		ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
> +		up_write(&EXT4_I(inode)->i_data_sem);
> +		ext4_mark_inode_dirty(NULL, inode);
> +		ext4_ext_drop_refs(path);
> +		kfree(path);
> +	}
> +}
> +
> +/* Check if *cur is a hole and if it is, skip it */
> +static void skip_hole(struct inode *inode, ext4_lblk_t *cur)
> +{
> +	int ret;
> +	struct ext4_map_blocks map;
> +
> +	map.m_lblk = *cur;
> +	map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
> +
> +	ret = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (ret != 0)
> +		return;
> +	*cur = *cur + map.m_len;
> +}
> +
> +/* Count number of blocks used by this inode and update i_blocks */
> +int ext4_ext_replay_set_iblocks(struct inode *inode)
> +{
> +	struct ext4_ext_path *path = NULL, *path2 = NULL;
> +	struct ext4_extent *ex;
> +	ext4_lblk_t cur = 0, end;
> +	int numblks = 0, i, ret = 0;
> +	ext4_fsblk_t cmp1, cmp2;
> +	struct ext4_map_blocks map;
> +
> +	/* Determin the size of the file first */
> +	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
> +					EXT4_EX_NOCACHE);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +	ex = path[path->p_depth].p_ext;
> +	if (!ex) {
> +		ext4_ext_drop_refs(path);
> +		kfree(path);
> +		goto out;
> +	}
> +	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +
> +	/* Count the number of data blocks */
> +	cur = 0;
> +	while (cur < end) {
> +		map.m_lblk = cur;
> +		map.m_len = end - cur;
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0)
> +			break;
> +		if (ret > 0)
> +			numblks += ret;
> +		cur = cur + map.m_len;
> +	}
> +
> +	/*
> +	 * Count the number of extent tree blocks. We do it by looking up
> +	 * two successive extents and determining the difference between
> +	 * their paths. When path is different for 2 successive extents
> +	 * we compare the blocks in the path at each level and increment
> +	 * iblocks by total number of differences found.
> +	 */
> +	cur = 0;
> +	skip_hole(inode, &cur);
> +	path = ext4_find_extent(inode, cur, NULL, 0);
> +	if (IS_ERR(path))
> +		goto out;
> +	numblks += path->p_depth;
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +	while (cur < end) {
> +		path = ext4_find_extent(inode, cur, NULL, 0);
> +		if (IS_ERR(path))
> +			break;
> +		ex = path[path->p_depth].p_ext;
> +		if (!ex) {
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			return 0;
> +		}
> +		cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
> +					ext4_ext_get_actual_len(ex));
> +		skip_hole(inode, &cur);
> +
> +		path2 = ext4_find_extent(inode, cur, NULL, 0);
> +		if (IS_ERR(path2)) {
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			break;
> +		}
> +		ex = path2[path2->p_depth].p_ext;
> +		for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
> +			cmp1 = cmp2 = 0;
> +			if (i <= path->p_depth)
> +				cmp1 = path[i].p_bh ?
> +					path[i].p_bh->b_blocknr : 0;
> +			if (i <= path2->p_depth)
> +				cmp2 = path2[i].p_bh ?
> +					path2[i].p_bh->b_blocknr : 0;
> +			if (cmp1 != cmp2 && cmp2 != 0)
> +				numblks++;
> +		}
> +		ext4_ext_drop_refs(path);
> +		ext4_ext_drop_refs(path2);
> +		kfree(path);
> +		kfree(path2);
> +	}
> +
> +out:
> +	inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
> +	ext4_mark_inode_dirty(NULL, inode);
> +	return 0;
> +}
> +
> +int ext4_ext_clear_bb(struct inode *inode)
> +{
> +	struct ext4_ext_path *path = NULL;
> +	struct ext4_extent *ex;
> +	ext4_lblk_t cur = 0, end;
> +	int j, ret = 0;
> +	struct ext4_map_blocks map;
> +
> +	/* Determin the size of the file first */
> +	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
> +					EXT4_EX_NOCACHE);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +	ex = path[path->p_depth].p_ext;
> +	if (!ex) {
> +		ext4_ext_drop_refs(path);
> +		kfree(path);
> +		return 0;
> +	}
> +	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +
> +	cur = 0;
> +	while (cur < end) {
> +		map.m_lblk = cur;
> +		map.m_len = end - cur;
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0)
> +			break;
> +		if (ret > 0) {
> +			path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
> +			if (!IS_ERR_OR_NULL(path)) {
> +				for (j = 0; j < path->p_depth; j++) {
> +
> +					ext4_mb_mark_bb(inode->i_sb,
> +							path[j].p_block, 1, 0);
> +				}
> +				ext4_ext_drop_refs(path);
> +				kfree(path);
> +			}
> +			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
> +		}
> +		cur = cur + map.m_len;
> +	}
> +
> +	return 0;
> +}
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index e75171535375..0a729027322d 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -311,6 +311,9 @@ void ext4_es_find_extent_range(struct inode *inode,
>   			       ext4_lblk_t lblk, ext4_lblk_t end,
>   			       struct extent_status *es)
>   {
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return;
> +
>   	trace_ext4_es_find_extent_range_enter(inode, lblk);
> 
>   	read_lock(&EXT4_I(inode)->i_es_lock);
> @@ -361,6 +364,9 @@ bool ext4_es_scan_range(struct inode *inode,
>   {
>   	bool ret;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return false;
> +
>   	read_lock(&EXT4_I(inode)->i_es_lock);
>   	ret = __es_scan_range(inode, matching_fn, lblk, end);
>   	read_unlock(&EXT4_I(inode)->i_es_lock);
> @@ -404,6 +410,9 @@ bool ext4_es_scan_clu(struct inode *inode,
>   {
>   	bool ret;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return false;
> +
>   	read_lock(&EXT4_I(inode)->i_es_lock);
>   	ret = __es_scan_clu(inode, matching_fn, lblk);
>   	read_unlock(&EXT4_I(inode)->i_es_lock);
> @@ -812,6 +821,9 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>   	int err = 0;
>   	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
>   		 lblk, len, pblk, status, inode->i_ino);
> 
> @@ -873,6 +885,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
>   	struct extent_status newes;
>   	ext4_lblk_t end = lblk + len - 1;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return;
> +
>   	newes.es_lblk = lblk;
>   	newes.es_len = len;
>   	ext4_es_store_pblock_status(&newes, pblk, status);
> @@ -908,6 +923,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
>   	struct rb_node *node;
>   	int found = 0;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	trace_ext4_es_lookup_extent_enter(inode, lblk);
>   	es_debug("lookup extent in block %u\n", lblk);
> 
> @@ -1419,6 +1437,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>   	int err = 0;
>   	int reserved = 0;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	trace_ext4_es_remove_extent(inode, lblk, len);
>   	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
>   		 lblk, len, inode->i_ino);
> @@ -1969,6 +1990,9 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
>   	struct extent_status newes;
>   	int err = 0;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
>   		 lblk, inode->i_ino);
> 
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index 6e251b5682b4..63429076ad59 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -170,7 +170,8 @@ void ext4_fc_start_update(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return;
> 
>   restart:
> @@ -209,7 +210,8 @@ void ext4_fc_stop_update(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return;
> 
>   	if (atomic_dec_and_test(&ei->i_fc_updates))
> @@ -224,11 +226,8 @@ void ext4_fc_del(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> -		return;
> -
> -
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return;
> 
>   restart:
> @@ -270,6 +269,10 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
>   {
>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
> 
> +	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
> +		return;
> +
>   	sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
>   	WARN_ON(reason >= EXT4_FC_REASON_MAX);
>   	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
> @@ -283,6 +286,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
>   {
>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
> 
> +	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
> +		return;
> +
>   	WARN_ON(reason >= EXT4_FC_REASON_MAX);
>   	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
>   	atomic_inc(&sbi->s_fc_ineligible_updates);
> @@ -295,6 +302,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
>    */
>   void ext4_fc_stop_ineligible(struct super_block *sb)
>   {
> +	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
> +		return;
> +
>   	EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
>   	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
>   }
> @@ -325,7 +336,8 @@ static int ext4_fc_track_template(
>   	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>   	int ret;
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (sbi->s_mount_state & EXT4_FC_REPLAY))
>   		return -EOPNOTSUPP;
> 
>   	if (ext4_fc_is_ineligible(inode->i_sb))
> @@ -1214,13 +1226,864 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
>   	trace_ext4_fc_stats(sb);
>   }
> 
> +/* Ext4 Replay Path Routines */
> +
> +/* Get length of a particular tlv */
> +static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
> +{
> +	return le16_to_cpu(tl->fc_len);
> +}
> +
> +/* Get a pointer to "value" of a tlv */
> +static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
> +{
> +	return (u8 *)tl + sizeof(*tl);
> +}
> +
> +/* Helper struct for dentry replay routines */
> +struct dentry_info_args {
> +	int parent_ino, dname_len, ino, inode_len;
> +	char *dname;
> +};
> +
> +static inline void tl_to_darg(struct dentry_info_args *darg,
> +				struct  ext4_fc_tl *tl)
> +{
> +	struct ext4_fc_dentry_info *fcd;
> +
> +	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
> +
> +	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
> +	darg->ino = le32_to_cpu(fcd->fc_ino);
> +	darg->dname = fcd->fc_dname;
> +	darg->dname_len = ext4_fc_tag_len(tl) -
> +			sizeof(struct ext4_fc_dentry_info);
> +}
> +
> +/* Unlink replay function */
> +static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct inode *inode, *old_parent;
> +	struct qstr entry;
> +	struct dentry_info_args darg;
> +	int ret = 0;
> +
> +	tl_to_darg(&darg, tl);
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
> +			darg.parent_ino, darg.dname_len);
> +
> +	entry.name = darg.dname;
> +	entry.len = darg.dname_len;
> +	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
> +
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode %d not found", darg.ino);
> +		return 0;
> +	}
> +
> +	old_parent = ext4_iget(sb, darg.parent_ino,
> +				EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(old_parent)) {
> +		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
> +		iput(inode);
> +		return 0;
> +	}
> +
> +	ret = __ext4_unlink(old_parent, &entry, inode);
> +	/* -ENOENT ok coz it might not exist anymore. */
> +	if (ret == -ENOENT)
> +		ret = 0;
> +	iput(old_parent);
> +	iput(inode);
> +	return ret;
> +}
> +
> +static int ext4_fc_replay_link_internal(struct super_block *sb,
> +				struct dentry_info_args *darg,
> +				struct inode *inode)
> +{
> +	struct inode *dir = NULL;
> +	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
> +	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
> +	int ret = 0;
> +
> +	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR(dir)) {
> +		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
> +		dir = NULL;
> +		goto out;
> +	}
> +
> +	dentry_dir = d_obtain_alias(dir);
> +	if (IS_ERR(dentry_dir)) {
> +		jbd_debug(1, "Failed to obtain dentry");
> +		dentry_dir = NULL;
> +		goto out;
> +	}
> +
> +	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
> +	if (!dentry_inode) {
> +		jbd_debug(1, "Inode dentry not created.");
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = __ext4_link(dir, inode, dentry_inode);
> +	/*
> +	 * It's possible that link already existed since data blocks
> +	 * for the dir in question got persisted before we crashed OR
> +	 * we replayed this tag and crashed before the entire replay
> +	 * could complete.
> +	 */
> +	if (ret && ret != -EEXIST) {
> +		jbd_debug(1, "Failed to link\n");
> +		goto out;
> +	}
> +
> +	ret = 0;
> +out:
> +	if (dentry_dir) {
> +		d_drop(dentry_dir);
> +		dput(dentry_dir);
> +	} else if (dir) {
> +		iput(dir);
> +	}
> +	if (dentry_inode) {
> +		d_drop(dentry_inode);
> +		dput(dentry_inode);
> +	}
> +
> +	return ret;
> +}
> +
> +/* Link replay function */
> +static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct inode *inode;
> +	struct dentry_info_args darg;
> +	int ret = 0;
> +
> +	tl_to_darg(&darg, tl);
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
> +			darg.parent_ino, darg.dname_len);
> +
> +	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode not found.");
> +		return 0;
> +	}
> +
> +	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
> +	iput(inode);
> +	return ret;
> +}
> +
> +/*
> + * Record all the modified inodes during replay. We use this later to setup
> + * block bitmaps correctly.
> + */
> +static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
> +{
> +	struct ext4_fc_replay_state *state;
> +	int i;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	for (i = 0; i < state->fc_modified_inodes_used; i++)
> +		if (state->fc_modified_inodes[i] == ino)
> +			return 0;
> +	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
> +		state->fc_modified_inodes_size +=
> +			EXT4_FC_REPLAY_REALLOC_INCREMENT;
> +		state->fc_modified_inodes = krealloc(
> +					state->fc_modified_inodes, sizeof(int) *
> +					state->fc_modified_inodes_size,
> +					GFP_KERNEL);
> +		if (!state->fc_modified_inodes)
> +			return -ENOMEM;
> +	}
> +	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
> +	return 0;
> +}
> +
> +/*
> + * Inode replay function
> + *
> + * If the tag is EXT4_FC_TAG_INODE_FULL, copy the entire inode to its location.
> + * If the tag is EXT4_FC_TAG_INODE_PARTIAL, copy everything except i_block.
> + * This is useful if i_block has been modified due to previous ADD_RANGE /
> + * DEL_RANGE tags.
> + */
> +static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct ext4_fc_inode *fc_inode;
> +	u8 *raw_fc_inode;
> +	struct inode *inode = NULL;
> +	struct ext4_iloc iloc;
> +	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
> +
> +	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
> +
> +	ino = le32_to_cpu(fc_inode->fc_ino);
> +	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
> +
> +	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
> +	if (!IS_ERR_OR_NULL(inode)) {
> +		ext4_ext_clear_bb(inode);
> +		iput(inode);
> +	}
> +
> +	ext4_fc_record_modified_inode(sb, ino);
> +
> +	raw_fc_inode = fc_inode->fc_raw_inode;
> +	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
> +	if (ret)
> +		goto out;
> +
> +	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
> +
> +	if (tag == EXT4_FC_TAG_INODE_FULL) {
> +		memcpy(ext4_raw_inode(&iloc), raw_fc_inode, inode_len);
> +	} else {
> +		memcpy(ext4_raw_inode(&iloc), raw_fc_inode,
> +			offsetof(struct ext4_inode, i_block));
> +		memcpy(&ext4_raw_inode(&iloc)->i_generation,
> +			&((struct ext4_inode *)(raw_fc_inode))->i_generation,
> +			inode_len -
> +			offsetof(struct ext4_inode, i_generation));
> +	}
> +
> +	/* Immediately update the inode on disk. */
> +	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
> +	sync_dirty_buffer(iloc.bh);
> +
> +	ret = ext4_mark_inode_used(sb, ino);
> +	if (ret)
> +		goto out;
> +
> +	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
> +	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode not found.");
> +		return -EFSCORRUPTED;
> +	}
> +
> +	/*
> +	 * Our allocator could have made different decisions than before
> +	 * crashing. This should be fixed but until then, we calculate
> +	 * the number of blocks the inode.
> +	 */
> +	if (tag == EXT4_FC_TAG_INODE_PARTIAL)
> +		ext4_ext_replay_set_iblocks(inode);
> +
> +	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
> +	ext4_reset_inode_seed(inode);
> +
> +	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
> +	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
> +	sync_dirty_buffer(iloc.bh);
> +	brelse(iloc.bh);
> +out:
> +	iput(inode);
> +	if (!ret)
> +		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
> +
> +	return 0;
> +}
> +
> +/*
> + * Dentry create replay function.
> + *
> + * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
> + * inode for which we are trying to create a dentry here, should already have
> + * been replayed before we start here.
> + */
> +static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	int ret = 0;
> +	struct inode *inode = NULL;
> +	struct inode *dir = NULL;
> +	struct dentry_info_args darg;
> +
> +	tl_to_darg(&darg, tl);
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
> +			darg.parent_ino, darg.dname_len);
> +
> +	/* This takes care of update group descriptor and other metadata */
> +	ret = ext4_mark_inode_used(sb, darg.ino);
> +	if (ret)
> +		goto out;
> +
> +	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "inode %d not found.", darg.ino);
> +		inode = NULL;
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		/*
> +		 * If we are creating a directory, we need to make sure that the
> +		 * dot and dot dot dirents are setup properly.
> +		 */
> +		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
> +		if (IS_ERR_OR_NULL(dir)) {
> +			jbd_debug(1, "Dir %d not found.", darg.ino);
> +			goto out;
> +		}
> +		ret = ext4_init_new_dir(NULL, dir, inode);
> +		iput(dir);
> +		if (ret) {
> +			ret = 0;
> +			goto out;
> +		}
> +	}
> +	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
> +	if (ret)
> +		goto out;
> +	set_nlink(inode, 1);
> +	ext4_mark_inode_dirty(NULL, inode);
> +out:
> +	if (inode)
> +		iput(inode);
> +	return ret;
> +}
> +
> +/*
> + * Record physical disk regions which are in use as per fast commit area. Our
> + * simple replay phase allocator excludes these regions from allocation.
> + */
> +static int ext4_fc_record_regions(struct super_block *sb, int ino,
> +		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
> +{
> +	struct ext4_fc_replay_state *state;
> +	struct ext4_fc_alloc_region *region;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	if (state->fc_regions_used == state->fc_regions_size) {
> +		state->fc_regions_size +=
> +			EXT4_FC_REPLAY_REALLOC_INCREMENT;
> +		state->fc_regions = krealloc(
> +					state->fc_regions,
> +					state->fc_regions_size *
> +					sizeof(struct ext4_fc_alloc_region),
> +					GFP_KERNEL);
> +		if (!state->fc_regions)
> +			return -ENOMEM;
> +	}
> +	region = &state->fc_regions[state->fc_regions_used++];
> +	region->ino = ino;
> +	region->lblk = lblk;
> +	region->pblk = pblk;
> +	region->len = len;
> +
> +	return 0;
> +}
> +
> +/* Replay add range tag */
> +static int ext4_fc_replay_add_range(struct super_block *sb,
> +				struct ext4_fc_tl *tl)
> +{
> +	struct ext4_fc_add_range *fc_add_ex;
> +	struct ext4_extent newex, *ex;
> +	struct inode *inode;
> +	ext4_lblk_t start, cur;
> +	int remaining, len;
> +	ext4_fsblk_t start_pblk;
> +	struct ext4_map_blocks map;
> +	struct ext4_ext_path *path = NULL;
> +	int ret;
> +
> +	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
> +	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
> +		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
> +		ext4_ext_get_actual_len(ex));
> +
> +	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
> +				EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode not found.");
> +		return 0;
> +	}
> +
> +	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
> +
> +	start = le32_to_cpu(ex->ee_block);
> +	start_pblk = ext4_ext_pblock(ex);
> +	len = ext4_ext_get_actual_len(ex);
> +
> +	cur = start;
> +	remaining = len;
> +	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
> +		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
> +		  inode->i_ino);
> +
> +	while (remaining > 0) {
> +		map.m_lblk = cur;
> +		map.m_len = remaining;
> +		map.m_pblk = 0;
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +
> +		if (ret < 0) {
> +			iput(inode);
> +			return 0;
> +		}
> +
> +		if (ret == 0) {
> +			/* Range not mapped */
> +			path = ext4_find_extent(inode, cur, NULL, 0);
> +			if (!path)
> +				continue;
> +			memset(&newex, 0, sizeof(newex));
> +			newex.ee_block = cpu_to_le32(cur);
> +			ext4_ext_store_pblock(
> +				&newex, start_pblk + cur - start);
> +			newex.ee_len = cpu_to_le16(map.m_len);
> +			if (ext4_ext_is_unwritten(ex))
> +				ext4_ext_mark_unwritten(&newex);
> +			down_write(&EXT4_I(inode)->i_data_sem);
> +			ret = ext4_ext_insert_extent(
> +				NULL, inode, &path, &newex, 0);
> +			up_write((&EXT4_I(inode)->i_data_sem));
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			if (ret) {
> +				iput(inode);
> +				return 0;
> +			}
> +			goto next;
> +		}
> +
> +		if (start_pblk + cur - start != map.m_pblk) {
> +			/* Logical to physical mapping changed */


Sorry, I am not sure I understand this correctly. Can we pls put more
comments on when and how this condition can happen?
I am sure I am missing something.

Also, what if the mapping changed and the start pblk is different,
but it's still an overlapping mapping?
Do we take care of that case here? I ask because we are
clearing the block bitmaps for map.m_len below.

> +			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
> +					ext4_ext_is_unwritten(ex),
> +					start_pblk + cur - start);
> +			if (ret) {
> +				iput(inode);
> +				return 0;
> +			}
> +			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
> +			goto next;
> +		}
> +
> +		/* Range is mapped and needs a state change */
> +		jbd_debug(1, "Converting from %d to %d %lld",
> +				map.m_flags & EXT4_MAP_UNWRITTEN,
> +			ext4_ext_is_unwritten(ex), map.m_pblk);
> +		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
> +					ext4_ext_is_unwritten(ex), map.m_pblk);
> +		if (ret) {
> +			iput(inode);
> +			return 0;
> +		}
> +		/*
> +		 * We may have split the extent tree while toggling the state.
> +		 * Try to shrink the exten tree now.

s/exten/extent



> +		 */
> +		ext4_ext_replay_shrink_inode(inode, start + len);
> +next:
> +		cur += map.m_len;
> +		remaining -= map.m_len;
> +	}
> +	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
> +					sb->s_blocksize_bits);
> +	iput(inode);
> +	return 0;
> +}
> +
> +/* Replay DEL_RANGE tag */
> +static int
> +ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct inode *inode;
> +	struct ext4_fc_del_range *lrange;
> +	struct ext4_map_blocks map;
> +	ext4_lblk_t cur, remaining;
> +	int ret;
> +
> +	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
> +	cur = le32_to_cpu(lrange->fc_lblk);
> +	remaining = le32_to_cpu(lrange->fc_len);
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
> +		le32_to_cpu(lrange->fc_ino), cur, remaining);
> +
> +	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
> +		return 0;
> +	}
> +
> +	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
> +
> +	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
> +			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
> +			le32_to_cpu(lrange->fc_len));
> +	while (remaining > 0) {
> +		map.m_lblk = cur;
> +		map.m_len = remaining;
> +
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0) {
> +			iput(inode);
> +			return 0;
> +		}
> +		if (ret > 0) {
> +			remaining -= ret;
> +			cur += ret;
> +			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
> +		} else {
> +			remaining -= map.m_len;
> +			cur += map.m_len;
> +		}
> +	}
> +
> +	ret = ext4_punch_hole(inode,
> +		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
> +		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
> +	if (ret)
> +		jbd_debug(1, "ext4_punch_hole returned %d", ret);
> +	ext4_ext_replay_shrink_inode(inode,
> +		i_size_read(inode) >> sb->s_blocksize_bits);
> +	ext4_mark_inode_dirty(NULL, inode);
> +	iput(inode);
> +
> +	return 0;
> +}
> +
> +static inline const char *tag2str(u16 tag)
> +{
> +	switch (tag) {
> +	case EXT4_FC_TAG_LINK:
> +		return "TAG_ADD_ENTRY";
> +	case EXT4_FC_TAG_UNLINK:
> +		return "TAG_DEL_ENTRY";
> +	case EXT4_FC_TAG_ADD_RANGE:
> +		return "TAG_ADD_RANGE";
> +	case EXT4_FC_TAG_CREAT:
> +		return "TAG_CREAT_DENTRY";
> +	case EXT4_FC_TAG_DEL_RANGE:
> +		return "TAG_DEL_RANGE";
> +	case EXT4_FC_TAG_INODE_FULL:
> +		return "TAG_INODE_FULL";
> +	case EXT4_FC_TAG_INODE_PARTIAL:
> +		return "TAG_INODE_PARTIAL";
> +	case EXT4_FC_TAG_PAD:
> +		return "TAG_PAD";
> +	case EXT4_FC_TAG_TAIL:
> +		return "TAG_TAIL";
> +	case EXT4_FC_TAG_HEAD:
> +		return "TAG_HEAD";
> +	default:
> +		return "TAG_ERROR";
> +	}
> +}
> +
> +void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)

static ?

> +{
> +	struct ext4_fc_replay_state *state;
> +	struct inode *inode;
> +	struct ext4_ext_path *path = NULL;
> +	struct ext4_map_blocks map;
> +	int i, ret, j;
> +	ext4_lblk_t cur, end;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	for (i = 0; i < state->fc_modified_inodes_used; i++) {
> +		inode = ext4_iget(sb, state->fc_modified_inodes[i],
> +			EXT4_IGET_NORMAL);
> +		if (IS_ERR_OR_NULL(inode)) {
> +			jbd_debug(1, "Inode %d not found.",
> +				state->fc_modified_inodes[i]);
> +			continue;
> +		}
> +		cur = 0;
> +		end = EXT_MAX_BLOCKS;
> +		while (cur < end) {
> +			map.m_lblk = cur;
> +			map.m_len = end - cur;
> +
> +			ret = ext4_map_blocks(NULL, inode, &map, 0);
> +			if (ret < 0)
> +				break;
> +
> +			if (ret > 0) {
> +				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
> +				if (!IS_ERR_OR_NULL(path)) {
> +					for (j = 0; j < path->p_depth; j++)
> +						ext4_mb_mark_bb(inode->i_sb,
> +							path[j].p_block, 1, 1);
> +					ext4_ext_drop_refs(path);
> +					kfree(path);
> +				}
> +				cur += ret;
> +				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
> +							map.m_len, 1);
> +			} else {
> +				cur = cur + (map.m_len ? map.m_len : 1);
> +			}
> +		}
> +		iput(inode);
> +	}
> +}
> +
> +/*
> + * Check if block is in excluded regions for block allocation. The simple
> + * allocator that runs during replay phase is calls this function to see
> + * if it is okay to use a block.
> + */
> +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
> +{
> +	int i;
> +	struct ext4_fc_replay_state *state;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	for (i = 0; i < state->fc_regions_valid; i++) {
> +		if (state->fc_regions[i].ino == 0 ||
> +			state->fc_regions[i].len == 0)
> +			continue;
> +		if (blk >= state->fc_regions[i].pblk &&
> +		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/* Cleanup function called after replay */
> +void ext4_fc_replay_cleanup(struct super_block *sb)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> +	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
> +	kfree(sbi->s_fc_replay_state.fc_regions);
> +	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
> +}
> +
> +/*
> + * Recovery Scan phase handler
> + *
> + * This function is called during the scan phase and is responsible
> + * for doing following things:
> + * - Make sure the fast commit area has valid tags for replay
> + * - Count number of tags that need to be replayed by the replay handler
> + * - Verify CRC
> + * - Create a list of excluded blocks for allocation during replay phase
> + *
> + * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
> + * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
> + * to indicate that scan has finished and JBD2 can now start replay phase.
> + * It returns a negative error to indicate that there was an error. At the end
> + * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
> + * to indicate the number of tags that need to replayed during the replay phase.
> + */
> +static int ext4_fc_replay_scan(journal_t *journal,
> +				struct buffer_head *bh, int off,
> +				tid_t expected_tid)
> +{
> +	struct super_block *sb = journal->j_private;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_fc_replay_state *state;
> +	int ret = JBD2_FC_REPLAY_CONTINUE;
> +	struct ext4_fc_add_range *ext;
> +	struct ext4_fc_tl *tl;
> +	struct ext4_fc_tail *tail;
> +	__u8 *start, *end;
> +	struct ext4_fc_head *head;
> +	struct ext4_extent *ex;
> +
> +	state = &sbi->s_fc_replay_state;
> +
> +	start = (u8 *)bh->b_data;
> +	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
> +
> +	if (state->fc_replay_expected_off == 0) {
> +		state->fc_cur_tag = 0;
> +		state->fc_replay_num_tags = 0;
> +		state->fc_crc = 0;
> +		state->fc_regions = NULL;
> +		state->fc_regions_valid = state->fc_regions_used =
> +			state->fc_regions_size = 0;
> +		/* Check if we can stop early */
> +		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
> +			!= EXT4_FC_TAG_HEAD)
> +			return 0;
> +	}
> +
> +	if (off != state->fc_replay_expected_off) {
> +		ret = -EFSCORRUPTED;
> +		goto out_err;
> +	}
> +
> +	state->fc_replay_expected_off++;
> +	fc_for_each_tl(start, end, tl) {
> +		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
> +			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
> +		switch (le16_to_cpu(tl->fc_tag)) {
> +		case EXT4_FC_TAG_ADD_RANGE:
> +			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
> +			ex = (struct ext4_extent *)&ext->fc_ex;
> +			ret = ext4_fc_record_regions(sb,
> +				le32_to_cpu(ext->fc_ino),
> +				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
> +				ext4_ext_get_actual_len(ex));
> +			if (ret < 0)
> +				break;
> +			ret = JBD2_FC_REPLAY_CONTINUE;
> +			fallthrough;
> +		case EXT4_FC_TAG_DEL_RANGE:
> +		case EXT4_FC_TAG_LINK:
> +		case EXT4_FC_TAG_UNLINK:
> +		case EXT4_FC_TAG_CREAT:
> +		case EXT4_FC_TAG_INODE_FULL:
> +		case EXT4_FC_TAG_INODE_PARTIAL:
> +		case EXT4_FC_TAG_PAD:
> +			state->fc_cur_tag++;
> +			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
> +					sizeof(*tl) + ext4_fc_tag_len(tl));
> +			break;
> +		case EXT4_FC_TAG_TAIL:
> +			state->fc_cur_tag++;
> +			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
> +			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
> +						sizeof(*tl) +
> +						offsetof(struct ext4_fc_tail,
> +						fc_crc));
> +			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
> +				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
> +				state->fc_replay_num_tags = state->fc_cur_tag;
> +				state->fc_regions_valid =
> +					state->fc_regions_used;
> +			} else {
> +				ret = state->fc_replay_num_tags ?
> +					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
> +			}
> +			state->fc_crc = 0;
> +			break;
> +		case EXT4_FC_TAG_HEAD:
> +			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
> +			if (le32_to_cpu(head->fc_features) &
> +				~EXT4_FC_SUPPORTED_FEATURES) {
> +				ret = -EOPNOTSUPP;
> +				break;
> +			}
> +			if (le32_to_cpu(head->fc_tid) != expected_tid) {
> +				ret = JBD2_FC_REPLAY_STOP;
> +				break;
> +			}
> +			state->fc_cur_tag++;
> +			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
> +					sizeof(*tl) + ext4_fc_tag_len(tl));


Why do we need to calculate state->fc_crc for HEAD?
I don't see us comparing this anywhere, right? Anything I missed?

> +			break;
> +		default:
> +			ret = state->fc_replay_num_tags ?
> +				JBD2_FC_REPLAY_STOP : -ECANCELED;
> +		}
> +		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
> +			break;
> +	}
> +
> +out_err:
> +	trace_ext4_fc_replay_scan(sb, ret, off);
> +	return ret;
> +}
> +
>   /*
>    * Main recovery path entry point.
> + * The meaning of return codes is similar as above.
>    */
>   static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
>   				enum passtype pass, int off, tid_t expected_tid)
>   {
> -	return 0;
> +	struct super_block *sb = journal->j_private;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_fc_tl *tl;
> +	__u8 *start, *end;
> +	int ret = JBD2_FC_REPLAY_CONTINUE;
> +	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
> +	struct ext4_fc_tail *tail;
> +
> +	if (pass == PASS_SCAN) {
> +		state->fc_current_pass = PASS_SCAN;
> +		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
> +	}
> +
> +	if (state->fc_current_pass != pass) {
> +		state->fc_current_pass = pass;
> +		sbi->s_mount_state |= EXT4_FC_REPLAY;
> +	}
> +	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
> +		jbd_debug(1, "Replay stops\n");
> +		ext4_fc_set_bitmaps_and_counters(sb);
> +		return 0;
> +	}
> +
> +#ifdef CONFIG_EXT4_DEBUG
> +	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
> +		pr_warn("Dropping fc block %d because max_replay set\n", off);
> +		return -EINVAL;
> +	}
> +#endif
> +
> +	start = (u8 *)bh->b_data;
> +	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
> +
> +	fc_for_each_tl(start, end, tl) {
> +		if (state->fc_replay_num_tags == 0) {
> +			ret = JBD2_FC_REPLAY_STOP;
> +			ext4_fc_set_bitmaps_and_counters(sb);
> +			break;
> +		}
> +		jbd_debug(3, "Replay phase, tag:%s\n",
> +				tag2str(le16_to_cpu(tl->fc_tag)));
> +		state->fc_replay_num_tags--;
> +		switch (le16_to_cpu(tl->fc_tag)) {
> +		case EXT4_FC_TAG_LINK:
> +			ret = ext4_fc_replay_link(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_UNLINK:
> +			ret = ext4_fc_replay_unlink(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_ADD_RANGE:
> +			ret = ext4_fc_replay_add_range(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_CREAT:
> +			ret = ext4_fc_replay_create(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_DEL_RANGE:
> +			ret = ext4_fc_replay_del_range(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_INODE_PARTIAL:
> +		case EXT4_FC_TAG_INODE_FULL:
> +			ret = ext4_fc_replay_inode(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_PAD:
> +			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
> +				ext4_fc_tag_len(tl), 0);
> +			break;
> +		case EXT4_FC_TAG_TAIL:
> +			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
> +				ext4_fc_tag_len(tl), 0);
> +			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
> +			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
> +			break;
> +		case EXT4_FC_TAG_HEAD:
> +			break;
> +		default:
> +			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
> +				ext4_fc_tag_len(tl), 0);
> +			ret = -ECANCELED;
> +			break;
> +		}
> +		if (ret < 0)
> +			break;
> +		ret = JBD2_FC_REPLAY_CONTINUE;
> +	}
> +	return ret;
>   }
> 
>   void ext4_fc_init(struct super_block *sb, journal_t *journal)
> diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
> index a541d2bbe24b..cf4d8772d055 100644
> --- a/fs/ext4/fast_commit.h
> +++ b/fs/ext4/fast_commit.h
> @@ -117,4 +117,44 @@ struct ext4_fc_stats {
>   	int fc_numblks;
>   };
> 
> +#define EXT4_FC_REPLAY_REALLOC_INCREMENT	4
> +
> +/*
> + * Physical block regions added to different inodes due to fast commit
> + * recovery. These are set during the SCAN phase. During the replay phase,
> + * our allocator excludes these from its allocation. This ensures that
> + * we don't accidentally allocating a block that is going to be used by
> + * another inode.
> + */
> +struct ext4_fc_alloc_region {
> +	ext4_lblk_t lblk;
> +	ext4_fsblk_t pblk;
> +	int ino, len;
> +};
> +
> +/*
> + * Fast commit replay state.
> + */
> +struct ext4_fc_replay_state {
> +	int fc_replay_num_tags;
> +	int fc_replay_expected_off;
> +	int fc_current_pass;
> +	int fc_cur_tag;
> +	int fc_crc;
> +	struct ext4_fc_alloc_region *fc_regions;
> +	int fc_regions_size, fc_regions_used, fc_regions_valid;
> +	int *fc_modified_inodes;
> +	int fc_modified_inodes_used, fc_modified_inodes_size;
> +};
> +
> +#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
> +
> +#define fc_for_each_tl(__start, __end, __tl)				\
> +	for (tl = (struct ext4_fc_tl *)start;				\
> +		(u8 *)tl < (u8 *)end;					\
> +		tl = (struct ext4_fc_tl *)((u8 *)tl +			\
> +					sizeof(struct ext4_fc_tl) +	\
> +					+ le16_to_cpu(tl->fc_len)))
> +
> +
>   #endif /* __FAST_COMMIT_H__ */
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index df25d38d6539..db9b9eeb9560 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -82,7 +82,12 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
>   				      struct buffer_head *bh)
>   {
>   	ext4_fsblk_t	blk;
> -	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
> +	struct ext4_group_info *grp;
> +
> +	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
> +	grp = ext4_get_group_info(sb, block_group);
> 
>   	if (buffer_verified(bh))
>   		return 0;
> @@ -284,15 +289,17 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
>   	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
>   	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
>   	/* Don't bother if the inode bitmap is corrupt. */
> -	grp = ext4_get_group_info(sb, block_group);
>   	if (IS_ERR(bitmap_bh)) {
>   		fatal = PTR_ERR(bitmap_bh);
>   		bitmap_bh = NULL;
>   		goto error_return;
>   	}
> -	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
> -		fatal = -EFSCORRUPTED;
> -		goto error_return;
> +	if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +		grp = ext4_get_group_info(sb, block_group);
> +		if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
> +			fatal = -EFSCORRUPTED;
> +			goto error_return;
> +		}
>   	}
> 
>   	BUFFER_TRACE(bitmap_bh, "get_write_access");
> @@ -742,6 +749,119 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group,
>   	return 1;
>   }
> 
> +int ext4_mark_inode_used(struct super_block *sb, int ino)
> +{
> +	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
> +	struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
> +	struct ext4_group_desc *gdp;
> +	ext4_group_t group;
> +	int bit;
> +	int err = -EFSCORRUPTED;
> +
> +	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
> +		goto out;
> +
> +	group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
> +	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
> +	inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
> +	if (IS_ERR(inode_bitmap_bh))
> +		return PTR_ERR(inode_bitmap_bh);
> +
> +	if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
> +		err = 0;
> +		goto out;
> +	}
> +
> +	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
> +	if (!gdp || !group_desc_bh) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	ext4_set_bit(bit, inode_bitmap_bh->b_data);
> +
> +	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
> +	err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
> +	if (err) {
> +		ext4_std_error(sb, err);
> +		goto out;
> +	}
> +	sync_dirty_buffer(inode_bitmap_bh);

Shouldn't we handle error from sync_dirty_buffer()?

> +	BUFFER_TRACE(group_desc_bh, "get_write_access");

The above BUFFER_TRACE() is not correct. We should remove it from here.



> +
> +	/* We may have to initialize the block bitmap if it isn't already */
> +	if (ext4_has_group_desc_csum(sb) &&
> +	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> +		struct buffer_head *block_bitmap_bh;
> +
> +		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
> +		if (IS_ERR(block_bitmap_bh)) {
> +			err = PTR_ERR(block_bitmap_bh);
> +			goto out;
> +		}
> +
> +		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
> +		err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
> +		sync_dirty_buffer(block_bitmap_bh);
> +
> +		/* recheck and clear flag under lock if we still need to */
> +		ext4_lock_group(sb, group);
> +		if (ext4_has_group_desc_csum(sb) &&
> +		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
> +			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
> +			ext4_free_group_clusters_set(sb, gdp,
> +				ext4_free_clusters_after_init(sb, group, gdp));
> +			ext4_block_bitmap_csum_set(sb, group, gdp,
> +						   block_bitmap_bh);
> +			ext4_group_desc_csum_set(sb, group, gdp);
> +		}
> +		ext4_unlock_group(sb, group);
> +		brelse(block_bitmap_bh);
> +
> +		if (err) {
> +			ext4_std_error(sb, err);
> +			goto out;
> +		}
> +	}
> +
> +	/* Update the relevant bg descriptor fields */
> +	if (ext4_has_group_desc_csum(sb)) {
> +		int free;
> +
> +		ext4_lock_group(sb, group); /* while we modify the bg desc */
> +		free = EXT4_INODES_PER_GROUP(sb) -
> +			ext4_itable_unused_count(sb, gdp);
> +		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
> +			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
> +			free = 0;
> +		}
> +
> +		/*
> +		 * Check the relative inode number against the last used
> +		 * relative inode number in this group. if it is greater
> +		 * we need to update the bg_itable_unused count
> +		 */
> +		if (bit >= free)
> +			ext4_itable_unused_set(sb, gdp,
> +					(EXT4_INODES_PER_GROUP(sb) - bit - 1));
> +	} else {
> +		ext4_lock_group(sb, group);
> +	}
> +
> +	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
> +	if (ext4_has_group_desc_csum(sb)) {
> +		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
> +					   EXT4_INODES_PER_GROUP(sb) / 8);
> +		ext4_group_desc_csum_set(sb, group, gdp);
> +	}
> +
> +	ext4_unlock_group(sb, group);
> +	err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
> +	sync_dirty_buffer(group_desc_bh);
> +out:
> +	return err;
> +}
> +
>   /*
>    * There are two policies for allocating an inode.  If the new inode is
>    * a directory, then a forward search is made for a block group with both
> @@ -771,7 +891,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   	struct inode *ret;
>   	ext4_group_t i;
>   	ext4_group_t flex_group;
> -	struct ext4_group_info *grp;
> +	struct ext4_group_info *grp = NULL;
>   	int encrypt = 0;
> 
>   	/* Cannot create files in a deleted directory */
> @@ -909,15 +1029,21 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   		if (ext4_free_inodes_count(sb, gdp) == 0)
>   			goto next_group;
> 
> -		grp = ext4_get_group_info(sb, group);
> -		/* Skip groups with already-known suspicious inode tables */
> -		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
> -			goto next_group;
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +			grp = ext4_get_group_info(sb, group);
> +			/*
> +			 * Skip groups with already-known suspicious inode
> +			 * tables
> +			 */
> +			if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
> +				goto next_group;
> +		}
> 
>   		brelse(inode_bitmap_bh);
>   		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
>   		/* Skip groups with suspicious inode tables */
> -		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
> +		if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
> +		     && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
>   		    IS_ERR(inode_bitmap_bh)) {
>   			inode_bitmap_bh = NULL;
>   			goto next_group;
> @@ -936,7 +1062,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   			goto next_group;
>   		}
> 
> -		if (!handle) {
> +		if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
>   			BUG_ON(nblocks <= 0);
>   			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
>   				 handle_type, nblocks, 0,
> @@ -1040,9 +1166,15 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   	/* Update the relevant bg descriptor fields */
>   	if (ext4_has_group_desc_csum(sb)) {
>   		int free;
> -		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> -
> -		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
> +		struct ext4_group_info *grp = NULL;
> +
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +			grp = ext4_get_group_info(sb, group);
> +			down_read(&grp->alloc_sem); /*
> +						     * protect vs itable
> +						     * lazyinit
> +						     */
> +		}
>   		ext4_lock_group(sb, group); /* while we modify the bg desc */
>   		free = EXT4_INODES_PER_GROUP(sb) -
>   			ext4_itable_unused_count(sb, gdp);
> @@ -1058,7 +1190,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   		if (ino > free)
>   			ext4_itable_unused_set(sb, gdp,
>   					(EXT4_INODES_PER_GROUP(sb) - ino));
> -		up_read(&grp->alloc_sem);
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
> +			up_read(&grp->alloc_sem);
>   	} else {
>   		ext4_lock_group(sb, group);
>   	}
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 26eed76812f9..9dce088171cc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -101,8 +101,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
>   	return provided == calculated;
>   }
> 
> -static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
> -				struct ext4_inode_info *ei)
> +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
> +			 struct ext4_inode_info *ei)
>   {
>   	__u32 csum;
> 
> @@ -514,7 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>   		return -EFSCORRUPTED;
> 
>   	/* Lookup extent status tree firstly */
> -	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
> +	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
> +	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
>   		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
>   			map->m_pblk = ext4_es_pblock(&es) +
>   					map->m_lblk - es.es_lblk;
> @@ -827,7 +828,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
>   	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
>   	int err;
> 
> -	J_ASSERT(handle != NULL || create == 0);
> +	J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		 || handle != NULL || create == 0);
> 
>   	map.m_lblk = block;
>   	map.m_len = 1;
> @@ -843,7 +845,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
>   		return ERR_PTR(-ENOMEM);
>   	if (map.m_flags & EXT4_MAP_NEW) {
>   		J_ASSERT(create != 0);
> -		J_ASSERT(handle != NULL);
> +		J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +			 || (handle != NULL));
> 
>   		/*
>   		 * Now that we do not always journal data, we should
> @@ -4255,22 +4258,22 @@ int ext4_truncate(struct inode *inode)
>    * data in memory that is needed to recreate the on-disk version of this
>    * inode.
>    */
> -static int __ext4_get_inode_loc(struct inode *inode,
> -				struct ext4_iloc *iloc, int in_mem)
> +static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
> +				struct ext4_iloc *iloc, int in_mem,
> +				ext4_fsblk_t *ret_block)
>   {
>   	struct ext4_group_desc	*gdp;
>   	struct buffer_head	*bh;
> -	struct super_block	*sb = inode->i_sb;
>   	ext4_fsblk_t		block;
>   	struct blk_plug		plug;
>   	int			inodes_per_block, inode_offset;
> 
>   	iloc->bh = NULL;
> -	if (inode->i_ino < EXT4_ROOT_INO ||
> -	    inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
> +	if (ino < EXT4_ROOT_INO ||
> +	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
>   		return -EFSCORRUPTED;
> 
> -	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
> +	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
>   	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
>   	if (!gdp)
>   		return -EIO;
> @@ -4279,7 +4282,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   	 * Figure out the offset within the block group inode table
>   	 */
>   	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> -	inode_offset = ((inode->i_ino - 1) %
> +	inode_offset = ((ino - 1) %
>   			EXT4_INODES_PER_GROUP(sb));
>   	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
>   	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
> @@ -4380,7 +4383,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   		 * has in-inode xattrs, or we don't have this inode in memory.
>   		 * Read the block from disk.
>   		 */
> -		trace_ext4_load_inode(inode);
> +		trace_ext4_load_inode(sb, ino);
>   		get_bh(bh);
>   		bh->b_end_io = end_buffer_read_sync;
>   		submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
> @@ -4388,8 +4391,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   		wait_on_buffer(bh);
>   		if (!buffer_uptodate(bh)) {
>   		simulate_eio:
> -			ext4_error_inode_block(inode, block, EIO,
> -					       "unable to read itable block");
> +			if (ret_block)
> +				*ret_block = block;
>   			brelse(bh);
>   			return -EIO;
>   		}
> @@ -4399,11 +4402,43 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   	return 0;
>   }
> 
> +static int __ext4_get_inode_loc_noinmem(struct inode *inode,
> +					struct ext4_iloc *iloc)
> +{
> +	ext4_fsblk_t err_blk;
> +	int ret;
> +
> +	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
> +					&err_blk);
> +
> +	if (ret == -EIO)
> +		ext4_error_inode_block(inode, err_blk, EIO,
> +					"unable to read itable block");
> +
> +	return ret;
> +}
> +
>   int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
>   {
> +	ext4_fsblk_t err_blk;
> +	int ret;
> +
>   	/* We have all inode data except xattrs in memory here. */
> -	return __ext4_get_inode_loc(inode, iloc,
> -		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
> +	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
> +		!ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
> +
> +	if (ret == -EIO)
> +		ext4_error_inode_block(inode, err_blk, EIO,
> +					"unable to read itable block");
> +
> +	return ret;
> +}
> +
> +
> +int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
> +			  struct ext4_iloc *iloc)
> +{
> +	return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
>   }
> 
>   static bool ext4_should_enable_dax(struct inode *inode)
> @@ -4569,7 +4604,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   	ei = EXT4_I(inode);
>   	iloc.bh = NULL;
> 
> -	ret = __ext4_get_inode_loc(inode, &iloc, 0);
> +	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
>   	if (ret < 0)
>   		goto bad_inode;
>   	raw_inode = ext4_raw_inode(&iloc);
> @@ -4615,10 +4650,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   					      sizeof(gen));
>   	}
> 
> -	if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
> -	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
> -		ext4_error_inode_err(inode, function, line, 0, EFSBADCRC,
> -				     "iget: checksum invalid");
> +	if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
> +	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
> +	     (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
> +		ext4_error_inode_err(inode, function, line, 0,
> +				EFSBADCRC, "iget: checksum invalid");
>   		ret = -EFSBADCRC;
>   		goto bad_inode;
>   	}
> @@ -4772,9 +4808,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   		goto bad_inode;
>   	} else if (!ext4_has_inline_data(inode)) {
>   		/* validate the block references in the inode */
> -		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
> -		   (S_ISLNK(inode->i_mode) &&
> -		    !ext4_inode_is_fast_symlink(inode))) {
> +		if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
> +			(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
> +			(S_ISLNK(inode->i_mode) &&
> +			!ext4_inode_is_fast_symlink(inode)))) {
>   			if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>   				ret = ext4_ext_check_inode(inode);
>   			else
> @@ -5158,7 +5195,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
>   	} else {
>   		struct ext4_iloc iloc;
> 
> -		err = __ext4_get_inode_loc(inode, &iloc, 0);
> +		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
>   		if (err)
>   			return err;
>   		/*
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index d2f8f50deef6..f0381876a7e5 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -86,7 +86,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
>   	i_size_write(inode2, isize);
>   }
> 
> -static void reset_inode_seed(struct inode *inode)
> +void ext4_reset_inode_seed(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
>   	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> @@ -200,8 +200,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
> 
>   	inode->i_generation = prandom_u32();
>   	inode_bl->i_generation = prandom_u32();
> -	reset_inode_seed(inode);
> -	reset_inode_seed(inode_bl);
> +	ext4_reset_inode_seed(inode);
> +	ext4_reset_inode_seed(inode_bl);
> 
>   	ext4_discard_preallocations(inode, 0);
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 132c118d12e1..ea894529118a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -1508,14 +1508,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
> 
>   		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
>   		blocknr += EXT4_C2B(sbi, block);
> -		ext4_grp_locked_error(sb, e4b->bd_group,
> -				      inode ? inode->i_ino : 0,
> -				      blocknr,
> -				      "freeing already freed block "
> -				      "(bit %u); block bitmap corrupt.",
> -				      block);
> -		ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +			ext4_grp_locked_error(sb, e4b->bd_group,
> +					      inode ? inode->i_ino : 0,
> +					      blocknr,
> +					      "freeing already freed block (bit %u); block bitmap corrupt.",
> +					      block);
> +			ext4_mark_group_bitmap_corrupted(
> +				sb, e4b->bd_group,
>   				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
> +		}
>   		mb_regenerate_buddy(e4b);
>   		goto done;
>   	}
> @@ -3302,6 +3304,86 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
>   	return err;
>   }
> 
> +/*
> + * Idempotent helper for Ext4 fast commit replay path to set the state of
> + * blocks in bitmaps and update counters.
> + */
> +void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
> +			int len, int state)
> +{
> +	struct buffer_head *bitmap_bh = NULL;
> +	struct ext4_group_desc *gdp;
> +	struct buffer_head *gdp_bh;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	ext4_group_t group;
> +	ext4_fsblk_t cluster;

I guess we never use the variable `cluster`, so we may as well drop it.

-ritesh


  parent reply	other threads:[~2020-10-09 17:14 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-19  0:54 [PATCH v9 0/9] ext4: add fast commits feature Harshad Shirwadkar
2020-09-19  0:54 ` [PATCH v9 1/9] doc: update ext4 and journalling docs to include fast commit feature Harshad Shirwadkar
2020-09-22 17:50   ` Darrick J. Wong
2020-09-24  6:56     ` harshad shirwadkar
2020-10-09 18:28   ` Theodore Y. Ts'o
2020-10-13  0:27     ` harshad shirwadkar
2020-09-19  0:54 ` [PATCH v9 2/9] ext4: add fast_commit feature and handling for extended mount options Harshad Shirwadkar
2020-10-09 17:58   ` Theodore Y. Ts'o
2020-10-13  0:27     ` harshad shirwadkar
2020-09-19  0:54 ` [PATCH v9 3/9] ext4 / jbd2: add fast commit initialization Harshad Shirwadkar
2020-09-19 15:22   ` kernel test robot
2020-09-19 15:22     ` kernel test robot
2020-10-09 16:10   ` Ritesh Harjani
2020-10-13  0:28     ` harshad shirwadkar
2020-09-19  0:54 ` [PATCH v9 4/9] jbd2: add fast commit machinery Harshad Shirwadkar
2020-10-09 16:16   ` Ritesh Harjani
2020-10-13  0:27     ` harshad shirwadkar
2020-09-19  0:54 ` [PATCH v9 5/9] ext4: main fast-commit commit path Harshad Shirwadkar
2020-09-19  4:03   ` kernel test robot
2020-09-19  8:19   ` kernel test robot
2020-09-19  8:19     ` kernel test robot
2020-09-20  1:34   ` kernel test robot
2020-10-09 17:04   ` Ritesh Harjani
2020-10-13  0:25     ` harshad shirwadkar
2020-10-09 19:14   ` Theodore Y. Ts'o
2020-10-13  0:27     ` harshad shirwadkar
2020-09-19  0:54 ` [PATCH v9 6/9] jbd2: fast commit recovery path Harshad Shirwadkar
2020-09-19 11:27   ` kernel test robot
2020-09-19  0:54 ` [PATCH v9 7/9] ext4: " Harshad Shirwadkar
2020-09-19 14:15   ` kernel test robot
2020-09-19 14:15     ` kernel test robot
2020-10-09 17:14   ` Ritesh Harjani [this message]
2020-10-13  0:27     ` harshad shirwadkar
2020-09-19  0:54 ` [PATCH v9 8/9] ext4: add a mount opt to forcefully turn fast commits on Harshad Shirwadkar
2020-09-19  0:54 ` [PATCH v9 9/9] ext4: add fast commit stats in procfs Harshad Shirwadkar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=dcf720bd-8644-3001-75b4-d845a2495f72@linux.ibm.com \
    --to=riteshh@linux.ibm.com \
    --cc=harshadshirwadkar@gmail.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.