All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lukas Czerner <lczerner@redhat.com>
To: Wang Shilong <wangshilong1991@gmail.com>
Cc: linux-ext4@vger.kernel.org, Wang Shilong <wshilong@ddn.com>,
	Shuichi Ihara <sihara@ddn.com>,
	Andreas Dilger <adilger@dilger.ca>
Subject: Re: [PATCH] ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim
Date: Wed, 27 May 2020 11:19:38 +0200	[thread overview]
Message-ID: <20200527091938.647363ekmnz7av7y@work> (raw)
In-Reply-To: <1590565130-23773-1-git-send-email-wangshilong1991@gmail.com>

On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
> From: Wang Shilong <wshilong@ddn.com>
> 
> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> remounted, fstrim need walk all block groups again, the problem with
> this is FSTRIM could be slow on very large LUN SSD based filesystem.
> 
> To avoid this kind of problem, we introduce a block group flag
> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> extra one block group dirty write after trimming block group.

Hi

that's fair enough, however once you make this persistent we also need
to have a way to clear the flag, or at least bypass it. Storage can be
changed and if it does we might want to re-run the fstrim.

We also need to set this flag in mke2fs and e2fsck if appropriate.

more below...

> 
> And When clearing TRIMMED flag, block group will be journalled
> anyway, so it won't introduce any overhead.
> 
> Cc: Shuichi Ihara <sihara@ddn.com>
> Cc: Andreas Dilger <adilger@dilger.ca>
> Cc: Wang Shilong <wangshilong1991@gmail.com>
> Signed-off-by: Wang Shilong <wshilong@ddn.com>
> ---
>  fs/ext4/ext4.h      | 18 +++++++--------
>  fs/ext4/ext4_jbd2.h |  3 ++-
>  fs/ext4/mballoc.c   | 54 ++++++++++++++++++++++++++++++++++-----------
>  3 files changed, 52 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index ad2dbf6e4924..23c2dc529a28 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -357,6 +357,7 @@ struct flex_groups {
>  #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
>  #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
>  #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
> +#define EXT4_BG_WAS_TRIMMED	0x0008 /* block group was trimmed */
>  
>  /*
>   * Macro-instructions used to manage group descriptors
> @@ -3112,9 +3113,8 @@ struct ext4_group_info {
>  };
>  
>  #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
> -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
> -#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
> -#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
> +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	1
> +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	2
>  #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
>  	(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
>  #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
> @@ -3127,12 +3127,12 @@ struct ext4_group_info {
>  #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
>  	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
>  
> -#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
> -	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
> -	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
> -	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GDP_WAS_TRIMMED(gdp)	\
> +	(gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> +#define EXT4_MB_GDP_SET_TRIMMED(gdp)	\
> +	(gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> +#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)	\
> +	(gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
>  
>  #define EXT4_MAX_CONTENTION		8
>  #define EXT4_CONTENTION_THRESHOLD	2
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 4b9002f0e84c..4094a5b247f7 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -123,7 +123,8 @@
>  #define EXT4_HT_MOVE_EXTENTS     9
>  #define EXT4_HT_XATTR           10
>  #define EXT4_HT_EXT_CONVERT     11
> -#define EXT4_HT_MAX             12
> +#define EXT4_HT_FS_TRIM		12
> +#define EXT4_HT_MAX             13
>  
>  /**
>   *   struct ext4_journal_cb_entry - Base structure for callback information.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 30d5d97548c4..d25377948994 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2829,15 +2829,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
>  	rb_erase(&entry->efd_node, &(db->bb_free_root));
>  	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
>  
> -	/*
> -	 * Clear the trimmed flag for the group so that the next
> -	 * ext4_trim_fs can trim it.
> -	 * If the volume is mounted with -o discard, online discard
> -	 * is supported and the free blocks will be trimmed online.
> -	 */
> -	if (!test_opt(sb, DISCARD))
> -		EXT4_MB_GRP_CLEAR_TRIMMED(db);
> -
>  	if (!db->bb_free_root.rb_node) {
>  		/* No more items in the per group rb tree
>  		 * balance refcounts from ext4_mb_free_metadata()
> @@ -4928,8 +4919,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>  					 " group:%d block:%d count:%lu failed"
>  					 " with %d", block_group, bit, count,
>  					 err);
> -		} else
> -			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
> +		}
>  
>  		ext4_lock_group(sb, block_group);
>  		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
> @@ -4939,6 +4929,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>  	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
>  	ext4_free_group_clusters_set(sb, gdp, ret);
>  	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
> +	/*
> +	 * Clear the trimmed flag for the group so that the next
> +	 * ext4_trim_fs can trim it.
> +	 * If the volume is mounted with -o discard, online discard
> +	 * is supported and the free blocks will be trimmed online.
> +	 */
> +	if (!test_opt(sb, DISCARD))
> +		EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
>  	ext4_group_desc_csum_set(sb, block_group, gdp);
>  	ext4_unlock_group(sb, block_group);
>  
> @@ -5192,8 +5190,15 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
>  	ext4_grpblk_t next, count = 0, free_count = 0;
>  	struct ext4_buddy e4b;
>  	int ret = 0;
> +	struct ext4_group_desc *gdp;
> +	struct buffer_head *gdp_bh;
>  
>  	trace_ext4_trim_all_free(sb, group, start, max);
> +	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
> +	if (!gdp) {
> +		ret = -EIO;
> +		return ret;
> +	}
>  
>  	ret = ext4_mb_load_buddy(sb, group, &e4b);
>  	if (ret) {
> @@ -5204,7 +5209,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
>  	bitmap = e4b.bd_bitmap;
>  
>  	ext4_lock_group(sb, group);
> -	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
> +	if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
>  	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
>  		goto out;
>  
> @@ -5245,12 +5250,35 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
>  
>  	if (!ret) {
>  		ret = count;
> -		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> +		EXT4_MB_GDP_SET_TRIMMED(gdp);
> +		ext4_group_desc_csum_set(sb, group, gdp);
>  	}
>  out:
>  	ext4_unlock_group(sb, group);
>  	ext4_mb_unload_buddy(&e4b);
> +	if (ret > 0) {
> +		int err;
> +		handle_t *handle;
>  
> +		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
> +		if (IS_ERR(handle)) {
> +			ret = PTR_ERR(handle);
> +			goto out_return;
> +		}
> +		err = ext4_journal_get_write_access(handle, gdp_bh);
> +		if (err) {
> +			ret = err;
> +			goto out_journal;
> +		}

Don't we need to do this before we set the flag in gdp?

-Lukas

> +		err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
> +		if (err)
> +			ret = err;
> +out_journal:
> +		err = ext4_journal_stop(handle);
> +		if (err)
> +			ret = err;
> +	}
> +out_return:
>  	ext4_debug("trimmed %d blocks in the group %d\n",
>  		count, group);
>  
> -- 
> 2.25.4
> 


  reply	other threads:[~2020-05-27  9:19 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-27  7:38 [PATCH] ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim Wang Shilong
2020-05-27  9:19 ` Lukas Czerner [this message]
2020-05-27  9:32   ` Reindl Harald
2020-05-27  9:57     ` Lukas Czerner
2020-05-27 10:11       ` Reindl Harald
2020-05-27 10:32         ` Lukas Czerner
2020-05-27 10:56           ` Reindl Harald
2020-05-27 19:11             ` Andreas Dilger
2020-05-27 10:06   ` Wang Shilong
2020-05-27 10:21     ` Lukas Czerner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200527091938.647363ekmnz7av7y@work \
    --to=lczerner@redhat.com \
    --cc=adilger@dilger.ca \
    --cc=linux-ext4@vger.kernel.org \
    --cc=sihara@ddn.com \
    --cc=wangshilong1991@gmail.com \
    --cc=wshilong@ddn.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.