From: Yue Hu <zbestahu@gmail.com>
To: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: linux-erofs@lists.ozlabs.org, Chao Yu <chao@kernel.org>,
	Yue Hu <huyue2@yulong.com>, LKML <linux-kernel@vger.kernel.org>,
	geshifei@coolpad.com, zhangwen@coolpad.com,
	shaojunjun@coolpad.com
Subject: Re: [PATCH v3 4/5] erofs: support inline data decompression
Date: Mon, 27 Dec 2021 16:18:19 +0800
Message-ID: <20211227161819.00000148.zbestahu@gmail.com>
In-Reply-To: <20211225070626.74080-5-hsiangkao@linux.alibaba.com>

Hi Xiang,

On Sat, 25 Dec 2021 15:06:25 +0800
Gao Xiang <hsiangkao@linux.alibaba.com> wrote:

> From: Yue Hu <huyue2@yulong.com>
> 
> Currently, we already support tail-packing inline for uncompressed
> files; let's also implement it for compressed files to save I/Os and
> storage space.
> 
> Unlike normal pclusters, their compressed data is available in
> advance because of other metadata I/Os, so such pclusters move
> directly into the bypass queue without extra I/O submission.
> 
> It's the last compression feature before folio/subpage support.
> 
> Signed-off-by: Yue Hu <huyue2@yulong.com>
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
>  fs/erofs/zdata.c | 128 ++++++++++++++++++++++++++++++++---------------
>  fs/erofs/zdata.h |  24 ++++++++-
>  2 files changed, 109 insertions(+), 43 deletions(-)
> 
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index bc765d8a6dc2..e6ef02634e08 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -82,12 +82,13 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
>  
>  static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
>  {
> +	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
>  	int i;
>  
>  	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
>  		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
>  
> -		if (pcl->pclusterpages > pcs->maxpages)
> +		if (pclusterpages > pcs->maxpages)
>  			continue;
>  
>  		kmem_cache_free(pcs->slab, pcl);
> @@ -298,6 +299,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
>  		container_of(grp, struct z_erofs_pcluster, obj);
>  	int i;
>  
> +	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
>  	/*
>  	 * refcount of workgroup is now freezed as 1,
>  	 * therefore no need to worry about available decompression users.
> @@ -331,6 +333,7 @@ int erofs_try_to_free_cached_page(struct page *page)
>  	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
>  		unsigned int i;
>  
> +		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
>  		for (i = 0; i < pcl->pclusterpages; ++i) {
>  			if (pcl->compressed_pages[i] == page) {
>  				WRITE_ONCE(pcl->compressed_pages[i], NULL);
> @@ -458,6 +461,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
>  				       struct inode *inode,
>  				       struct erofs_map_blocks *map)
>  {
> +	bool ztailpacking = map->m_flags & EROFS_MAP_META;
>  	struct z_erofs_pcluster *pcl;
>  	struct z_erofs_collection *cl;
>  	struct erofs_workgroup *grp;
> @@ -469,12 +473,12 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
>  	}
>  
>  	/* no available pcluster, let's allocate one */
> -	pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
> +	pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
> +				     map->m_plen >> PAGE_SHIFT);
>  	if (IS_ERR(pcl))
>  		return PTR_ERR(pcl);
>  
>  	atomic_set(&pcl->obj.refcount, 1);
> -	pcl->obj.index = map->m_pa >> PAGE_SHIFT;
>  	pcl->algorithmformat = map->m_algorithmformat;
>  	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
>  		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
> @@ -494,16 +498,25 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
>  	mutex_init(&cl->lock);
>  	DBG_BUGON(!mutex_trylock(&cl->lock));
>  
> -	grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
> -	if (IS_ERR(grp)) {
> -		err = PTR_ERR(grp);
> -		goto err_out;
> -	}
> +	if (ztailpacking) {
> +		pcl->obj.index = 0;	/* which indicates ztailpacking */
> +		pcl->pageofs_in = erofs_blkoff(map->m_pa);
> +		pcl->tailpacking_size = map->m_plen;
> +	} else {
> +		pcl->obj.index = map->m_pa >> PAGE_SHIFT;
>  
> -	if (grp != &pcl->obj) {
> -		clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
> -		err = -EEXIST;
> -		goto err_out;
> +		grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
> +		if (IS_ERR(grp)) {
> +			err = PTR_ERR(grp);
> +			goto err_out;
> +		}
> +
> +		if (grp != &pcl->obj) {
> +			clt->pcl = container_of(grp,
> +					struct z_erofs_pcluster, obj);
> +			err = -EEXIST;
> +			goto err_out;
> +		}
>  	}
>  	/* used to check tail merging loop due to corrupted images */
>  	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
> @@ -532,17 +545,20 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
>  	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
>  	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
>  
> -	if (!PAGE_ALIGNED(map->m_pa)) {
> -		DBG_BUGON(1);
> -		return -EINVAL;
> +	if (map->m_flags & EROFS_MAP_META) {
> +		if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
> +			DBG_BUGON(1);
> +			return -EFSCORRUPTED;
> +		}
> +		goto tailpacking;
>  	}
>  
>  	grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
>  	if (grp) {
>  		clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
>  	} else {
> +tailpacking:
>  		ret = z_erofs_register_collection(clt, inode, map);
> -
>  		if (!ret)
>  			goto out;
>  		if (ret != -EEXIST)
> @@ -558,9 +574,9 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
>  out:
>  	z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
>  				  clt->cl->pagevec, clt->cl->vcnt);
> -
>  	/* since file-backed online pages are traversed in reverse order */
> -	clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
> +	clt->icpage_ptr = clt->pcl->compressed_pages +
> +			z_erofs_pclusterpages(clt->pcl);
>  	return 0;
>  }
>  
> @@ -687,8 +703,26 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
>  	else
>  		cache_strategy = DONTALLOC;
>  
> -	preload_compressed_pages(clt, MNGD_MAPPING(sbi),
> -				 cache_strategy, pagepool);
> +	if (z_erofs_is_inline_pcluster(clt->pcl)) {

The current cache_strategy is only consumed by preload_compressed_pages(), so it should be
needless for the inline branch; could its selection move into the else path?
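
Roughly like this (an untested sketch; I'm assuming the existing
should_alloc_managed_pages() condition stays as-is and only moves into
the else branch):

	if (z_erofs_is_inline_pcluster(clt->pcl)) {
		/* ... meta page handling as in this patch ... */
	} else {
		enum z_erofs_cache_alloctype cache_strategy;

		/* decide the strategy only where it's actually consumed */
		if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
					       map->m_la))
			cache_strategy = TRYALLOC;
		else
			cache_strategy = DONTALLOC;

		preload_compressed_pages(clt, MNGD_MAPPING(sbi),
					 cache_strategy, pagepool);
	}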

> +		struct page *mpage;
> +
> +		mpage = erofs_get_meta_page(inode->i_sb,
> +					    erofs_blknr(map->m_pa));

Could we just use map->mpage directly when it already holds the block we want (which is
the common case in my testing), and fall back to erofs_get_meta_page() only otherwise?
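
Something like the following (an untested sketch; the extra reference
taken on the reuse path is my assumption and would need checking):

	struct page *mpage = map->mpage;

	if (mpage && mpage->index == erofs_blknr(map->m_pa)) {
		/* the map lookup already read this meta block, reuse it */
		get_page(mpage);
	} else {
		mpage = erofs_get_meta_page(inode->i_sb,
					    erofs_blknr(map->m_pa));
		if (IS_ERR(mpage)) {
			err = PTR_ERR(mpage);
			goto err_out;
		}
		/* TODO: new subpage feature will get rid of it */
		unlock_page(mpage);
	}
	WRITE_ONCE(clt->pcl->compressed_pages[0], mpage);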

> +		if (IS_ERR(mpage)) {
> +			err = PTR_ERR(mpage);
> +			erofs_err(inode->i_sb,
> +				  "failed to get inline page, err %d", err);
> +			goto err_out;
> +		}
> +		/* TODO: new subpage feature will get rid of it */
> +		unlock_page(mpage);
> +
> +		WRITE_ONCE(clt->pcl->compressed_pages[0], mpage);
> +		clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
> +	} else {
> +		preload_compressed_pages(clt, MNGD_MAPPING(sbi),
> +					 cache_strategy, pagepool);
> +	}
>  
>  hitted:
>  	/*
> @@ -844,6 +878,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
>  				       struct page **pagepool)
>  {
>  	struct erofs_sb_info *const sbi = EROFS_SB(sb);
> +	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
>  	struct z_erofs_pagevec_ctor ctor;
>  	unsigned int i, inputsize, outputsize, llen, nr_pages;
>  	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
> @@ -925,16 +960,15 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
>  	overlapped = false;
>  	compressed_pages = pcl->compressed_pages;
>  
> -	for (i = 0; i < pcl->pclusterpages; ++i) {
> +	for (i = 0; i < pclusterpages; ++i) {
>  		unsigned int pagenr;
>  
>  		page = compressed_pages[i];
> -
>  		/* all compressed pages ought to be valid */
>  		DBG_BUGON(!page);
> -		DBG_BUGON(z_erofs_page_is_invalidated(page));
>  
> -		if (!z_erofs_is_shortlived_page(page)) {
> +		if (!z_erofs_is_inline_pcluster(pcl) &&

For the non-inline case with big pclusters, this now repeats the inline check on every
loop iteration. Also, as I understand it, the behavior of a ztailpacking page differs
from that of a normal page, so wouldn't it be better to branch them, i.e. move the
inline check outside the for loop?
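
For instance (just a sketch of the shape I mean; the non-inline loop
body is the existing one, abbreviated here):

	if (z_erofs_is_inline_pcluster(pcl)) {
		page = compressed_pages[0];
		/* the single inline page ought to be valid */
		DBG_BUGON(!page);
		if (!PageUptodate(page))
			err = -EIO;
	} else {
		for (i = 0; i < pclusterpages; ++i) {
			page = compressed_pages[i];
			DBG_BUGON(!page);
			/* ... existing shortlived/managed/inplace checks ... */
		}
	}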

> +		    !z_erofs_is_shortlived_page(page)) {
>  			if (erofs_page_is_managed(sbi, page)) {
>  				if (!PageUptodate(page))
>  					err = -EIO;
> @@ -960,10 +994,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
>  		}
>  
>  		/* PG_error needs checking for all non-managed pages */
> -		if (PageError(page)) {
> -			DBG_BUGON(PageUptodate(page));
> +		if (!PageUptodate(page))
>  			err = -EIO;
> -		}
>  	}
>  
>  	if (err)
> @@ -978,11 +1010,16 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
>  		partial = true;
>  	}
>  
> -	inputsize = pcl->pclusterpages * PAGE_SIZE;
> +	if (z_erofs_is_inline_pcluster(pcl))
> +		inputsize = pcl->tailpacking_size;
> +	else
> +		inputsize = pclusterpages * PAGE_SIZE;
> +
>  	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
>  					.sb = sb,
>  					.in = compressed_pages,
>  					.out = pages,
> +					.pageofs_in = pcl->pageofs_in,
>  					.pageofs_out = cl->pageofs,
>  					.inputsize = inputsize,
>  					.outputsize = outputsize,
> @@ -992,17 +1029,22 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
>  				 }, pagepool);
>  
>  out:
> -	/* must handle all compressed pages before ending pages */
> -	for (i = 0; i < pcl->pclusterpages; ++i) {
> -		page = compressed_pages[i];
> -
> -		if (erofs_page_is_managed(sbi, page))
> -			continue;
> +	/* must handle all compressed pages before actual file pages */
> +	if (z_erofs_is_inline_pcluster(pcl)) {
> +		page = compressed_pages[0];
> +		WRITE_ONCE(compressed_pages[0], NULL);
> +		put_page(page);
> +	} else {
> +		for (i = 0; i < pclusterpages; ++i) {
> +			page = compressed_pages[i];
>  
> -		/* recycle all individual short-lived pages */
> -		(void)z_erofs_put_shortlivedpage(pagepool, page);
> +			if (erofs_page_is_managed(sbi, page))
> +				continue;
>  
> -		WRITE_ONCE(compressed_pages[i], NULL);
> +			/* recycle all individual short-lived pages */
> +			(void)z_erofs_put_shortlivedpage(pagepool, page);
> +			WRITE_ONCE(compressed_pages[i], NULL);
> +		}
>  	}
>  
>  	for (i = 0; i < nr_pages; ++i) {
> @@ -1288,6 +1330,14 @@ static void z_erofs_submit_queue(struct super_block *sb,
>  
>  		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
>  
> +		/* close the main owned chain at first */
> +		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
> +				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
> +		if (z_erofs_is_inline_pcluster(pcl)) {
> +			move_to_bypass_jobqueue(pcl, qtail, owned_head);
> +			continue;
> +		}
> +
>  		/* no device id here, thus it will always succeed */
>  		mdev = (struct erofs_map_dev) {
>  			.m_pa = blknr_to_addr(pcl->obj.index),
> @@ -1297,10 +1347,6 @@ static void z_erofs_submit_queue(struct super_block *sb,
>  		cur = erofs_blknr(mdev.m_pa);
>  		end = cur + pcl->pclusterpages;
>  
> -		/* close the main owned chain at first */
> -		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
> -				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
> -
>  		do {
>  			struct page *page;
>  
> diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
> index 4a69515dea75..e043216b545f 100644
> --- a/fs/erofs/zdata.h
> +++ b/fs/erofs/zdata.h
> @@ -62,8 +62,16 @@ struct z_erofs_pcluster {
>  	/* A: lower limit of decompressed length and if full length or not */
>  	unsigned int length;
>  
> -	/* I: physical cluster size in pages */
> -	unsigned short pclusterpages;
> +	/* I: page offset of inline compressed data */
> +	unsigned short pageofs_in;
> +
> +	union {
> +		/* I: physical cluster size in pages */
> +		unsigned short pclusterpages;
> +
> +		/* I: tailpacking inline compressed size */
> +		unsigned short tailpacking_size;
> +	};
>  
>  	/* I: compression algorithm format */
>  	unsigned char algorithmformat;
> @@ -94,6 +102,18 @@ struct z_erofs_decompressqueue {
>  	} u;
>  };
>  
> +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
> +{
> +	return !pcl->obj.index;
> +}
> +
> +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
> +{
> +	if (z_erofs_is_inline_pcluster(pcl))
> +		return 1;
> +	return pcl->pclusterpages;
> +}
> +
>  #define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
>  #define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
>  #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)

