* [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem
From: Gao Xiang @ 2018-06-27 14:20 UTC


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---

The patch is temporarily based on
[RFC PATCH RESEND 12/12] erofs: introduce VLE decompression support (experimental)

STILL BUGGY, NOT FOR DAILY USE!
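
A note for reviewers on the pointer-tagging trick this series leans on:
pagevec entries, the per-work `next' chain and the workgroup radix tree
slots all reuse the low bits of an aligned pointer to carry a small
type / "cached" tag. The stand-alone sketch below only illustrates the
basic encode/decode idea (simplified names, userspace C, illustration
only -- not code from this patch):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TAG_MASK	3UL	/* pointers here are at least 4-byte aligned */

	/* fold a small tag into the unused low bits of an aligned pointer */
	static inline uintptr_t tag_ptr(void *ptr, unsigned int tag)
	{
		uintptr_t v = (uintptr_t)ptr;

		assert(!(v & TAG_MASK) && tag <= TAG_MASK);
		return v | tag;
	}

	/* split a tagged value back into the original pointer and its tag */
	static inline void *untag_ptr(uintptr_t v, unsigned int *tag)
	{
		*tag = v & TAG_MASK;
		return (void *)(v & ~TAG_MASK);
	}

	int main(void)
	{
		int obj = 42;
		unsigned int tag;
		uintptr_t v = tag_ptr(&obj, 2);
		int *p = untag_ptr(v, &tag);

		printf("tag=%u value=%d\n", tag, *p);
		return 0;
	}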

 fs/erofs/Makefile        |    7 +-
 fs/erofs/data.c          |  189 +------
 fs/erofs/inode.c         |    2 +-
 fs/erofs/internal.h      |   54 ++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |    8 +
 fs/erofs/unzip_vle.c     | 1261 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  236 +++++++--
 fs/erofs/unzip_vle_lz4.c |  145 ++++++
 fs/erofs/utils.c         |   31 ++
 10 files changed, 1742 insertions(+), 233 deletions(-)
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 3086d08..6622e68 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,9 +1,8 @@
 EROFS_VERSION = "1.0"
 
-EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\" -DCONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=1
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip.o unzip_generic.o unzip_lz4.o
-
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c54495d..4817e16 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -43,33 +43,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -92,7 +65,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -233,6 +206,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -250,6 +225,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -257,8 +235,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -267,7 +244,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -291,7 +268,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -414,155 +391,7 @@ struct inode *erofs_init_page_bundle(struct super_block *sb)
 	return inode;
 }
 
-/*
- * Our zip(compression) subsystem wants to get the page bundle
- * in the non-blocking way. In that case, we could dynamically add
- * filemap pages to a zipped pack on-the-fly before decompressing.
- *
- * Different from buffer head (fs/buffer.c) using a private_lock
- * which is slightly slow in the high-concurrency scenarios,
- * we introduce a bit_spinlock to serialize and close all races.
- */
-struct page *erofs_grab_bundle_page(struct super_block *sb,
-	pgoff_t index, bool *created, struct list_head *page_pool)
-{
-	struct address_space *const mapping =
-		EROFS_SB(sb)->ibundle->i_mapping;
-	/* page, alternate page (if page is not exist in the mapping) */
-	struct page *page, *alt = NULL;
-
-	/* currectly, the fail path is still unimplemented */
-	const gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NOFAIL;
-
-	/* first, we try to find a unlock page */
-	*created = false;
-
-	/*
-	 * In order to reduce the memory pressure, we don't mark
-	 * the page accessed again.
-	 */
-	page = find_get_page(mapping, index);
-
-	if (page != NULL)
-		return page;
-
-	/* then, get a new free page if not found */
-	if (!list_empty(page_pool)) {
-		alt = list_last_entry(page_pool, struct page, lru);
-		list_del(&alt->lru);
-	} else {
-		alt = __page_cache_alloc(gfp);
-		DBG_BUGON(alt == NULL);
-	}
-
-	prefetchw(&alt->flags);
-	/* clean page private for the later page bundle use */
-	set_page_private(alt, 0);
-
-	do {
-		int err = add_to_page_cache_lru(alt, mapping, index, gfp);
-		if (!err) {
-			*created = true;
-			return alt;
-		} else if (err != -EEXIST) {
-			/* Presumably ENOMEM for radix tree node */
-			page = ERR_PTR(err);
-			break;
-		}
-		page = find_get_page(mapping, index);
-	} while(page == NULL);
-
-	/* put the unused alternate page back to the free pool */
-	list_add(&alt->lru, page_pool);
-	return page;
-}
-
-void erofs_add_to_page_bundle(struct erofs_page_bundle *bundle,
-	unsigned nr, struct page *page)
-{
-	struct erofs_page_bundle *b = erofs_lock_page_private(page);
-
-	if (has_page_bundle(page))
-		goto exist;
-
-	page_cache_get(page);
-	if (test_set_page_bundle(page)) {
-		page_cache_release(page);
-exist:
-		BUG_ON(bundle != b);
-		lockref_get(&b->lockref);
-		goto out;
-	}
-
-	spin_lock(&bundle->lockref.lock);
-	BUG_ON(b != NULL);
-	BUG_ON(bundle->lockref.count <= 0);
-	BUG_ON(bundle->pages[nr] != NULL);
-
-	++bundle->lockref.count;
-	bundle->pages[nr] = page;
-	spin_unlock(&bundle->lockref.lock);
-out:
-	erofs_set_page_private(page, bundle);
-	erofs_unlock_page_private(page);
-}
-
-struct erofs_page_bundle *erofs_get_page_bundle(struct page *page,
-	unsigned nr, erofs_page_bundle_ctor_t ctor)
-{
-	struct erofs_page_bundle *b = erofs_lock_page_private(page);
-
-	if (!has_page_bundle(page))
-		ctor(page, nr);
-	else {
-		DBG_BUGON(b == NULL);
-		DBG_BUGON(b->pages[nr] != page);
-
-		lockref_get(&b->lockref);
-	}
-	erofs_unlock_page_private(page);
-
-	/* page private must be available now */
-	return erofs_page_private(page);
-}
-
-extern int erofs_try_to_free_vle_zipped_page(struct page *page);
-
-static int page_bundle_releasepage(struct page *page, gfp_t gfp_mask)
-{
-	int ret = 1;	/* 0 - busy */
-	struct address_space *const mapping = page->mapping;
-
-	BUG_ON(!PageLocked(page));
-	BUG_ON(mapping->a_ops != &erofs_page_bundle_aops);
-
-	if (has_page_bundle(page)) {
-		debugln("%s, page: %p", __func__, page);
-
-		/* currently we have the only user */
-		ret = erofs_try_to_free_vle_zipped_page(page);
-	}
-	return ret;
-}
-
-static void page_bundle_invalidatepage(struct page *page,
-                                       unsigned int offset,
-                                       unsigned int length)
-{
-	const unsigned int stop = length + offset;
-
-	BUG_ON(!PageLocked(page));
-	/* Check for overflow */
-	BUG_ON(stop > PAGE_SIZE || stop < length);
-
-	if (offset == 0 && stop == PAGE_SIZE)
-		while(!page_bundle_releasepage(page, GFP_NOFS))
-			cond_resched();
-}
-
 const struct address_space_operations erofs_page_bundle_aops = {
-	.releasepage = page_bundle_releasepage,
-	.invalidatepage = page_bundle_invalidatepage,
 };
 
 #endif
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 61010c0..12f2e1c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -181,7 +181,7 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
 #ifdef CONFIG_EROFS_FS_ZIP
 		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
 #else
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 307f435..726636e 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -64,8 +64,20 @@ struct erofs_sb_info {
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
 #ifdef CONFIG_EROFS_FS_ZIP
+
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* dedicated workspace for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} zwrksp;
+
 #endif
 
 	u32 build_time_nsec;
@@ -94,6 +106,16 @@ struct erofs_sb_info {
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define z_erofs_workspace_lock(sbi) spin_lock(&(sbi)->zwrksp.lock)
+#define z_erofs_workspace_unlock(sbi) spin_unlock(&(sbi)->zwrksp.lock)
+#else
+#define z_erofs_workspace_lock(sbi) xa_lock(&(sbi)->zwrksp.tree)
+#define z_erofs_workspace_unlock(sbi) xa_unlock(&(sbi)->zwrksp.tree)
+#endif
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
@@ -247,6 +269,35 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+
+static inline struct bio *prepare_bio(struct super_block *sb,
+				      erofs_blk_t blkaddr,
+				      unsigned nr_pages, bio_end_io_t endio)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
+
+	BUG_ON(bio == NULL);
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
@@ -409,5 +460,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,45 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index c46d1c6..3de0631 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -113,6 +113,10 @@ static int superblock_read(struct super_block *sb)
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
 #ifdef CONFIG_EROFS_FS_ZIP
 	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
 #endif
 
 	sbi->root_nid = le64_to_cpu(layout->root_nid);
@@ -195,6 +199,10 @@ static int erofs_read_super(struct super_block *sb,
 		goto err_sbi;
 	}
 #endif
+	INIT_RADIX_TREE(&sbi->zwrksp.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->zwrksp.lock);
+#endif
 
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..5fa10db
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,1261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+/* -- zip subsystem overall -- */
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+/* -- pagevec implementation -- */
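+/*
+ * A collector queues the file pages attached to a work.  The first
+ * Z_EROFS_VLE_INLINE_PAGEVECS entries live inline in the work itself;
+ * once they are used up, one of the queued exclusive pages is borrowed
+ * ("pagedown") to hold the following entries.
+ */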
+struct z_erofs_pagevec_collector {
+	struct page *curr, *next;
+	uintptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void
+z_erofs_pagevec_collector_exit(
+	struct z_erofs_pagevec_collector *collector,
+	bool atomic)
+{
+	if (collector->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(collector->pages);
+	else
+		kunmap(collector->curr);
+}
+
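+/*
+ * struct page pointers are at least 4-byte aligned, so the low 2 bits
+ * of each pagevec entry are reused to carry the page type below.
+ */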
+union z_erofs_page_converter {
+	struct page *page;
+	uintptr_t v;
+};
+
+enum z_erofs_vle_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_VLE_PAGE_TYPE_EXCLUSIVE,
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MASK
+};
+
+static inline struct page *
+z_erofs_pagevec_collector_next_page(
+	struct z_erofs_pagevec_collector *collector, unsigned nr)
+{
+	struct page *next = collector->next;
+
+	/* keep away from occupied pages */
+	if (next == NULL) {
+		unsigned index;
+
+		for(index = 0; index < nr; ++index) {
+			union z_erofs_page_converter cvt =
+				{.v = collector->pages[index]};
+
+			if (!(cvt.v & Z_EROFS_VLE_PAGE_TYPE_MASK)) {
+				cvt.v &= ~Z_EROFS_VLE_PAGE_TYPE_MASK;
+				next = cvt.page;
+				break;
+			}
+		}
+	}
+	return next;
+}
+
+static inline void
+z_erofs_pagevec_collector_pagedown(
+	struct z_erofs_pagevec_collector *ctor,
+	bool atomic)
+{
+	struct page *next;
+
+	next = z_erofs_pagevec_collector_next_page(ctor, ctor->nr);
+	z_erofs_pagevec_collector_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_collector_init(
+	struct z_erofs_pagevec_collector *ctor,
+	uintptr_t *pages, unsigned i)
+{
+	const unsigned inline_nr = Z_EROFS_VLE_INLINE_PAGEVECS;
+
+	ctor->nr = inline_nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= inline_nr) {
+		i -= inline_nr;
+		z_erofs_pagevec_collector_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_collector_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_collector_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool z_erofs_pagevec_collector_enqueue(
+	struct z_erofs_pagevec_collector *collector,
+	struct page *page,
+	enum z_erofs_vle_page_type type,
+	bool *occupied)
+{
+	union z_erofs_page_converter cvt;
+
+	*occupied = false;
+	if (unlikely(collector->next == NULL && type))
+		if (collector->index + 1 == collector->nr)
+			return false;
+
+	if (unlikely(collector->index >= collector->nr))
+		z_erofs_pagevec_collector_pagedown(collector, false);
+
+	/* note that collector->next never equals 1 or 2 */
+	if (type == (uintptr_t)collector->next) {
+		collector->next = page;
+		*occupied = true;
+	}
+
+	cvt.page = page;
+	BUG_ON(cvt.v & Z_EROFS_VLE_PAGE_TYPE_MASK);
+	collector->pages[collector->index++] = cvt.v | type;
+	return true;
+}
+
+static inline struct page *z_erofs_pagevec_collector_dequeue(
+	struct z_erofs_pagevec_collector *collector,
+	enum z_erofs_vle_page_type *type)
+{
+	union z_erofs_page_converter cvt;
+
+	if (unlikely(collector->index >= collector->nr)) {
+		BUG_ON(collector->next == NULL);
+		z_erofs_pagevec_collector_pagedown(collector, true);
+	}
+	cvt.v = collector->pages[collector->index];
+
+	*type = cvt.v & Z_EROFS_VLE_PAGE_TYPE_MASK;
+	cvt.v &= ~Z_EROFS_VLE_PAGE_TYPE_MASK;
+
+	if (collector->next == NULL)
+		collector->next = cvt.page;
+
+	collector->pages[collector->index++] = (uintptr_t)NULL;
+	return cvt.page;
+}
+
+struct z_erofs_vle_work_pageldr {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_collector vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_pageldr *l,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (l->compressed_deficit) {
+		--l->compressed_deficit;
+		if (cmpxchg(l->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_pageldr *l,
+	struct page *page,
+	enum z_erofs_vle_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (type == Z_EROFS_VLE_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(l, page))
+		return 0;
+
+	ret = z_erofs_pagevec_collector_enqueue(&l->vector,
+		page, type, &occupied);
+	l->curr->vcnt += (unsigned)ret;
+	return ret ? 0 : -EAGAIN;
+}
+
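+/*
+ * look up a workgroup by its cluster index; an exceptional radix tree
+ * entry marks a cached workgroup, which is reported via *cached after
+ * the exceptional bit has been stripped.
+ */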
+static struct z_erofs_vle_workgroup *
+z_erofs_vle_workgroup_find(struct super_block *sb,
+			   pgoff_t index,
+			   bool *cached)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+		void *ptr;
+	} u;
+
+repeat:
+	rcu_read_lock();
+	u.ptr = radix_tree_lookup(&sbi->zwrksp.tree, index);
+	if (u.ptr != NULL) {
+		*cached = radix_tree_exceptional_entry(u.ptr);
+		u.v &= ~RADIX_TREE_EXCEPTIONAL_ENTRY;
+
+		if (z_erofs_vle_workgroup_get(u.grp)) {
+			rcu_read_unlock();
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+	return u.grp;
+}
+
+static int z_erofs_vle_workgroup_register(struct super_block *sb,
+					  struct z_erofs_vle_workgroup *grp,
+					  bool cached)
+{
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+	} u;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	z_erofs_workspace_lock(sbi);
+	u.grp = grp;
+	u.v |= (unsigned)cached << RADIX_TREE_EXCEPTIONAL_SHIFT;
+
+	err = radix_tree_insert(&sbi->zwrksp.tree, grp->index, u.grp);
+	if (!err)
+		__z_erofs_vle_workgroup_get(grp);
+
+	z_erofs_workspace_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_pageldr *l,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       uintptr_t *chained_page)
+{
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	struct z_erofs_vle_workgroup *grp;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+	int err;
+
+	BUG_ON(l->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained page */
+	BUG_ON(*chained_page == (uintptr_t)NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	grp = z_erofs_vle_workgroup_find(sb, index, &cached);
+	if (grp != NULL) {
+		BUG_ON(index != grp->index);
+
+		if (!cached) {
+			work = z_erofs_vle_work_uncached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			l->compressed_pages =
+				z_erofs_vle_work_uncached_mux(work);
+			l->compressed_deficit = clusterpages;
+		} else {
+			work = z_erofs_vle_work_cached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			/* TODO! get cached pages before submitting io */
+			l->compressed_pages = NULL;
+			l->compressed_deficit = 0;
+		}
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		l->owner = false;
+		/* let's claim these following types of work */
+		if (work->next == Z_EROFS_WORK_TAIL) {
+			/* type 2 */
+			work->next = *chained_page;
+			*chained_page = Z_EROFS_WORK_TAIL;
+			l->owner = true;
+		} else if (work->next == (uintptr_t)NULL) {
+			/* type 1 */
+			work->next = *chained_page;
+			*chained_page = (uintptr_t)work | cached;
+			l->owner = true;
+		}
+		goto got_it;
+	}
+
+	/* no available workgroup, let's allocate one */
+retry:
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep,
+		GFP_NOFS | __GFP_NOFAIL);
+
+	/* it is not allowed to fail (-ENOMEM / -EIO, no...) */
+	if (unlikely(grp == NULL))
+		goto retry;
+
+	/* fill general fields */
+	grp->index = index;
+	grp->llen = map->m_llen;
+	if (map->m_flags & EROFS_MAP_ZIPPED)
+		grp->flags |= Z_EROFS_WORK_FORMAT_LZ4;
+
+	/* currently, we implement uncached work at first */
+	cached = false;
+	work = z_erofs_vle_work_uncached(grp, 0);
+	work->pageofs = pageofs;
+	atomic_set(&work->refcount, 1);
+	l->compressed_pages = z_erofs_vle_work_uncached_mux(work);
+	l->compressed_deficit = clusterpages;
+
+	mutex_init(&work->lock);
+	/* type 1 */
+	WRITE_ONCE(work->next, *chained_page);
+
+	err = z_erofs_vle_workgroup_register(sb, grp, cached);
+	if (err) {
+		kmem_cache_free(z_erofs_workgroup_cachep, grp);
+		goto repeat;
+	}
+
+	*chained_page = (uintptr_t)work | cached;
+	l->owner = true;
+	mutex_lock(&work->lock);
+got_it:
+	z_erofs_pagevec_collector_init(&l->vector, work->pagevec, work->vcnt);
+	l->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+static void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *g)
+{
+	struct z_erofs_vle_work *work = &g->u.work;
+
+	if (!atomic_dec_return(&work->refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_pageldr *l)
+{
+	if (l->curr == NULL)
+		return;
+
+	z_erofs_pagevec_collector_exit(&l->vector, false);
+	mutex_unlock(&l->curr->lock);
+	l->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_pageldr *l,
+				struct erofs_map_blocks_iter *m,
+				uintptr_t *chained_page)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = l->curr;
+	enum z_erofs_vle_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(l);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(l, sb, &m->map, chained_page);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= l->owner;
+	work = l->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_VLE_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_VLE_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(l, page, page_type);
+	/* should allocate an additional page */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
+		newpage->mapping = NULL;
+		err = z_erofs_vle_work_add_page(l, newpage, page_type);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *io, int bios)
+{
+	union {
+		struct z_erofs_vle_unzip_io *ptr;
+		uintptr_t v;
+	} u = { .ptr = io };
+
+	bool async = u.v & 1;
+	u.v &= ~1UL;
+
+	if (!atomic_add_return(bios, &u.ptr->pending_bios)) {
+		if (async)
+			queue_work(z_erofs_workqueue, &u.ptr->u.work);
+		else
+			wake_up(&u.ptr->u.wait);
+	}
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_collector ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_vle_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+use_global_pagemap:
+		pages = z_pagemap_global;
+	else {
+		pages = kvmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			mutex_lock(&z_pagemap_global_lock);
+			goto use_global_pagemap;
+		}
+	}
+
+	for(i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_collector_init(&ctor, work->pagevec, 0);
+
+	for(i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_collector_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_collector_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for(i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for(i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for(i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for(i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+	WRITE_ONCE(work->next, NULL);
+
+	mutex_unlock(&work->lock);
+	return err;
+}
+
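+/*
+ * walk a chain of works linked through ->next: bit 0 of each link
+ * records whether the work is cached and Z_EROFS_WORK_TAIL terminates
+ * the chain; the next link is fetched up front so the loop body may
+ * safely reuse or free the current work.
+ */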
+#define for_each_chained_work_safe(chained, n, work, cached) \
+for(; (cached) = (chained) & 1, \
+	(work) = (struct z_erofs_vle_work *)((chained) & ~1UL), \
+	(chained) != Z_EROFS_WORK_TAIL && ((n) = (work)->next, 1); \
+	(chained) = (n))
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	struct z_erofs_vle_work *work;
+	bool cached;
+	uintptr_t chained_page = io->head, tmp;
+
+	for_each_chained_work_safe(chained_page, tmp, work, cached) {
+		struct z_erofs_vle_workgroup *g =
+			z_erofs_vle_work_workgroup(work);
+
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+		z_erofs_vle_workgroup_put(g);
+	}
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static void z_erofs_vle_submit_all(struct super_block *sb,
+				   uintptr_t chained_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	uintptr_t tmp;
+	struct z_erofs_vle_work *work;
+	bool sync, cached;
+	unsigned bios_submitted;
+	union {
+		struct z_erofs_vle_unzip_io *ptr;
+		uintptr_t v;
+	} u;
+
+	if (unlikely(chained_head == Z_EROFS_WORK_TAIL))
+		return;
+
+	sync = true;
+	u.ptr = io;
+
+	/* allocate io descriptor in async mode */
+	if (io != NULL) {
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+	} else {
+		struct z_erofs_vle_unzip_io_sb *iosb;
+
+		sync = false;
+
+		iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+			GFP_KERNEL | __GFP_NOFAIL);
+		BUG_ON(iosb == NULL);
+
+		iosb->sb = sb;
+		io = &iosb->io;
+		INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+		u.v |= 1;
+	}
+	io->head = chained_head;
+
+	bios_submitted = 0;
+	for_each_chained_work_safe(chained_head, tmp, work, cached) {
+		struct z_erofs_vle_workgroup *grp =
+			z_erofs_vle_work_workgroup(work);
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		BUG_ON(cached);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		/* fulfill all compressed pages */
+		for(i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+
+			page->mapping = NULL;
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		current_page = grp->index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = u.ptr;
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+		if (++i < clusterpages)
+			goto repeat;
+	}
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(u.ptr, bios_submitted);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_pageldr l = { .curr = NULL };
+	uintptr_t chained_page = Z_EROFS_WORK_TAIL;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page, &l, &m_iter, &chained_page);
+
+	z_erofs_vle_work_iter_end(&l);
+
+	if (!err) {
+		struct super_block *sb = page->mapping->host->i_sb;
+
+		z_erofs_vle_submit_all(sb, chained_page, &pagepool, &io);
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	} else {
+		errln("%s, failed to read, err [%d]", __func__, err);
+	}
+
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_pageldr l = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	LIST_HEAD(pagepool);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	uintptr_t chained_page = Z_EROFS_WORK_TAIL;
+
+	for(; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while(head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+		err = z_erofs_do_read_page(page, &l, &m_iter, &chained_page);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&l);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, chained_page, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		z_erofs_vle_submit_all(sb, chained_page, &pagepool, &io);
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a number of logical data into
+ * a physical cluster with a fixed size.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index cf7ef9f..ca90fd8 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -10,70 +10,210 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
 
-#ifndef __EROFS_UNZIP_H
-#error "Please don't include unzip_vle.h directly, use unzip.h instead."
-#endif
+#include "internal.h"
+
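+/* ->next value that terminates a chain of works claimed by one read request */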
+#define Z_EROFS_WORK_TAIL	0x5F0ECAFE
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS  3
+
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+	atomic_t refcount;
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	uintptr_t next;
+
+	union {
+		/* L: pagevec */
+		uintptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
 
-#define __vle_cluster_advise(x, bit, bits) \
-	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
 
-#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
-	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
 
-enum {
-	EROFS_VLE_CLUSTER_TYPE_PLAIN,
-	EROFS_VLE_CLUSTER_TYPE_HEAD,
-	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
-	EROFS_VLE_CLUSTER_TYPE_RESERVED,
-	EROFS_VLE_CLUSTER_TYPE_MAX
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
 };
 
-#define vle_cluster_type(di)	\
-	__vle_cluster_type((di)->di_advise)
+struct z_erofs_vle_workgroup {
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+	erofs_blk_t index;
+};
 
-static inline unsigned
-vle_compressed_index_clusterofs(unsigned clustersize,
-	struct erofs_decompressed_index_vle *di)
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+static inline int z_erofs_vle_workgroup_get(struct z_erofs_vle_workgroup *g)
 {
-	debugln("%s, vle=%p, advise=%x (type %u), clusterofs=%x blkaddr=%x",
-		__func__, di, di->di_advise, vle_cluster_type(di),
-		di->di_clusterofs, di->di_u.blkaddr);
-
-	switch(vle_cluster_type(di)) {
-	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
-		break;
-	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
-	case EROFS_VLE_CLUSTER_TYPE_HEAD:
-		return di->di_clusterofs;
-	default:
-		BUG_ON(1);
-	}
-	return clustersize;
+	int o;
+
+repeat:
+	o = atomic_read(&g->u.work.refcount);
+	if (unlikely(o <= 0))
+		return -1;
+	if (unlikely(atomic_cmpxchg(&g->u.work.refcount, o, o + 1) != o))
+		goto repeat;
+	return 0;
+}
+
+#define __z_erofs_vle_workgroup_get(g)  atomic_inc(&(g)->u.work.refcount)
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
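+/*
+ * per-request decompression I/O context: pending_bios counts in-flight
+ * bios and head points to the chain of claimed works, which is unzipped
+ * (synchronously via ->u.wait or in the workqueue via ->u.work) once
+ * all bios have completed.
+ */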
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	uintptr_t head;
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * layout of the online page private value:
+ *   count (low bits)  - waiters (a.k.a. ongoing_packs): how many endio
+ *                       calls are still needed to unlock the page
+ *   sub-index (upper) - 0 for a partial page, >= 1 for a full page
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
 }
 
-static inline erofs_blk_t
-vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
 {
-	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
-	struct erofs_vnode *vi = EROFS_V(inode);
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
 
-	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
-		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
-		index * sizeof(struct erofs_decompressed_index_vle);
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
 
-	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
 }
 
-static inline unsigned int
-vle_extent_blkoff(struct inode *inode, pgoff_t index)
+static inline void z_erofs_onlinepage_endio(struct page *page)
 {
-	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
-	struct erofs_vnode *vi = EROFS_V(inode);
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
 
-	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
-		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
-		index * sizeof(struct erofs_decompressed_index_vle);
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
 
-	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
 }
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	(min(THREAD_SIZE >> 3, 96 * sizeof(struct page *)) / sizeof(struct page *))
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..bb5d830
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
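+/*
+ * per-CPU bounce buffer, used when compressed pages have to be copied
+ * aside (e.g. when they overlap with the destination file pages).
+ */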
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
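+/*
+ * Copy a PLAIN (stored uncompressed) cluster into its file pages: each
+ * destination page is stitched from the tail of the previous compressed
+ * page and the head of the current one (shifted by pageofs); compressed
+ * pages that double as destination pages are backed up into the per-CPU
+ * buffer first so that their data is not overwritten too early.
+ */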
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool backedup[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	for(i = 0; i < nr_pages; ++i) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL && !backedup[i-1])
+				kunmap_atomic(src);
+
+			src = NULL;
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(j = 0; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(backedup[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			backedup[j] = true;
+			break;
+		}
+
+		if (src == NULL && i) {
+			if (backedup[i-1])
+				src = percpu_data + (i - 1) * PAGE_SIZE;
+			else
+				src = kmap_atomic(compressed_pages[i-1]);
+		}
+
+		memcpy(dst, src + righthalf, pageofs);
+
+		if (!backedup[i-1])
+			kunmap_atomic(src);
+
+		if (i >= clusterpages) {
+			kunmap_atomic(dst);
+			break;
+		}
+
+		if (backedup[i])
+			src = percpu_data + i * PAGE_SIZE;
+		else
+			src = kmap_atomic(compressed_pages[i]);
+		memcpy(dst + pageofs, src, righthalf);
+		kunmap_atomic(dst);
+	}
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE * i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-06-29 23:45 ` Chao Yu
  2018-06-30  0:25   ` Gao Xiang
  2018-06-30  9:18 ` [WIP] [NOMERGE] [RFC PATCH v0.2 1/2] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 102+ messages in thread
From: Chao Yu @ 2018-06-29 23:45 UTC (permalink / raw)


On 2018/6/27 22:20, Gao Xiang wrote:
> Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
> ---
> 
> The patch is temporarily based on
> [RFC PATCH RESEND 12/12] erofs: introduce VLE decompression support (experimental)
> 
> STILL BUGGY, NOT FOR DAILY USE!

I've just updated the erofs tree. :)

Thanks,

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem
  2018-06-29 23:45 ` Chao Yu
@ 2018-06-30  0:25   ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30  0:25 UTC (permalink / raw)


Hi Chao,

On 2018/6/30 7:45, Chao Yu wrote:
> On 2018/6/27 22:20, Gao Xiang wrote:
>> Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
>> ---
>>
>> The patch is temporarily based on
>> [RFC PATCH RESEND 12/12] erofs: introduce VLE decompression support (experimental)
>>
>> STILL BUGGY, NOT FOR DAILY USE!
> 
> I've just updated the erofs tree. :)
> 

Thanks, I am currently adjusting and rebasing this patch on the new <linux/tagptr.h> for further cleanup.
It also lacks some parts, which I will fix in the next v0 RFC patches. :)

> Thanks,
> 

Thanks,
Gao Xiang

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.2 1/2] <linux/tagptr.h>: Introduce tagged pointer
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
  2018-06-29 23:45 ` Chao Yu
@ 2018-06-30  9:18 ` Gao Xiang
  2018-06-30  9:18   ` [WIP] [NOMERGE] [RFC PATCH v0.2 2/2] erofs: introduce the new VLE unzip subsystem Gao Xiang
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-06-30  9:18 UTC (permalink / raw)


Currently the kernel has scattered tagged-pointer usages hacked
together by hand in plain code, without a unified and portable
function set to highlight the tagged pointer itself and to wrap
such hand-rolled code, cleaning up the meaningless magic masks
spread all over the tree.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. It currently supports tags in the
last n bits of the pointer, where n can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily uses the tagged-pointer approach for high performance
and to reduce extra memory allocations.

Refer to:
https://en.wikipedia.org/wiki/Tagged_pointer
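
For illustration, a minimal usage sketch based on the fs/file.c
conversion in this patch (the wrapper function below is hypothetical
and only shows how the helpers are meant to be combined):

	#include <linux/file.h>
	#include <linux/tagptr.h>

	/* fold a file pointer and the FDPUT_FPUT tag into a single word */
	static void example_fdput(struct file *file)
	{
		fdtagptr_t v = tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);

		/* unfold pointer and tags again, no open-coded masking */
		if (tagptr_unfold_tags(v) & FDPUT_FPUT)
			fput(tagptr_unfold_ptr(v));
	}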

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the names of the tagged pointer types are tagptr{1, 2, 3...}_t;
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.2 2/2] erofs: introduce the new VLE unzip subsystem
  2018-06-30  9:18 ` [WIP] [NOMERGE] [RFC PATCH v0.2 1/2] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-06-30  9:18   ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30  9:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bug fixes

TODO:
 - split into more understandable patches
 - add the missing functions and more bug fixes

The patchset is temporarily based on
[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression

STILL BUGGY, NOT FOR DAILY USE!

 fs/erofs/Kconfig         |   17 +
 fs/erofs/Makefile        |    6 +-
 fs/erofs/data.c          |   69 +--
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |   81 ++++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   49 +-
 fs/erofs/unzip.c         | 1039 ++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_pagevec.h |  165 +++++++
 fs/erofs/unzip_vle.c     | 1170 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  236 ++++++++++
 fs/erofs/unzip_vle_lz4.c |  145 ++++++
 fs/erofs/utils.c         |   31 ++
 13 files changed, 3015 insertions(+), 41 deletions(-)
 create mode 100644 fs/erofs/unzip.c
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index c244cf3..752f0e0 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -69,3 +69,20 @@ config EROFS_FS_USE_VM_MAP_RAM
 
 	  If you don't know what these are, say N.
 
+config EROFS_FS_PAGE_BUNDLE
+	bool "EROFS Page Bundle Feature"
+	depends on EROFS_FS
+	help
+	  Page Bundles manage several meta pages as a whole.
+
+	  If you don't use compression or don't know what these are, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS_PAGE_BUNDLE
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..6622e68 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,8 +1,8 @@
 EROFS_VERSION = "1.0"
 
-EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\" -DCONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=1
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9b30095..4817e16 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -43,33 +43,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -92,7 +65,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -233,6 +206,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -250,6 +225,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -257,8 +235,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -267,7 +244,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -291,7 +268,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -391,3 +368,31 @@ static int erofs_raw_access_readpages(struct file *filp,
 	.readpages = erofs_raw_access_readpages,
 };
 
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+
+struct inode *erofs_init_page_bundle(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (unlikely(inode == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &erofs_page_bundle_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+	                     GFP_NOFS | __GFP_HIGHMEM |
+	                     __GFP_MOVABLE |  __GFP_NOFAIL
+#if defined(CONFIG_CMA) && defined(___GFP_CMA)
+	                     | ___GFP_CMA
+#endif
+	                    );
+	return inode;
+}
+
+const struct address_space_operations erofs_page_bundle_aops = {
+};
+
+#endif
+
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 7391ef6..12f2e1c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -181,8 +181,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1dd783c..1efaeac 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -20,6 +20,9 @@
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/cleancache.h>
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+#include <linux/swap.h>
+#endif
 #include <linux/vmalloc.h>
 #include "erofs_fs.h"
 
@@ -54,8 +57,28 @@ struct erofs_sb_info {
 	u32 xattr_blkaddr;
 #endif
 
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+	struct inode *ibundle;
+#endif
+
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+
+	/* dedicated workspace for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} zwrksp;
+
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
@@ -83,6 +106,16 @@ struct erofs_sb_info {
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define z_erofs_workspace_lock(sbi) spin_lock(&(sbi)->zwrksp.lock)
+#define z_erofs_workspace_unlock(sbi) spin_unlock(&(sbi)->zwrksp.lock)
+#else
+#define z_erofs_workspace_lock(sbi) xa_lock(&(sbi)->zwrksp.tree)
+#define z_erofs_workspace_unlock(sbi) xa_unlock(&(sbi)->zwrksp.tree)
+#endif
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
@@ -100,6 +133,10 @@ struct erofs_sb_info {
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#define erofs_clusterpages(sbi)	((1 << (sbi)->clusterbits) / PAGE_SIZE)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
@@ -181,6 +218,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
@@ -229,6 +269,35 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+
+static inline struct bio *prepare_bio(struct super_block *sb,
+				      erofs_blk_t blkaddr,
+				      unsigned nr_pages, bio_end_io_t endio)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
+
+	BUG_ON(bio == NULL);
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
@@ -248,6 +317,15 @@ static inline struct page *erofs_get_inline_page(struct inode *inode,
 		blkaddr, S_ISDIR(inode->i_mode));
 }
 
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+
+extern struct inode *erofs_init_page_bundle(struct super_block *);
+
+extern const struct address_space_operations erofs_page_bundle_aops;
+
+#endif
+
+
 /* inode.c */
 extern struct inode *erofs_iget(struct super_block *sb,
 	erofs_nid_t nid, bool dir);
@@ -316,5 +394,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,45 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b41613f..3de0631 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -111,6 +111,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le64_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -185,12 +192,23 @@ static int erofs_read_super(struct super_block *sb,
 
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+	sbi->ibundle = erofs_init_page_bundle(sb);
+	if (sbi->ibundle == NULL) {
+		err = -ENOMEM;
+		goto err_sbi;
+	}
+#endif
+	INIT_RADIX_TREE(&sbi->zwrksp.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->zwrksp.lock);
+#endif
 
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		goto err_sbi;
+		goto err_ibundle;
 	}
 
 	if (!S_ISDIR(inode->i_mode)) {
@@ -231,6 +249,10 @@ static int erofs_read_super(struct super_block *sb,
 err_iput:
 	if (sb->s_root == NULL)
 		iput(inode);
+err_ibundle:
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+	iput(sbi->ibundle);
+#endif
 err_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -252,7 +274,9 @@ static void erofs_put_super(struct super_block *sb)
 
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
-
+#ifdef CONFIG_EROFS_FS_PAGE_BUNDLE
+	iput(sbi->ibundle);
+#endif
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -301,6 +325,11 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -309,11 +338,18 @@ int __init erofs_module_init(void)
 
 	err = erofs_init_inode_cache();
 	if (!err) {
-		err = register_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+		err = z_erofs_init_zip_subsystem();
 		if (!err) {
-			infoln("Successfully to initialize erofs");
-			return 0;
+#endif
+			err = register_filesystem(&erofs_fs_type);
+			if (!err) {
+				infoln("Successfully to initialize erofs");
+				return 0;
+			}
+#ifdef CONFIG_EROFS_FS_ZIP
 		}
+#endif
 	}
 	return err;
 }
@@ -321,6 +357,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	infoln("Successfully finalize erofs");
 }
 
diff --git a/fs/erofs/unzip.c b/fs/erofs/unzip.c
new file mode 100644
index 0000000..171aec1
--- /dev/null
+++ b/fs/erofs/unzip.c
@@ -0,0 +1,1039 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip.c
+ *
+ * Copyright (c) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip.h"
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_pack_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_pack_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_pack_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_pack_cachep =
+		kmem_cache_create("erofs_compressed_pack",
+		Z_EROFS_PACK_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_pack_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_pack_cachep);
+	}
+	return -ENOMEM;
+}
+
+static inline void put_vle_zipped_pack(struct z_erofs_vle_zipped_pack *z,
+                                       bool __maybe_unused allow_free)
+{
+	if (erofs_put_page_bundle(&z->bundle))
+		return;
+
+	DBG_BUGON(mutex_is_locked(&z->lock));
+	DBG_BUGON(!allow_free);
+	kmem_cache_free(z_erofs_pack_cachep, z);
+}
+
+int erofs_try_to_free_vle_zipped_page(struct page *page)
+{
+	struct erofs_page_bundle *b;
+	struct z_erofs_vle_zipped_pack *zip;
+	unsigned i;
+	bool will_free;
+
+	erofs_dbg_might_sleep();
+	b = erofs_lock_page_private(page);
+
+	DBG_BUGON(!has_page_bundle(page));
+	zip = container_of(b, struct z_erofs_vle_zipped_pack, bundle);
+
+	/* I prefer not to sleep in the reclaim path, try_lock instead */
+	if (!mutex_trylock(&zip->lock)) {
+busy_unlock_page_private:
+		erofs_unlock_page_private(page);
+		return 0;
+	}
+
+	/* freeze the whole page bundle */
+	spin_lock(&b->lockref.lock);
+
+	/* the page bundle still has active users */
+	if (b->lockref.count > 1) {
+busy_unlock_bundle:
+		spin_unlock(&b->lockref.lock);
+		mutex_unlock(&zip->lock);
+		goto busy_unlock_page_private;
+	}
+
+	/* try to release the head zipped page */
+	if (page == b->pages[0]) {
+		/* the rest zpages should be released */
+		for(i = 1; i < EROFS_PAGE_BUNDLE_MAX_PAGES; ++i)
+			if (b->pages[i] != NULL)
+				goto busy_unlock_bundle;
+		b->pages[0] = NULL;
+		will_free = true;
+		goto reclaim;
+	}
+
+	for(i = 1; i < EROFS_PAGE_BUNDLE_MAX_PAGES; ++i) {
+		if (b->pages[i] == page) {
+			b->pages[i] = NULL;
+			will_free = false;
+			goto reclaim;
+		}
+	}
+
+	BUG();
+reclaim:
+	ClearPagePrivate(page);
+	erofs_set_page_private(page, NULL);
+	spin_unlock(&b->lockref.lock);
+	mutex_unlock(&zip->lock);
+	erofs_unlock_page_private(page);
+
+	if (will_free)
+		put_vle_zipped_pack(zip, true);
+	put_page(page);
+	return 1;
+}
+
+/* zip should be locked by callers */
+static void z_erofs_vle_unzip(struct z_erofs_vle_zipped_pack *const zip)
+{
+	struct erofs_page_bundle *const b = &zip->bundle;
+	struct z_erofs_pack_info pack;
+	struct inode *inode;
+	struct page *page;
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+	unsigned clusterpages, i;
+#else
+	const unsigned clusterpages = 1;
+#endif
+	void *in;
+
+	/*
+	 * end_io queue work start
+	 * end_io work queue end (queued_pages == 0)
+	 * z_erofs_vle_do_read_page, queue work again
+	 */
+	if (unlikely(!READ_ONCE(zip->queued_pages)))
+		goto out_unlock;
+
+	page = zip->pages[0];
+	DBG_BUGON(page == NULL);
+	inode = page->mapping->host;
+
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+	clusterpages = erofs_clusterpages(EROFS_I_SB(inode));
+
+	for(i = 0; i < clusterpages; ++i) {
+		DBG_BUGON(b->pages[i] == NULL);
+		DBG_BUGON(!PageUptodate(b->pages[i]));
+	}
+#else
+	DBG_BUGON(b->pages[0] == NULL);
+	DBG_BUGON(!PageUptodate(b->pages[0]));
+#endif
+
+	debugln("%s, zip=%p la = %llu, llen = %u", __func__, zip, zip->la, zip->llen);
+
+	pack.pages = zip->pages;
+	pack.nr_pages = zip->nr_pages;
+	pack.queued_pages = zip->queued_pages;
+
+	if (!(zip->flags & Z_EROFS_PACK_ZIPPED))
+		z_erofs_plain_copy(&pack, b->pages, clusterpages, zip->la);
+	else {
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+		in = clusterpages == 1 ? kmap(b->pages[0]):
+			vmap(b->pages, clusterpages, VM_MAP, PAGE_KERNEL);
+#else
+		in = kmap(b->pages[0]);
+#endif
+
+		z_erofs_unzip_generic(&pack, in, clusterpages * PAGE_SIZE,
+		                      zip->la, zip->llen);
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+		if (clusterpages == 1)
+			kunmap(b->pages[0]);
+		else
+			vunmap(in);
+#else
+		kunmap(b->pages[0]);
+#endif
+	}
+
+	/* check decompressor has filled all queued pages */
+	DBG_BUGON(pack.queued_pages);
+	zip->queued_pages = 0;
+	zip->nr_pages = 0;		/* FIXME later */
+out_unlock:
+	mutex_unlock(&zip->lock);
+	put_vle_zipped_pack(zip, false);
+
+}
+
+static void z_erofs_vle_decompress_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_zipped_pack *const zip =
+		container_of(work, struct z_erofs_vle_zipped_pack, work);
+
+	if (!READ_ONCE(zip->queued_pages)) {
+		put_vle_zipped_pack(zip, false);
+		return;
+	}
+	mutex_lock(&zip->lock);
+	z_erofs_vle_unzip(zip);
+}
+
+static void __vle_zipped_bundle_alloc(struct page *page, unsigned nr)
+{
+	struct erofs_page_bundle *b;
+	struct z_erofs_vle_zipped_pack *zip =
+		kmem_cache_zalloc(z_erofs_pack_cachep, GFP_ATOMIC);
+
+	/* here we grab an extra page reference for page private */
+	get_page(page);
+
+	/* if the atomic allocation fails, retry in a sleepable context */
+	if (unlikely(zip == NULL)) {
+		erofs_unlock_page_private(page);
+
+		erofs_dbg_might_sleep();
+		zip = kmem_cache_zalloc(z_erofs_pack_cachep,
+		                        GFP_KERNEL | __GFP_NOFAIL);
+
+		b = erofs_lock_page_private(page);
+		if (test_set_page_bundle(page)) {
+			DBG_BUGON(b == NULL);
+			DBG_BUGON(b->pages[nr] != page);
+
+			lockref_get(&b->lockref);
+			kmem_cache_free(z_erofs_pack_cachep, zip);
+			put_page(page);
+			return;
+		}
+
+		DBG_BUGON(b != NULL);
+	} else if (test_set_page_bundle(page))
+		BUG();
+
+	mutex_init(&zip->lock);
+	INIT_WORK(&zip->work, z_erofs_vle_decompress_wq);
+
+	b = &zip->bundle;
+	/* initialize global page bundle */
+	b->pages[nr] = page;
+	b->lockref.count = 2;
+	spin_lock_init(&b->lockref.lock);
+	erofs_set_page_private(page, b);
+}
+
+static inline struct page *grab_vle_zipped_page(struct super_block *sb,
+                                                pgoff_t index,
+                                                struct erofs_page_bundle **b,
+                                                bool *created,
+                                                struct list_head *page_pool)
+{
+	struct page *page;
+
+	page = erofs_grab_bundle_page(sb, index, created, page_pool);
+	if (!IS_ERR(page)) {
+		/* we only get a new page bundle from the head page */
+		*b = erofs_get_page_bundle(page, 0, __vle_zipped_bundle_alloc);
+	}
+	return page;
+}
+
+/* TODO! FIXME!!! this function is still broken :( */
+static int z_erofs_add_tailpage(struct z_erofs_zipped_pagevec *z_pvec,
+                                struct super_block *sb,
+                                pgoff_t hi, pgoff_t ti,
+                                struct erofs_page_bundle *b,
+                                struct list_head *page_pool)
+{
+	return -ENOTSUPP;
+}
+
+struct z_erofs_zipped_pack_collector {
+	struct list_head list;
+	bool sync;
+};
+
+static inline void vle_zipped_iter_dispatch(struct z_erofs_vle_zipped_iter *z,
+	struct z_erofs_zipped_pack_collector *c)
+{
+	struct z_erofs_vle_zipped_pack *const zip = z->zip;
+	struct list_head *const e = z_erofs_vle_zipped_list_entry(zip);
+
+	/* decompressed pages is already ok? */
+	if (!z->already) {
+		if (c->sync) {
+			if (!z_erofs_vle_zipped_protect_list_entry(zip))
+				return;
+			list_add_tail(e, &c->list);
+		}
+	} else {
+		if (!z_erofs_vle_zipped_protect_list_entry(zip))
+			return;
+		list_add(e, &c->list);
+	}
+	lockref_get(&zip->bundle.lockref);
+}
+
+static inline void vle_zipped_iter_end(struct z_erofs_vle_zipped_iter *z)
+{
+	z_erofs_de_pagevec_end(&z->d_pvec, false);
+	mutex_unlock(&z->zip->lock);
+
+	put_vle_zipped_pack(z->zip, false);
+}
+
+static inline void vle_zipped_collected_enqueue_all(struct list_head *list)
+{
+	struct list_head *e, *tmp;
+
+	list_for_each_safe(e, tmp, list) {
+		struct work_struct *work = container_of(e,
+			struct work_struct, entry);
+		struct z_erofs_vle_zipped_pack *zip;
+
+		list_del(e);
+		INIT_LIST_HEAD(e);
+
+		zip = container_of(work, struct z_erofs_vle_zipped_pack, work);
+		z_erofs_vle_zipped_unprotect_list_entry(zip);
+
+		/* there is no need to lock strictly */
+		if (unlikely(!READ_ONCE(zip->queued_pages))) {
+			put_vle_zipped_pack(zip, false);
+			continue;
+		}
+		debugln("%s, queue work %p", __func__, &zip->work);
+		queue_work(z_erofs_workqueue, work);
+	}
+}
+
+static inline void vle_zipped_collected_unzip_all(struct super_block *sb,
+	struct list_head *list)
+{
+	struct work_struct *work;
+	struct z_erofs_vle_zipped_pack *zip;
+	struct erofs_page_bundle *b;
+	struct page *victim;
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned j;
+#endif
+	struct list_head *e, *tmp;
+	unsigned round = 0;
+
+repeat:
+	/* why isn't blk_flush_plug_list() exported? :-( */
+	if (round == 1 && blk_needs_flush_plug(current))
+		io_schedule();
+
+	/* wait on a single page at each end of a round */
+	victim = NULL;
+
+	list_for_each_safe(e, tmp, list) {
+		work = container_of(e, struct work_struct, entry);
+		zip = container_of(work, struct z_erofs_vle_zipped_pack, work);
+		b = &zip->bundle;
+
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+		for (j = 0; j < clusterpages; ++j) {
+			if (!PageLocked(b->pages[j]))
+				continue;
+			if (round >= 4)
+				if (victim == NULL || !PageLocked(victim))
+					victim = b->pages[j];
+			break;
+		}
+		if (j < clusterpages) {
+#else
+		if (PageLocked(b->pages[0])) {
+			if (victim == NULL || !PageLocked(victim))
+				victim = b->pages[0];
+#endif
+			continue;
+		}
+
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+		for (j = 0; j < clusterpages; ++j)
+			BUG_ON(!PageUptodate(b->pages[j]));
+#else
+		BUG_ON(!PageUptodate(b->pages[0]));
+#endif
+
+		if (round >= 6)
+			mutex_lock(&zip->lock);
+		else if (!mutex_trylock(&zip->lock))
+			continue;
+
+		list_del(e);
+		INIT_LIST_HEAD(e);
+		z_erofs_vle_zipped_unprotect_list_entry(zip);
+		z_erofs_vle_unzip(zip);
+	}
+
+	if (!list_empty(list)) {
+		if (victim != NULL)
+			wait_on_page_locked(victim);
+
+		++round;
+		goto repeat;
+	}
+}
+
+static int z_erofs_vle_do_read_page(
+	struct page *page,
+	struct z_erofs_zipped_pagevec *z_pvec,
+	struct z_erofs_vle_zipped_iter *z,
+	struct erofs_map_blocks_iter *m,
+	struct list_head *page_pool,
+	struct z_erofs_zipped_pack_collector *collector)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const loff_t offset = page_offset(page);
+	struct z_erofs_vle_zipped_pack *zip = z->zip;
+	unsigned cur, end, spiltted;
+	int err;
+	bool creat;
+	struct page *zpage;
+	struct erofs_page_bundle *b;
+	unsigned clusterpages;
+	pgoff_t hi, ti;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (zip != NULL) {
+		vle_zipped_iter_dispatch(z, collector);
+		vle_zipped_iter_end(z);
+	}
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zip = NULL;
+		goto hitted;
+	}
+
+	DBG_BUGON(m->map.m_plen != 1 << sbi->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	/* grab the zipped head page and bundle */
+	hi = m->map.m_pa / PAGE_SIZE;
+	zpage = grab_vle_zipped_page(sb, hi, &b, &creat, page_pool);
+
+	zip = container_of(b, struct z_erofs_vle_zipped_pack, bundle);
+	if (IS_ERR(zpage))
+		goto err_out;
+
+	debugln("%s, (head zipped page %p, index=%lu) page %p "
+		"created=%d", __func__, zpage, hi, page, creat);
+
+	clusterpages = erofs_clusterpages(sbi);
+
+	/* already = true iff no zpage is added to zipped_pagevec */
+	z->already = true;
+
+	/* as above, add tail zpages in the reverse order */
+	ti = DIV_ROUND_UP(m->map.m_pa + m->map.m_plen, PAGE_SIZE);
+	while(ti > hi + 1) {
+		err = z_erofs_add_tailpage(z_pvec, sb, hi, --ti, b, page_pool);
+		z->already &= !err;
+	}
+
+	if (!creat) {
+		/* why do this? -- see comment in "do_read_cache_page" */
+		wait_on_page_locked(zpage);
+
+		if (PageUptodate(zpage))
+			goto has_data;
+
+		lock_page(zpage);
+		if (PageUptodate(zpage)) {
+			unlock_page(zpage);
+			goto has_data;
+		}
+	}
+
+	z_erofs_zipped_pagevec_push(z_pvec, zpage);
+	z->already = false;
+
+has_data:
+	mutex_lock(&zip->lock);
+
+	z->zip = zip;
+
+	if (!(zip->flags & Z_EROFS_PACK_INITIALIZED)) {
+		zip->la = m->map.m_la;
+		if (m->map.m_flags & EROFS_MAP_ZIPPED)
+			zip->flags |= Z_EROFS_PACK_ZIPPED;
+		zip->flags |= Z_EROFS_PACK_INITIALIZED;
+	} else {
+		BUG_ON(zip->la != m->map.m_la);
+		BUG_ON(!(zip->flags & Z_EROFS_PACK_ZIPPED) !=
+			!(m->map.m_flags & EROFS_MAP_ZIPPED));
+	}
+
+	/* physical address should be equal */
+	DBG_BUGON(m->map.m_pa != page_offset(b->pages[0]));
+
+	/* update logical extent length */
+	if (m->map.m_llen > zip->llen)
+		zip->llen = m->map.m_llen;
+
+	put_page(zpage);
+	z_erofs_de_pagevec_init(&z->d_pvec, zip->pages, zip->queued_pages);
+
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	++spiltted;
+	z_erofs_de_pagevec_enqueue(&z->d_pvec, page);
+
+	/* also update nr_pages and increase queued_pages */
+	zip->nr_pages = max_t(pgoff_t, zip->nr_pages,
+	                      page->index - m->map.m_la / PAGE_SIZE + 1);
+	++zip->queued_pages;
+
+next_part:
+	/* used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	debugln("%s, finish page: %p spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+
+	/* the online file page could be unlocked after this line */
+	z_erofs_onlinepage_setup(page, spiltted);
+	return 0;
+
+err_out:
+	/* TODO! the missing error handling cases */
+	return err;
+}
+
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a variable amount of logical
+ * data into a fixed-size physical cluster.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+		break;
+	default:
+		errln("%s, invalid cluster type %u on m_la %llu of nid %llu",
+			__func__, vle_cluster_type(di), ofs,
+			EROFS_V(inode)->nid);
+		BUG();
+		pcn = ~0;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void zipped_async_read_endio(struct bio *bio, int err)
+#else
+static inline void zipped_async_read_endio(struct bio *bio)
+#endif
+{
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+	struct super_block *sb = bio->bi_private;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_vle_zipped_pack *victim = NULL;
+	unsigned j, z_avail = 0; /* avoid the false uninitialized warning */
+#endif
+	unsigned i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+		const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+		const int err = bio->bi_error;
+#endif
+		struct z_erofs_vle_zipped_pack *zip;
+
+		/* page is already locked */
+		DBG_BUGON(PageUptodate(page));
+
+		if (unlikely(err))
+			SetPageError(page);
+		else
+			SetPageUptodate(page);
+
+		debugln("%s: %d zpage %p index: %lu", __func__, __LINE__,
+			page, page->index);
+
+		zip = (void *)erofs_page_private(page);
+		DBG_BUGON(zip == NULL);
+
+		DBG_BUGON(!has_page_bundle(page));
+
+#if EROFS_PAGE_BUNDLE_MAX_PAGES > 1
+		/* for multiple bundle pages */
+		if (zip == victim)
+			++z_avail;
+		else {
+			z_avail = 0;
+			for(j = 0; j < EROFS_PAGE_BUNDLE_MAX_PAGES; ++j)
+				z_avail += PageUptodate(zip->bundle.pages[j]);
+			victim = zip;
+		}
+
+		if (z_avail == clusterpages) {
+#else
+		if (PageUptodate(zip->bundle.pages[0])) {
+#endif
+
+			debugln("queue work %p zpage %p zip %p", &zip->work, page, zip);
+
+			queue_work(z_erofs_workqueue, &zip->work);
+		}
+
+		unlock_page(page);
+		/* page could be reclaimed now */
+	}
+	bio_put(bio);
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void zipped_sync_read_endio(struct bio *bio, int err)
+#else
+static inline void zipped_sync_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+		const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+		const int err = bio->bi_error;
+#endif
+
+		/* page is already locked */
+		DBG_BUGON(PageUptodate(page));
+
+		if (unlikely(err))
+			SetPageError(page);
+		else
+			SetPageUptodate(page);
+
+		unlock_page(page);
+		/* page could be reclaimed now */
+	}
+	bio_put(bio);
+}
+
+static struct bio *zipped_prepare_bio(struct super_block *sb,
+	erofs_blk_t blkaddr, bool sync)
+{
+	/* FIXME, need optimise */
+	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+
+	BUG_ON(bio == NULL);
+	bio->bi_end_io = sync ? zipped_sync_read_endio :
+	                        zipped_async_read_endio;
+	bio_set_dev(bio, sb->s_bdev);
+	bio->bi_private = sb;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
+static void z_erofs_vle_submit_all(struct super_block *sb,
+                                   struct z_erofs_zipped_pagevec *vec,
+                                   bool sync)
+{
+	struct page *page, *tmp;
+	pgoff_t last_page;
+	struct bio *bio = NULL;
+
+	if (z_erofs_zipped_pagevec_empty(vec))
+		return;
+
+	/* should not be NULL */
+	tmp = z_erofs_zipped_pagevec_pop(vec);
+	do {
+		pgoff_t current_page;
+
+		page = tmp;
+		current_page = page->index;
+
+		/* could contain the pagevec itself, pop "tmp" in advance */
+		tmp = z_erofs_zipped_pagevec_pop(vec);
+
+		debugln("%s, found vec=%p page %p, index=%lu",
+			__func__, vec, page, current_page);
+
+		DBG_BUGON(!PageLocked(page));
+
+		if (bio != NULL && last_page + 1 != page->index) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+
+		if (bio == NULL)
+			bio = zipped_prepare_bio(sb, current_page, sync);
+
+		if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+	} while (tmp != NULL);
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = {.m_llen = 0, .m_plen = 0},
+		.mpage = NULL
+	};
+	struct z_erofs_vle_zipped_iter z_iter = { .zip = NULL };
+	struct z_erofs_zipped_pagevec z_pvec = { .page = NULL };
+	struct z_erofs_zipped_pack_collector collector = {
+		.list = LIST_HEAD_INIT(collector.list),
+		.sync = true
+	};
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_vle_do_read_page(page, &z_pvec,
+		&z_iter, &m_iter, &pagepool, &collector);
+
+	if (z_iter.zip != NULL) {
+		vle_zipped_iter_dispatch(&z_iter, &collector);
+		vle_zipped_iter_end(&z_iter);
+	}
+
+	if (!err) {
+		struct super_block *sb = page->mapping->host->i_sb;
+
+		/* submit all compressed pages in the forward order */
+		z_erofs_vle_submit_all(sb, &z_pvec, true);
+		/* unzip all collected compressed pages */
+		vle_zipped_collected_unzip_all(sb, &collector.list);
+	} else {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		z_erofs_zipped_pagevec_end(&z_pvec);
+	}
+
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return err;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = {.m_llen = 0, .m_plen = 0},
+		.mpage = NULL
+	};
+	struct z_erofs_vle_zipped_iter z_iter = { .zip = NULL };
+	struct z_erofs_zipped_pagevec z_pvec = { .page = NULL };
+	struct z_erofs_zipped_pack_collector collector = {
+		.list = LIST_HEAD_INIT(collector.list),
+		.sync = sync
+	};
+	struct super_block *sb = mapping->host->i_sb;
+	LIST_HEAD(pagepool);
+
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+
+	for (; nr_pages; --nr_pages) {
+		/* traversal in reverse order */
+		struct page *page = list_entry(pages->next, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp))
+			list_add(&page->lru, &pagepool);
+		else {
+			int err = z_erofs_vle_do_read_page(page, &z_pvec,
+				&z_iter, &m_iter, &pagepool, &collector);
+
+			if (err) {
+				errln("%s, readahead error at page %lu of nid %llu",
+					__func__, page->index,
+					EROFS_V(mapping->host)->nid);
+			}
+			put_page(page);
+		}
+	}
+
+	if (z_iter.zip != NULL) {
+		vle_zipped_iter_dispatch(&z_iter, &collector);
+		vle_zipped_iter_end(&z_iter);
+	}
+
+	/* submit all compressed pages in the forward order */
+	z_erofs_vle_submit_all(sb, &z_pvec, sync);
+
+	if (!sync)
+		/* queue all collected compressed pages (ready) for workers */
+		vle_zipped_collected_enqueue_all(&collector.list);
+	else
+		/* unzip all collected compressed pages */
+		vle_zipped_collected_unzip_all(sb, &collector.list);
+
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
+
diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..9441750
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (c) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* keep in mind that ctor->next never equals 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that collector->next can never equal 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..aac339a
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,1170 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_pageldr {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_pageldr *l,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (l->compressed_deficit) {
+		--l->compressed_deficit;
+		if (cmpxchg(l->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must hold work->lock */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_pageldr *l,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority to the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(l, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&l->vector,
+		page, type, &occupied);
+	l->curr->vcnt += (unsigned)ret;
+	return ret ? 0 : -EAGAIN;
+}
+
+static struct z_erofs_vle_workgroup *
+z_erofs_vle_workgroup_find(struct super_block *sb,
+			   pgoff_t index,
+			   bool *cached)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+		void *ptr;
+	} u;
+
+repeat:
+	rcu_read_lock();
+	u.ptr = radix_tree_lookup(&sbi->zwrksp.tree, index);
+	if (u.ptr != NULL) {
+		*cached = radix_tree_exceptional_entry(u.ptr);
+		u.v &= ~RADIX_TREE_EXCEPTIONAL_ENTRY;
+
+		if (z_erofs_vle_workgroup_get(u.grp)) {
+			rcu_read_unlock();
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+	return u.grp;
+}
+
+static int z_erofs_vle_workgroup_register(struct super_block *sb,
+					  struct z_erofs_vle_workgroup *grp,
+					  bool cached)
+{
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+	} u;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	z_erofs_workspace_lock(sbi);
+	u.grp = grp;
+	if (cached)
+		u.v |= RADIX_TREE_EXCEPTIONAL_ENTRY;
+
+	err = radix_tree_insert(&sbi->zwrksp.tree, grp->index, u.grp);
+	if (!err)
+		__z_erofs_vle_workgroup_get(grp);
+
+	z_erofs_workspace_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+retry:
+	/* let's claim the following types of work */
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;
+
+	return true;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_pageldr *l,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	struct z_erofs_vle_workgroup *grp;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+	int err;
+
+	BUG_ON(l->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+restart:
+	grp = z_erofs_vle_workgroup_find(sb, index, &cached);
+	if (grp != NULL) {
+		BUG_ON(index != grp->index);
+
+		if (!cached) {
+			work = z_erofs_vle_work_uncached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			l->compressed_pages =
+				z_erofs_vle_work_uncached_mux(work);
+			l->compressed_deficit = clusterpages;
+		} else {
+			work = z_erofs_vle_work_cached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			/* TODO! get cached pages before submitting io */
+			l->compressed_pages = NULL;
+			l->compressed_deficit = 0;
+		}
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		l->owner = false;
+
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			l->owner = true;
+
+		goto got_it;
+	}
+
+	/* no available workgroup, let's allocate one */
+retry:
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep,
+		GFP_NOFS | __GFP_NOFAIL);
+
+	/* the allocation here is not allowed to fail (no -ENOMEM / -EIO) */
+	if (unlikely(grp == NULL))
+		goto retry;
+
+	/* fill general fields */
+	grp->index = index;
+	grp->llen = map->m_llen;
+	if (map->m_flags & EROFS_MAP_ZIPPED)
+		grp->flags |= Z_EROFS_WORK_FORMAT_LZ4;
+
+	/* currently, we implement uncached work at first */
+	cached = false;
+	work = z_erofs_vle_work_uncached(grp, 0);
+	work->pageofs = pageofs;
+	atomic_set(&work->refcount, 1);
+	l->compressed_pages = z_erofs_vle_work_uncached_mux(work);
+	l->compressed_deficit = clusterpages;
+
+	mutex_init(&work->lock);
+	/* type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	err = z_erofs_vle_workgroup_register(sb, grp, cached);
+	if (err) {
+		kmem_cache_free(z_erofs_workgroup_cachep, grp);
+		goto restart;
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	l->owner = true;
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&l->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	l->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+static void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *g)
+{
+	struct z_erofs_vle_work *work = &g->u.work;
+
+	if (!atomic_dec_return(&work->refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_pageldr *l)
+{
+	if (l->curr == NULL)
+		return;
+
+	z_erofs_pagevec_ctor_exit(&l->vector, false);
+	mutex_unlock(&l->curr->lock);
+	l->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_pageldr *l,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = l->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* move on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(l);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(l, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= l->owner;
+	work = l->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(l, page, page_type);
+	/* should allocate an additional page */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
+		newpage->mapping = NULL;
+		err = z_erofs_vle_work_add_page(l, newpage, page_type);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (!atomic_add_return(bios, &io->pending_bios)) {
+		if (async)
+			queue_work(z_erofs_workqueue, &io->u.work);
+		else
+			wake_up(&io->u.wait);
+	}
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* TODO: design for pages for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+use_global_pagemap:
+		pages = z_pagemap_global;
+	else {
+		pages = kvmalloc(nr_pages * sizeof(struct page *),
+			GFP_KERNEL | __GFP_NOFAIL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			mutex_lock(&z_pagemap_global_lock);
+			goto use_global_pagemap;
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for(i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* 'owned' can never equal Z_EROFS_WORK_TPTR_TAIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* 'owned' can never equal NULL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_workgroup_put(z_erofs_vle_work_workgroup(work));
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_descriptor(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_descriptor(sb, io, &sync);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' can never equal either of the following */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain at first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fill in all compressed pages */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+
+			page->mapping = NULL;
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_pageldr l = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page, &l, &m_iter, &owned_head);
+
+	z_erofs_vle_work_iter_end(&l);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_pageldr l = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	LIST_HEAD(pagepool);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+		err = z_erofs_do_read_page(page, &l, &m_iter, &owned_head);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&l);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode compresses a variable amount of logical data into
+ * a physical cluster of fixed size.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
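+/*
+ * As the code below shows, a HEAD (or PLAIN, i.e. stored uncompressed) index
+ * records the start offset of its extent within the logical cluster
+ * (di_clusterofs) together with the physical block address (di_u.blkaddr),
+ * while a NONHEAD index only records how many logical clusters to walk back
+ * (di_u.delta[0]) in order to reach the extent head.
+ */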
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
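+		/* fallthrough */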
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
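+		/* fallthrough */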
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
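+		/* fallthrough */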
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..a74a4fc
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* the magic values below are chosen to avoid valid kernel addresses */
+
+/* the chained works have not had their I/O submitted yet (chain still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works have already had their I/O submitted (chain closed) */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+	atomic_t refcount;
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+	erofs_blk_t index;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+static inline int z_erofs_vle_workgroup_get(struct z_erofs_vle_workgroup *g)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&g->u.work.refcount);
+	if (unlikely(o <= 0))
+		return -1;
+	if (unlikely(atomic_cmpxchg(&g->u.work.refcount, o, o + 1) != o))
+		goto repeat;
+	return 0;
+}
+
+#define __z_erofs_vle_workgroup_get(g)  atomic_inc(&(g)->u.work.refcount)
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): the number of parts that still need to
+ *	complete before the page can be unlocked
+ * sub-index: 0 for a partial page, >= 1 for a full-page sub-index
+ */
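+/*
+ * For instance (an illustrative value only), a page_private value of
+ * ((2 << 2) | 1) means sub-index 2 with one part left to complete.
+ */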
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	(min(THREAD_SIZE >> 3, 96 * sizeof(struct page *)) / sizeof(struct page *))
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..bb5d830
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool backedup[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	for(i = 0; i < nr_pages; ++i) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL && !backedup[i-1])
+				kunmap_atomic(src);
+
+			src = NULL;
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(j = 0; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(backedup[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			backedup[j] = true;
+			break;
+		}
+
+		if (src == NULL && i) {
+			if (backedup[i-1])
+				src = percpu_data + (i-1) * PAGE_SIZE;
+			else
+				src = kmap_atomic(compressed_pages[i-1]);
+		}
+
+		/* the first page has no left half to copy from */
+		if (i) {
+			memcpy(dst, src + righthalf, pageofs);
+
+			if (!backedup[i-1])
+				kunmap_atomic(src);
+		}
+
+		if (i >= clusterpages) {
+			kunmap_atomic(dst);
+			break;
+		}
+
+		if (backedup[i])
+			src = percpu_data + i * PAGE_SIZE;
+		else
+			src = kmap_atomic(compressed_pages[i]);
+		memcpy(dst + pageofs, src, righthalf);
+		kunmap_atomic(dst);
+	}
+	preempt_enable();
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
  2018-06-29 23:45 ` Chao Yu
  2018-06-30  9:18 ` [WIP] [NOMERGE] [RFC PATCH v0.2 1/2] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-06-30 15:17 ` Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 2/6] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (4 more replies)
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (7 subsequent siblings)
  10 siblings, 5 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30 15:17 UTC (permalink / raw)


Currently the kernel has scattered tagged-pointer usages hand-rolled
in plain code, without a common and portable set of helpers that makes
the tagged pointer explicit and wraps such code, so meaningless magic
masks end up spread all over the tree.

Therefore, this patch introduces simple generic methods to fold tags
into a pointer integer. It currently supports storing tags in the last
n bits of the pointer, where n can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which relies heavily on the tagged pointer approach for high
performance and to reduce extra memory allocations.

Refer to:
https://en.wikipedia.org/wiki/Tagged_pointer
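
A rough usage sketch (a hypothetical caller written purely for
illustration; fdtagptr_t, FDPUT_FPUT and the tagptr_*() helpers are the
ones introduced by the diff below):

	#include <linux/file.h>
	#include <linux/tagptr.h>

	static void tagptr_demo(struct file *filp)
	{
		/* fold the FDPUT_FPUT flag (bit 0) into the pointer */
		fdtagptr_t v = tagptr_fold(fdtagptr_t, filp, FDPUT_FPUT);

		/* decode the plain pointer and the folded tags again */
		struct file *f = tagptr_unfold_ptr(v);
		unsigned long tags = tagptr_unfold_tags(v);

		if (tags & FDPUT_FPUT)
			fput(f);
	}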

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
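+/* e.g. a tagptr2_t keeps its tags in the lowest 2 bits, so its mask is 0x3 */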
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagger pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 2/6] erofs: introduce pagevec for unzip subsystem
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-06-30 15:17   ` Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter Gao Xiang
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30 15:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that collector->next can never equal 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that collector->next can never equal 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 2/6] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-06-30 15:17   ` Gao Xiang
  2018-07-01  3:56     ` Chao Yu
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 4/6] erofs: add erofs_allocpage Gao Xiang
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-06-30 15:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |   8 ++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index c244cf3..3b34402 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -69,3 +69,11 @@ config EROFS_FS_USE_VM_MAP_RAM
 
 	  If you don't know what these are, say N.
 
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1dd783c..d327de2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -56,6 +56,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..300f556
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode compresses a variable amount of logical data into
+ * a physical cluster of fixed size.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs have already been initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
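
To make the mapping arithmetic above concrete, a small stand-alone sketch with
made-up numbers (4KiB clusters, a HEAD index whose clusterofs is 512); it only
mirrors the lcn/clusterofs computation, not the on-disk index parsing:

#include <stdio.h>

/*
 * Toy version of the mapping arithmetic: given a file position, find the
 * logical cluster number and, from assumed head-index values, derive the
 * logical extent that erofs_map_blocks_iter() would report.
 */
int main(void)
{
	const unsigned int clustersize = 4096;		/* 1 << clusterbits */
	const unsigned long long pos = 10000;		/* position being read */

	unsigned int lcn = pos / clustersize;		/* logical cluster number */
	unsigned int clusterofs = 512;			/* from the HEAD index (made up) */

	unsigned long long m_la = (unsigned long long)lcn * clustersize + clusterofs;
	unsigned long long end = (unsigned long long)(lcn + 1) * clustersize;

	printf("lcn=%u m_la=%llu m_llen=%llu (m_plen is always one cluster)\n",
	       lcn, m_la, end - m_la);
	return 0;
}

For a NONHEAD cluster the real code instead walks back by di_u.delta[0]
clusters (vle_get_logical_extent_head) until it reaches a HEAD or PLAIN index
and takes the extent start from there.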

* [WIP] [NOMERGE] [RFC PATCH v0.3 4/6] erofs: add erofs_allocpage
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 2/6] erofs: introduce pagevec for unzip subsystem Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter Gao Xiang
@ 2018-06-30 15:17   ` Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 5/6] erofs: globalize prepare_bio and __submit_bio Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression subsystem Gao Xiang
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30 15:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  3 +++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index d327de2..6d9a927 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -320,5 +320,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
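
erofs_allocpage() above simply prefers recycling pages from a caller-provided
list over hitting the page allocator. A user-space analogue of that
free-list-first pattern, purely illustrative and without any struct page
semantics:

#include <stdio.h>
#include <stdlib.h>

/*
 * Reuse a buffer from the caller's local pool if one is available,
 * otherwise fall back to the real allocator; callers push temporary
 * buffers back onto the pool for reuse.
 */
struct buf {
	struct buf *next;
	char data[4096];
};

static struct buf *pool_alloc(struct buf **pool)
{
	struct buf *b = *pool;

	if (b) {
		*pool = b->next;	/* pop from the free list */
		return b;
	}
	return malloc(sizeof(*b));	/* the kernel side uses __GFP_NOFAIL here */
}

static void pool_free(struct buf **pool, struct buf *b)
{
	b->next = *pool;		/* push back for later reuse */
	*pool = b;
}

int main(void)
{
	struct buf *pool = NULL;
	struct buf *a = pool_alloc(&pool);

	if (!a)
		return 1;
	pool_free(&pool, a);		/* recycle instead of free() */
	printf("reused=%d\n", pool_alloc(&pool) == a);
	return 0;
}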

* [WIP] [NOMERGE] [RFC PATCH v0.3 5/6] erofs: globalize prepare_bio and __submit_bio
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (2 preceding siblings ...)
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 4/6] erofs: add erofs_allocpage Gao Xiang
@ 2018-06-30 15:17   ` Gao Xiang
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression subsystem Gao Xiang
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30 15:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9b30095..45ad829 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -43,33 +43,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -92,7 +65,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -233,6 +206,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -250,6 +225,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -257,8 +235,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -267,7 +244,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -291,7 +268,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6d9a927..c9482fe 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -233,6 +233,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
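
The interesting part of the now-global prepare_bio() is its allocation
fallback: try the full nr_pages request and, for reclaim-context callers
(PF_MEMALLOC), halve the request on failure down to a final must-succeed
single-page bio. A stand-alone sketch of that shape, assuming a plain
malloc()-backed allocator (the kernel relies on __GFP_NOFAIL for the last
step, so here the final request can still fail):

#include <stdio.h>
#include <stdlib.h>

/* Ask for the full request first, halve it on failure, and make the
 * final single-unit request the last resort. */
static void *alloc_scaled(size_t unit, unsigned int nr)
{
	void *p = malloc(unit * nr);

	while (!p) {
		nr /= 2;
		if (!nr)
			return malloc(unit);	/* smallest possible request */
		p = malloc(unit * nr);
	}
	return p;
}

int main(void)
{
	void *p = alloc_scaled(4096, 256);	/* roughly a BIO_MAX_PAGES-sized ask */

	printf("allocated=%d\n", p != NULL);
	free(p);
	return 0;
}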

* [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression subsystem
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (3 preceding siblings ...)
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 5/6] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-06-30 15:17   ` Gao Xiang
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-06-30 15:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
change log v0.3:
 - separate into several small patches, maybe more in the future patchset

change log v0.2:
 - use the recent introduced tagptr_t type to manage tagged pointers.
 - bugfix

Todo list:
 - split into more understandable patches
 - add missing functions and bugfix

The patchset is temporarily based on
[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression

The new unzip system is still _buggy_, not for _daily_ use!

 fs/erofs/Kconfig         |  15 +
 fs/erofs/Makefile        |   4 +-
 fs/erofs/inode.c         |   6 +-
 fs/erofs/internal.h      |  29 ++
 fs/erofs/staging.h       |  42 +++
 fs/erofs/super.c         |  36 +-
 fs/erofs/unzip_vle.c     | 943 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h     | 236 ++++++++++++
 fs/erofs/unzip_vle_lz4.c | 145 ++++++++
 9 files changed, 1449 insertions(+), 7 deletions(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 3b34402..c7fea19 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -77,3 +77,18 @@ config EROFS_FS_ZIP
 	  Play at your own risk.
 
 	  If you don't want to use compression feature, say N.
+
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the hard limit of compressed pages
+	  per VLE compressed cluster.
+
+	  For example, if the files of an image are compressed
+	  in 8k units, the hard limit should not be less
+	  than 2. Otherwise, the image cannot be mounted
+	  correctly on this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 7391ef6..12f2e1c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -181,8 +181,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c9482fe..f015e1d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* dedicated workspace for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} zwrksp;
 #endif
 
 	u32 build_time_nsec;
@@ -87,6 +95,16 @@ struct erofs_sb_info {
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define z_erofs_workspace_lock(sbi) spin_lock(&(sbi)->zwrksp.lock)
+#define z_erofs_workspace_unlock(sbi) spin_unlock(&(sbi)->zwrksp.lock)
+#else
+#define z_erofs_workspace_lock(sbi) xa_lock(&(sbi)->zwrksp.tree)
+#define z_erofs_workspace_unlock(sbi) xa_unlock(&(sbi)->zwrksp.tree)
+#endif
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
@@ -104,6 +122,14 @@ struct erofs_sb_info {
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
@@ -185,6 +211,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,45 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b41613f..297dc78 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -111,6 +111,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le64_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -186,6 +193,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->zwrksp.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->zwrksp.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -301,6 +315,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -309,11 +329,18 @@ int __init erofs_module_init(void)
 
 	err = erofs_init_inode_cache();
 	if (!err) {
-		err = register_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+		err = z_erofs_init_zip_subsystem();
 		if (!err) {
-			infoln("Successfully to initialize erofs");
-			return 0;
+#endif
+			err = register_filesystem(&erofs_fs_type);
+			if (!err) {
+				infoln("Successfully to initialize erofs");
+				return 0;
+			}
+#ifdef CONFIG_EROFS_FS_ZIP
 		}
+#endif
 	}
 	return err;
 }
@@ -321,6 +348,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	infoln("Successfully finalize erofs");
 }
 
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 300f556..f553f5e 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,948 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * we don't need too many threads; limiting the number of
+	 * threads can improve scheduling performance.
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_pageldr {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_pageldr *l,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (l->compressed_deficit) {
+		--l->compressed_deficit;
+		if (cmpxchg(l->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_pageldr *l,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(l, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&l->vector,
+		page, type, &occupied);
+	l->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static struct z_erofs_vle_workgroup *
+z_erofs_vle_workgroup_find(struct super_block *sb,
+			   pgoff_t index,
+			   bool *cached)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+		void *ptr;
+	} u;
+
+repeat:
+	rcu_read_lock();
+	u.ptr = radix_tree_lookup(&sbi->zwrksp.tree, index);
+	if (u.ptr != NULL) {
+		*cached = radix_tree_exceptional_entry(u.ptr);
+		u.v &= ~RADIX_TREE_EXCEPTIONAL_ENTRY;
+
+		if (z_erofs_vle_workgroup_get(u.grp)) {
+			rcu_read_unlock();
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+	return u.grp;
+}
+
+static int z_erofs_vle_workgroup_register(struct super_block *sb,
+					  struct z_erofs_vle_workgroup *grp,
+					  bool cached)
+{
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+	} u;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	z_erofs_workspace_lock(sbi);
+	u.grp = grp;
+	u.v |= (unsigned)cached << RADIX_TREE_EXCEPTIONAL_SHIFT;
+
+	err = radix_tree_insert(&sbi->zwrksp.tree, grp->index, u.grp);
+	if (!err)
+		__z_erofs_vle_workgroup_get(grp);
+
+	z_erofs_workspace_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+retry:
+	/* let's claim the following types of work */
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;
+
+	return true;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_pageldr *l,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+	int err;
+
+	BUG_ON(l->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	grp = z_erofs_vle_workgroup_find(sb, index, &cached);
+	if (grp != NULL) {
+		BUG_ON(index != grp->index);
+
+		if (!cached) {
+			work = z_erofs_vle_work_uncached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			l->compressed_pages =
+				z_erofs_vle_work_uncached_mux(work);
+			l->compressed_deficit = clusterpages;
+		} else {
+			work = z_erofs_vle_work_cached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			/* TODO! get cached pages before submitting io */
+			l->compressed_pages = NULL;
+			l->compressed_deficit = 0;
+		}
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		l->owner = false;
+
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			l->owner = true;
+
+		goto got_it;
+	}
+
+	/* no available workgroup, let's allocate one */
+	do {
+		grp = kmem_cache_zalloc(z_erofs_workgroup_cachep,
+			GFP_NOFS | __GFP_NOFAIL);
+
+		/* it is not allowed to fail (-ENOMEM / -EIO, no...) */
+	} while (unlikely(grp == NULL));
+
+	/* fill general fields */
+	grp->index = index;
+	grp->llen = map->m_llen;
+	if (map->m_flags & EROFS_MAP_ZIPPED)
+		grp->flags |= Z_EROFS_WORK_FORMAT_LZ4;
+
+	/* currently, we implement uncached work at first */
+	cached = false;
+	work = z_erofs_vle_work_uncached(grp, 0);
+	work->pageofs = pageofs;
+	atomic_set(&work->refcount, 1);
+	l->compressed_pages = z_erofs_vle_work_uncached_mux(work);
+	l->compressed_deficit = clusterpages;
+
+	mutex_init(&work->lock);
+	/* a new work has type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	err = z_erofs_vle_workgroup_register(sb, grp, cached);
+	if (err) {
+		kmem_cache_free(z_erofs_workgroup_cachep, grp);
+		goto repeat;
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	l->owner = true;
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&l->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	l->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+static void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *g)
+{
+	struct z_erofs_vle_work *work = &g->u.work;
+
+	if (!atomic_dec_return(&work->refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_pageldr *l)
+{
+	if (l->curr == NULL)
+		return;
+
+	z_erofs_pagevec_ctor_exit(&l->vector, false);
+	mutex_unlock(&l->curr->lock);
+	l->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_pageldr *l,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = l->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* move on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(l);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(l, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= l->owner;
+	work = l->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(l, page, page_type);
+	/* should allocate an additional page */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
+		newpage->mapping = NULL;
+		err = z_erofs_vle_work_add_page(l, newpage, page_type);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* TODO: design for pages for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+use_global_pagemap:
+		pages = z_pagemap_global;
+	else {
+		pages = kvmalloc(nr_pages * sizeof(*pages), GFP_KERNEL | __GFP_NOFAIL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			mutex_lock(&z_pagemap_global_lock);
+			goto use_global_pagemap;
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for(i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* 'owned' can never equal Z_EROFS_WORK_TPTR_TAIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* 'owned' can never equal Z_EROFS_WORK_TPTR_NIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_workgroup_put(z_erofs_vle_work_workgroup(work));
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_descriptor(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_descriptor(sb, io, &sync);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' can never equal any of the following */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain at first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fulfill all compressed pages */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+
+			page->mapping = NULL;
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_pageldr l = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page, &l, &m_iter, &owned_head);
+
+	z_erofs_vle_work_iter_end(&l);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_pageldr l = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+		err = z_erofs_do_read_page(page, &l, &m_iter, &owned_head);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&l);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..a74a4fc
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* let's avoid the 32-bit valid kernel address */
+
+/* the chained works haven't io submitted (still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works have already io submitted */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+	atomic_t refcount;
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+	erofs_blk_t index;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+static inline int z_erofs_vle_workgroup_get(struct z_erofs_vle_workgroup *g)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&g->u.work.refcount);
+	if (unlikely(o <= 0))
+		return -1;
+	if (unlikely(atomic_cmpxchg(&g->u.work.refcount, o, o + 1) != o))
+		goto repeat;
+	return 0;
+}
+
+#define __z_erofs_vle_workgroup_get(g)  atomic_inc(&(g)->u.work.refcount)
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): references to drop before unlocking the page
+ * sub-index: 0 for the partial page, >= 1 for full-page sub-indexes
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep the page from being unlocked prematurely */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	(min(THREAD_SIZE >> 3, 96 * sizeof(struct page *)) / sizeof(struct page *))
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..bb5d830
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool backedup[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	for(i = 0; i < nr_pages; ++i) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL && !backedup[i-1])
+				kunmap_atomic(src);
+
+			src = NULL;
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(j = 0; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(backedup[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			backedup[j] = true;
+			break;
+		}
+
+		if (src == NULL && i) {
+			if (backedup[i-1])
+				src = percpu_data + i-1;
+			else
+				src = kmap_atomic(compressed_pages[i-1]);
+		}
+
+		memcpy(dst, src + righthalf, pageofs);
+
+		if (!backedup[i-1])
+			kunmap_atomic(src);
+
+		if (i >= clusterpages) {
+			kunmap_atomic(dst);
+			break;
+		}
+
+		if (backedup[i])
+			src = percpu_data + i;
+		else
+			src = kmap_atomic(compressed_pages[i]);
+		memcpy(dst + pageofs, src, righthalf);
+		kunmap_atomic(dst);
+	}
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
-- 
1.9.1
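
As a hedged usage sketch (not part of the patch above): the vmap-based
decompression path is meant to be driven roughly as below. The demo_*
caller, its arguments and the error handling are invented; only
z_erofs_vle_unzip_vmap(), erofs_vmap() and erofs_vunmap() from this
series are assumed to exist as declared.

/*
 * Hypothetical caller (illustration only): map the destination pages into
 * one contiguous virtual range, run the LZ4 path above, then unmap again.
 */
static int demo_unzip_cluster(struct page **compressed_pages,
			      unsigned clusterpages,
			      struct page **out_pages,
			      unsigned nr_out_pages,
			      unsigned llen,
			      unsigned short pageofs,
			      bool overlapped)
{
	/* erofs_vmap() stitches the output pages into one virtual range */
	void *vout = erofs_vmap(out_pages, nr_out_pages);
	int err;

	if (vout == NULL)
		return -ENOMEM;

	/* decompresses the cluster into vout + pageofs, at most llen bytes */
	err = z_erofs_vle_unzip_vmap(compressed_pages, clusterpages,
				     vout, llen, pageofs, overlapped);

	erofs_vunmap(vout, nr_out_pages);
	return err;
}

Note that nr_out_pages has to cover pageofs + llen bytes, since the
decompressor starts writing at that offset of the mapped range.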

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter Gao Xiang
@ 2018-07-01  3:56     ` Chao Yu
  2018-07-01  4:17       ` Gao Xiang
  0 siblings, 1 reply; 102+ messages in thread
From: Chao Yu @ 2018-07-01  3:56 UTC (permalink / raw)


Hi Xiang,

It fails 'git-am' or 'git apply --reject' when applying patches beginning from
this one in the patch set to erofs branch, could you rebase the code?

Thanks,

On 2018/6/30 23:17, Gao Xiang wrote:
> Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
> ---
>  fs/erofs/Kconfig     |   8 ++
>  fs/erofs/Makefile    |   1 +
>  fs/erofs/internal.h  |   4 +
>  fs/erofs/unzip_vle.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 244 insertions(+)
>  create mode 100644 fs/erofs/unzip_vle.c
> 
> diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
> index c244cf3..3b34402 100644
> --- a/fs/erofs/Kconfig
> +++ b/fs/erofs/Kconfig
> @@ -69,3 +69,11 @@ config EROFS_FS_USE_VM_MAP_RAM
>  
>  	  If you don't know what these are, say N.
>  
> +config EROFS_FS_ZIP
> +	bool "EROFS Data Compression Support"
> +	depends on EROFS_FS
> +	help
> +	  Currently we support VLE Compression only.
> +	  Play at your own risk.
> +
> +	  If you don't want to use compression feature, say N.
> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
> index 9d7f90a..0b3db0a 100644
> --- a/fs/erofs/Makefile
> +++ b/fs/erofs/Makefile
> @@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
>  obj-$(CONFIG_EROFS_FS) += erofs.o
>  erofs-objs := super.o inode.o data.o namei.o dir.o
>  erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
> +erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
>  
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 1dd783c..d327de2 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -56,6 +56,10 @@ struct erofs_sb_info {
>  
>  	/* inode slot unit size in bit shift */
>  	unsigned char islotbits;
> +#ifdef CONFIG_EROFS_FS_ZIP
> +	/* cluster size in bit shift */
> +	unsigned char clusterbits;
> +#endif
>  
>  	u32 build_time_nsec;
>  	u64 build_time;
> diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
> new file mode 100644
> index 0000000..300f556
> --- /dev/null
> +++ b/fs/erofs/unzip_vle.c
> @@ -0,0 +1,231 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * linux/fs/erofs/unzip_vle.c
> + *
> + * Copyright (C) 2018 HUAWEI, Inc.
> + *             http://www.huawei.com/
> + * Created by Gao Xiang <gaoxiang25 at huawei.com>
> + *
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License.  See the file COPYING in the main directory of the Linux
> + * distribution for more details.
> + */
> +#include "internal.h"
> +
> +#define __vle_cluster_advise(x, bit, bits) \
> +	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
> +
> +#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
> +	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
> +
> +enum {
> +	EROFS_VLE_CLUSTER_TYPE_PLAIN,
> +	EROFS_VLE_CLUSTER_TYPE_HEAD,
> +	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
> +	EROFS_VLE_CLUSTER_TYPE_RESERVED,
> +	EROFS_VLE_CLUSTER_TYPE_MAX
> +};
> +
> +#define vle_cluster_type(di)	\
> +	__vle_cluster_type((di)->di_advise)
> +
> +static inline unsigned
> +vle_compressed_index_clusterofs(unsigned clustersize,
> +	struct erofs_decompressed_index_vle *di)
> +{
> +	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
> +		__func__, di, di->di_advise, vle_cluster_type(di),
> +		di->di_clusterofs, di->di_u.blkaddr);
> +
> +	switch(vle_cluster_type(di)) {
> +	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
> +		break;
> +	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
> +	case EROFS_VLE_CLUSTER_TYPE_HEAD:
> +		return di->di_clusterofs;
> +	default:
> +		BUG_ON(1);
> +	}
> +	return clustersize;
> +}
> +
> +static inline erofs_blk_t
> +vle_extent_blkaddr(struct inode *inode, pgoff_t index)
> +{
> +	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
> +	struct erofs_vnode *vi = EROFS_V(inode);
> +
> +	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
> +		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
> +		index * sizeof(struct erofs_decompressed_index_vle);
> +
> +	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
> +}
> +
> +static inline unsigned int
> +vle_extent_blkoff(struct inode *inode, pgoff_t index)
> +{
> +	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
> +	struct erofs_vnode *vi = EROFS_V(inode);
> +
> +	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
> +		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
> +		index * sizeof(struct erofs_decompressed_index_vle);
> +
> +	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
> +}
> +
> +/*
> + * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
> + * ---
> + * VLE compression mode attempts to compress a variable amount of logical
> + * data into a fixed-size physical cluster.
> + * VLE compression mode uses "struct erofs_decompressed_index_vle".
> + */
> +static erofs_off_t vle_get_logical_extent_head(
> +	struct inode *inode,
> +	struct page **page_iter,
> +	void **kaddr_iter,
> +	unsigned lcn,	/* logical cluster number */
> +	erofs_blk_t *pcn,
> +	unsigned *flags)
> +{
> +	/* for extent meta */
> +	struct page *page = *page_iter;
> +	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
> +	struct erofs_decompressed_index_vle *di;
> +	unsigned long long ofs;
> +	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
> +
> +	if (page->index != blkaddr) {
> +		kunmap_atomic(*kaddr_iter);
> +		unlock_page(page);
> +		put_page(page);
> +
> +		*page_iter = page = erofs_get_meta_page(inode->i_sb,
> +			blkaddr, false);
> +		*kaddr_iter = kmap_atomic(page);
> +	}
> +
> +	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
> +	switch(vle_cluster_type(di)) {
> +	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
> +		BUG_ON(!di->di_u.delta[0]);
> +		BUG_ON(lcn < di->di_u.delta[0]);
> +
> +		ofs = vle_get_logical_extent_head(inode,
> +			page_iter, kaddr_iter,
> +			lcn - di->di_u.delta[0], pcn, flags);
> +		break;
> +	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
> +		*flags ^= EROFS_MAP_ZIPPED;
> +	case EROFS_VLE_CLUSTER_TYPE_HEAD:
> +		ofs = lcn * clustersize +
> +			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
> +		*pcn = le32_to_cpu(di->di_u.blkaddr);
> +		break;
> +	default:
> +		BUG_ON(1);
> +	}
> +	return ofs;
> +}
> +
> +int erofs_map_blocks_iter(struct inode *inode,
> +	struct erofs_map_blocks *map,
> +	struct page **mpage_ret, int flags)
> +{
> +	/* logical extent (start, end) offset */
> +	unsigned long long ofs, end;
> +	struct erofs_decompressed_index_vle *di;
> +	erofs_blk_t e_blkaddr, pcn;
> +	unsigned lcn, logical_cluster_ofs;
> +	struct page *mpage = *mpage_ret;
> +	void *kaddr;
> +	bool initial;
> +	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
> +
> +	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
> +	initial = !map->m_llen;
> +
> +	if (unlikely(map->m_la >= inode->i_size)) {
> +		BUG_ON(!initial);
> +		map->m_la = inode->i_size - 1;
> +	}
> +
> +	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
> +		map->m_la, map->m_llen);
> +
> +	ofs = map->m_la + map->m_llen;
> +
> +	lcn = ofs / clustersize;
> +	e_blkaddr = vle_extent_blkaddr(inode, lcn);
> +
> +	if (mpage == NULL || mpage->index != e_blkaddr) {
> +		if (mpage != NULL)
> +			put_page(mpage);
> +
> +		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
> +		*mpage_ret = mpage;
> +	} else {
> +		lock_page(mpage);
> +		DBG_BUGON(!PageUptodate(mpage));
> +	}
> +
> +	kaddr = kmap_atomic(mpage);
> +	di = kaddr + vle_extent_blkoff(inode, lcn);
> +
> +	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
> +		e_blkaddr, vle_extent_blkoff(inode, lcn));
> +
> +	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
> +	if (!initial) {
> +		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
> +		map->m_llen += logical_cluster_ofs;
> +		goto out;
> +	}
> +
> +	/* by default, compressed */
> +	map->m_flags |= EROFS_MAP_ZIPPED;
> +
> +	end = (u64)(lcn + 1) * clustersize;
> +
> +	switch(vle_cluster_type(di)) {
> +	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
> +		if (ofs % clustersize >= logical_cluster_ofs)
> +			map->m_flags ^= EROFS_MAP_ZIPPED;
> +	case EROFS_VLE_CLUSTER_TYPE_HEAD:
> +		if (ofs % clustersize == logical_cluster_ofs) {
> +			pcn = le32_to_cpu(di->di_u.blkaddr);
> +			goto unneed;
> +		}
> +
> +		if (ofs % clustersize > logical_cluster_ofs) {
> +			ofs = lcn * clustersize | logical_cluster_ofs;
> +			pcn = le32_to_cpu(di->di_u.blkaddr);
> +			break;
> +		}
> +
> +		BUG_ON(!lcn);	/* logical cluster number >= 1 */
> +		end = (lcn-- * clustersize) | logical_cluster_ofs;
> +	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
> +		/* get the corresponding first chunk */
> +		ofs = vle_get_logical_extent_head(inode, mpage_ret,
> +			&kaddr, lcn, &pcn, &map->m_flags);
> +		mpage = *mpage_ret;
> +	}
> +
> +	map->m_la = ofs;
> +unneed:
> +	map->m_llen = end - ofs;
> +	map->m_plen = clustersize;
> +	map->m_pa = blknr_to_addr(pcn);
> +	map->m_flags |= EROFS_MAP_MAPPED;
> +	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
> +		__func__, map->m_la, map->m_pa,
> +		map->m_llen, map->m_plen, map->m_flags);
> +out:
> +	kunmap_atomic(kaddr);
> +	unlock_page(mpage);
> +	return 0;
> +}
> +
> 
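
As a side note on the index math in the quoted vle_extent_blkaddr() and
vle_extent_blkoff() above: both derive from one absolute byte position,
which is then split into a block number and an in-block offset. A hedged
sketch follows; the demo helper is not part of the patch, and only the
structures and helpers it names are assumed from this series.

/*
 * Illustration only: absolute byte position of the VLE index for logical
 * cluster 'lcn' -- inode base + aligned (inode + xattr) area + extent
 * header + fixed-size per-cluster index entries.
 */
static inline erofs_off_t demo_vle_index_pos(struct erofs_sb_info *sbi,
					     struct erofs_vnode *vi,
					     unsigned lcn)
{
	const erofs_off_t base = iloc(sbi, vi->nid) +
		EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + vi->xattr_isize) +
		sizeof(struct erofs_extent_header);

	/*
	 * vle_extent_blkaddr()/vle_extent_blkoff() return erofs_blknr()
	 * and erofs_blkoff() of exactly this value.
	 */
	return base + lcn * sizeof(struct erofs_decompressed_index_vle);
}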

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-01  3:56     ` Chao Yu
@ 2018-07-01  4:17       ` Gao Xiang
  2018-07-01  4:26         ` Gao Xiang
  0 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-01  4:17 UTC (permalink / raw)


Hi Chao,

On 2018/7/1 11:56, Chao Yu wrote:
> Hi Xiang,
>
> It fails 'git-am' or 'git apply --reject' when applying patches beginning from
> this one in the patch set to erofs branch, could you rebase the code?
>
> Thanks,

This patchset is now based on

[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression

and I have permanently dropped the last patch of the EROFS v1 series locally:

[RFC PATCH RESEND 12/12] erofs: introduce VLE decompression support
(experimental)

Sorry for the confusion :'(


Thanks,
Gao Xiang

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-01  4:17       ` Gao Xiang
@ 2018-07-01  4:26         ` Gao Xiang
  2018-07-02  1:47           ` Chao Yu
  0 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-01  4:26 UTC (permalink / raw)


Hi Chao,


On 2018/7/1 12:17, Gao Xiang wrote:
>
> , I permanently dropped the last patch of EROFS v1 locally
>
> [RFC PATCH RESEND 12/12] erofs: introduce VLE decompression support 
> (experimental)
>
> Sorry for the confusion :'(

I left some words on the last patch of this patchset,
[WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression 
subsystem:

"The patchset is temporarily based on
[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression
The new unzip system is still _buggy_, not for _daily_ use!"

which seems improper, sorry... I will take care and leave message at the 
beginning from now on.

Thanks,
Gao Xiang

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-01  4:26         ` Gao Xiang
@ 2018-07-02  1:47           ` Chao Yu
  2018-07-02  2:48             ` Gao Xiang
  0 siblings, 1 reply; 102+ messages in thread
From: Chao Yu @ 2018-07-02  1:47 UTC (permalink / raw)


Hi Xiang,

On 2018/7/1 12:26, Gao Xiang via Linux-erofs wrote:
> Hi Chao,
> 
> 
> On 2018/7/1 12:17, Gao Xiang wrote:
>>
>> , I permanently dropped the last patch of EROFS v1 locally
>>
>> [RFC PATCH RESEND 12/12] erofs: introduce VLE decompression support 
>> (experimental)
>>
>> Sorry for the confusion :'(
> 
> I left some words on the last patch of this patchset,
> [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression 
> subsystem:
> 
> "The patchset is temporarily based on
> [RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression
> The new unzip system is still _buggy_, not for _daily_ use!"
> 
> which seems improper, sorry... I will take care and leave message at the 
> beginning from now on.

No problem, I think we can add a revert patch for [PATCH 12/12] erofs: introduce a
customized LZ4 decompression, so that we can track the erofs code change history
more clearly, and once you want to look into it or roll the code back to the
old implementation, we can remove that revert patch later.

The history will only exist in the current erofs dev branch; once the code is
pushed to or merged into Linux or Android OS, it will be invisible.

Or, if you're sure about removing the old implementation, let's do it...

Thanks,

> 
> Thanks,
> Gao Xiang
> 
> 
> 

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-02  1:47           ` Chao Yu
@ 2018-07-02  2:48             ` Gao Xiang
  2018-07-02  3:36               ` Chao Yu
  0 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-02  2:48 UTC (permalink / raw)


Hi Chao,

On 2018/7/2 9:47, Chao Yu wrote:
>> I left some words on the last patch of this patchset,
>> [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression 
>> subsystem:
>>
>> "The patchset is temporarily based on
>> [RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression
>> The new unzip system is still _buggy_, not for _daily_ use!"
>>
>> which seems improper, sorry... I will take care and leave message at the 
>> beginning from now on.
> No problem, I think we can add a revert patch for [PATCH 12/12] erofs: introduce a
> customized LZ4 decompression, so that we can track the erofs code change history
> more clearly, and once you want to look into it or roll the code back to the
> old implementation, we can remove that revert patch later.
> 
> The history will only exist in the current erofs dev branch; once the code is
> pushed to or merged into Linux or Android OS, it will be invisible.
> 
> Or, if you're sure about removing the old implementation, let's do it...
> 
> Thanks> 

I was also thinking about this yesterday; how about rebasing the following patches:

erofs: support tracepoint
erofs: introduce error injection infrastructure
erofs: introduce parse_options()
erofs: support special inode
erofs: remove unused EROFS_XATTR_INDEX_ADVISE
erofs: fix to return correct value of alloc_inode
erofs: fix to do endian conversion correctly
erofs: fix missing endian conversion
erofs: fix to avoid potential overflow
erofs: fix compile error
erofs: add the missing header <linux/prefetch.h>
erofs: update SPDX-License-Identifier   <- Could we drop this patch temporarily?
                                           Some files could be re-licensed in the future for mkfs,
                                           especially the file "erofs_fs.h"
erofs: fix ifnullfree.cocci warnings

on "[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression" temporarily,
then apply [PATCH 12/12] and put the following patch on top of [PATCH 12/12]:
erofs: fix to handle return value of erofs_init_page_bundle() correctly <-

"git am -k -3" could be a powerful way to get this done...

In addition to that, we could have two choices to apply the new unzip subsystem:
1) We could revert [PATCH 12/12] and "erofs: fix to handle return value of erofs_init_page_bundle() correctly"
and then apply the new patchset just as you said,
or
2) Introduce a new 'dev-test' branch as Jaegeuk Kim's branch.

Both are OK with me. :)

Thanks,
Gao Xiang

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-02  2:48             ` Gao Xiang
@ 2018-07-02  3:36               ` Chao Yu
  2018-07-02  3:47                 ` Gao Xiang
  0 siblings, 1 reply; 102+ messages in thread
From: Chao Yu @ 2018-07-02  3:36 UTC (permalink / raw)


Hi Xiang,

On 2018/7/2 10:48, Gao Xiang wrote:
> Hi Chao,
> 
> On 2018/7/2 9:47, Chao Yu wrote:
>>> I left some words on the last patch of this patchset,
>>> [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression 
>>> subsystem:
>>>
>>> "The patchset is temporarily based on
>>> [RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression
>>> The new unzip system is still _buggy_, not for _daily_ use!"
>>>
>>> which seems improper, sorry... I will take care and leave message at the 
>>> beginning from now on.
>> No problem, I think we can add a revert patch for [PATCH 12/12] erofs: introduce a
>> customized LZ4 decompression, so that we can track the erofs code change history
>> more clearly, and once you want to look into it or roll the code back to the
>> old implementation, we can remove that revert patch later.
>>
>> The history will only exist in the current erofs dev branch; once the code is
>> pushed to or merged into Linux or Android OS, it will be invisible.
>>
>> Or, if you're sure about removing the old implementation, let's do it...
>>
>> Thanks> 
> 
> I was also thinking about this yesterday; how about rebasing the following patches:
> 
> erofs: support tracepoint
> erofs: introduce error injection infrastructure
> erofs: introduce parse_options()
> erofs: support special inode
> erofs: remove unused EROFS_XATTR_INDEX_ADVISE
> erofs: fix to return correct value of alloc_inode
> erofs: fix to do endian conversion correctly
> erofs: fix missing endian conversion
> erofs: fix to avoid potential overflow
> erofs: fix compile error
> erofs: add the missing header <linux/prefetch.h>
> erofs: update SPDX-License-Identifier   <- Could we drop this patch temporarily?
>                                            Some files could be re-licensed in the future for mkfs,
>                                            especially the file "erofs_fs.h"

Okay,

> erofs: fix ifnullfree.cocci warnings
> 
> on "[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression" temporarily,
> then apply [PATCH 12/12] and put the following patch on top of [PATCH 12/12]:
> erofs: fix to handle return value of erofs_init_page_bundle() correctly <-
> 
> "git am -k -3" could be a powerful way to get this done...

No problem.

> 
> In addition to that, we could have two choices to apply the new unzip subsystem:
> 1) We could revert [PATCH 12/12] and "erofs: fix to handle return value of erofs_init_page_bundle() correctly"
> and then apply the new patchset just as you said,
> or
> 2) Introduce a new 'dev-test' branch as Jaegeuk Kim's branch.

We can do both of them: let's treat the erofs branch as the master branch, and we can
split an erofs-dev branch ('dev-test' would not be so obvious in indicating that the
branch belongs to erofs, so let's use erofs-dev) from the erofs branch for further
development.

So in erofs-dev, we can make any temporary change and do testing, and once it
becomes stable, we can merge the patches there back to the erofs branch.

And for any old code or implementation being obsoleted in the erofs branch, let's use
git-revert or remove the code in a separate patch instead of dropping a patch
directly. :)

What do you think?

Thanks,

> 
> Both are OK with me. :)
> 
> Thanks,
> Gao Xiang
> 
> .
> 

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-02  3:36               ` Chao Yu
@ 2018-07-02  3:47                 ` Gao Xiang
  2018-07-02 10:48                   ` Chao Yu
  0 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-02  3:47 UTC (permalink / raw)


Hi Chao,

On 2018/7/2 11:36, Chao Yu wrote:
> We can do both of them: let's treat the erofs branch as the master branch, and we can
> split an erofs-dev branch ('dev-test' would not be so obvious in indicating that the
> branch belongs to erofs, so let's use erofs-dev) from the erofs branch for further
> development.
> 
> So in erofs-dev, we can make any temporary change and do testing, and once it
> becomes stable, we can merge the patches there back to the erofs branch.
> 
> And for any old code or implementation being obsoleted in the erofs branch, let's use
> git-revert or remove the code in a separate patch instead of dropping a patch
> directly. :)
> 
> What do you think?

*nod* *nod* *nod*, this policy is fine with me :)

and for these backport branches, I will try to stabilize the new unzip subsystem this week...

and then we could start the backport and the whole fs bugfix work,
e.g. starting by separating out and discarding these confusing LINUX_VERSION macros...

Thanks,

> 
> Thanks,

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-02  3:47                 ` Gao Xiang
@ 2018-07-02 10:48                   ` Chao Yu
  2018-07-02 11:53                     ` Gao Xiang
  0 siblings, 1 reply; 102+ messages in thread
From: Chao Yu @ 2018-07-02 10:48 UTC (permalink / raw)


Hi Xiang,

On 2018/7/2 11:47, Gao Xiang wrote:
> Hi Chao,
> 
> On 2018/7/2 11:36, Chao Yu wrote:
>> We can do both of them: let's treat the erofs branch as the master branch, and we can
>> split an erofs-dev branch ('dev-test' would not be so obvious in indicating that the
>> branch belongs to erofs, so let's use erofs-dev) from the erofs branch for further
>> development.
>>
>> So in erofs-dev, we can make any temporary change and do testing, and once it
>> becomes stable, we can merge the patches there back to the erofs branch.
>>
>> And for any old code or implementation being obsoleted in the erofs branch, let's use
>> git-revert or remove the code in a separate patch instead of dropping a patch
>> directly. :)
>>
>> What do you think?
> 
> *nod* *nod* *nod*, this policy is fine with me :)
> 
> and for these backport branches, I will try to stabilize the new unzip subsystem this week...

Alright,

> 
> and then we could start the backport and the whole fs bugfix work,
> e.g. starting by separating out and discarding these confusing LINUX_VERSION macros...

I forgot to compile old-version kernels; let me start preparing them this week.

Thanks,

> 
> Thanks,
> 
>>
>> Thanks,
> 
> .
> 

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter
  2018-07-02 10:48                   ` Chao Yu
@ 2018-07-02 11:53                     ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 11:53 UTC (permalink / raw)


Hi Chao,

On 2018/7/2 18:48, Chao Yu wrote:
>> and then we could start the backport and the whole fs bugfix work,
>> e.g. starting by separating out and discarding these confusing LINUX_VERSION macros...
> I forgot to compile old-version kernels; let me start preparing them this week.

No rush :) I can also take on part of the backport work.
Besides, some paperwork is in the queue as well...

Let's finish that and then release the v2 patchset to the Linux mailing list.

Thanks,
Gao Xiang

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (2 preceding siblings ...)
  2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-02 14:53 ` Gao Xiang
  2018-07-02 14:53     ` Gao Xiang
                     ` (6 more replies)
  2018-07-03 16:12 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (6 subsequent siblings)
  10 siblings, 7 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


change log v0.4:
 - bugfix (runnable now for small files)
 - split out one more patch

change log v0.3:
 - split into several small patches, maybe more in future patchsets

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bugfix

Todo list:
 - split into more understandable patches
 - add missing functions and more bugfixes


The patchset is temporarily based on
[RFC PATCH RESEND 11/12] erofs: introduce a customized LZ4 decompression

The new unzip system is still _buggy_, not for _daily_ use!


Gao Xiang (7):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce VLE decompression subsystem

 fs/erofs/Kconfig         |   23 +
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |   77 +++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   36 +-
 fs/erofs/unzip_pagevec.h |  172 +++++++
 fs/erofs/unzip_vle.c     | 1176 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  236 ++++++++++
 fs/erofs/unzip_vle_lz4.c |  145 ++++++
 fs/erofs/utils.c         |   31 ++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 +++++
 15 files changed, 2084 insertions(+), 53 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-02 14:53     ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 2/7] erofs: introduce pagevec for unzip subsystem Gao Xiang
                       ` (5 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)
  To: linux-erofs
  Cc: yuchao0, miaoxie, weidu.du, Gao Xiang, Greg Kroah-Hartman,
	Kate Stewart, Matthew Wilcox, Philippe Ombredanne,
	Thomas Gleixner, linux-fsdevel, linux-kernel

The kernel currently has scattered tagged-pointer usages hacked
together by hand in plain code, without a uniform and portable set
of functions to mark the tagged pointer itself and to wrap such
code, which would clean up the meaningless magic masks all over.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. It currently supports using the last
n bits of the pointer for tags, which can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily relies on the tagged pointer approach for high
performance and to reduce extra memory allocation.

Refer to:
https://en.wikipedia.org/wiki/Tagged_pointer
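
To make the intended usage a bit more concrete, a hedged sketch follows
(it is not part of this patch; the demo_* names and the flag value are
invented, and only the tagptr macros introduced below are assumed):

#include <linux/tagptr.h>

/*
 * Illustration only: carry a 1-bit "dirty" flag in the low bits of a
 * naturally-aligned pointer (the low 2 bits are free for tagptr2_t).
 */
#define DEMO_TAG_DIRTY	1

struct demo_item {
	int value;
};

static tagptr2_t demo_pack(struct demo_item *item, bool dirty)
{
	return tagptr_fold(tagptr2_t, item, dirty ? DEMO_TAG_DIRTY : 0);
}

static struct demo_item *demo_unpack(tagptr2_t t, bool *dirty)
{
	*dirty = tagptr_unfold_tags(t) & DEMO_TAG_DIRTY;
	return tagptr_unfold_ptr(t);
}

This mirrors what the fs/file.c hunk below does with FDPUT_FPUT and
FDPUT_POS_UNLOCK, just with made-up names.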

To: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer
@ 2018-07-02 14:53     ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


The kernel currently has scattered tagged-pointer usages hacked
together by hand in plain code, without a uniform and portable set
of functions to mark the tagged pointer itself and to wrap such
code, which would clean up the meaningless magic masks all over.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. It currently supports using the last
n bits of the pointer for tags, which can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily relies on the tagged pointer approach for high
performance and to reduce extra memory allocation.

Refer to:
https://en.wikipedia.org/wiki/Tagged_pointer

To: Alexander Viro <viro at zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh at linuxfoundation.org>
Cc: Kate Stewart <kstewart at linuxfoundation.org>
Cc: Matthew Wilcox <willy at infradead.org>
Cc: Philippe Ombredanne <pombredanne at nexb.com>
Cc: Thomas Gleixner <tglx at linutronix.de>
Cc: Chao Yu <yuchao0 at huawei.com>
Cc: Miao Xie <miaoxie at huawei.com>
Cc: linux-fsdevel at vger.kernel.org
Cc: linux-erofs at lists.ozlabs.org
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 2/7] erofs: introduce pagevec for unzip subsystem
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-02 14:53     ` Gao Xiang
@ 2018-07-02 14:53   ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 3/7] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* keep in mind that ctor->next never equals 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* keep in mind that ctor->next never equals 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1
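
For reference, a hedged sketch of how this collector is meant to be used
(not part of the patch; DEMO_NR_INLINE, the demo function and the page
source are invented, and it assumes the new header above is included):

#define DEMO_NR_INLINE	4	/* invented inline pagevec capacity */

static void demo_pagevec_roundtrip(struct page **src, unsigned count)
{
	erofs_vtptr_t inline_vecs[DEMO_NR_INLINE];
	struct z_erofs_pagevec_ctor ctor;
	enum z_erofs_page_type type;
	bool occupied;
	unsigned i;

	/* keep the demo within the inline slots to avoid page-down storage */
	if (count > DEMO_NR_INLINE)
		return;

	/* collect pages starting at index 0 of the inline vector */
	z_erofs_pagevec_ctor_init(&ctor, DEMO_NR_INLINE, inline_vecs, 0);
	for (i = 0; i < count; ++i)
		z_erofs_pagevec_ctor_enqueue(&ctor, src[i],
					     Z_EROFS_PAGE_TYPE_EXCLUSIVE,
					     &occupied);
	z_erofs_pagevec_ctor_exit(&ctor, false);

	/* walk the same vector again and take the pages back out */
	z_erofs_pagevec_ctor_init(&ctor, DEMO_NR_INLINE, inline_vecs, 0);
	for (i = 0; i < count; ++i)
		src[i] = z_erofs_pagevec_ctor_dequeue(&ctor, &type);
	z_erofs_pagevec_ctor_exit(&ctor, true);
}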

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 3/7] erofs: add erofs_map_blocks_iter
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-02 14:53     ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 2/7] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-02 14:53   ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 4/7] erofs: add erofs_allocpage Gao Xiang
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |   8 ++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index c244cf3..3b34402 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -69,3 +69,11 @@ config EROFS_FS_USE_VM_MAP_RAM
 
 	  If you don't know what these are, say N.
 
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1dd783c..d327de2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -56,6 +56,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..300f556
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a variable amount of logical
+ * data into a fixed-size physical cluster.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
-- 
1.9.1
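
As a hedged caller-side sketch (not part of the patch): an initial query,
i.e. one with m_llen == 0, fills in the extent that covers a given logical
offset, and the metadata page handed back through mpage_ret is cached
across calls and has to be released by the caller. The demo function
below is invented; flags are passed as 0 since this version does not use
them.

static int demo_lookup_extent(struct inode *inode, erofs_off_t la)
{
	struct erofs_map_blocks map = {
		.m_la = la,
		.m_llen = 0,	/* 0 means: start a fresh (initial) query */
	};
	struct page *mpage = NULL;	/* metadata page cached across calls */
	int err;

	err = erofs_map_blocks_iter(inode, &map, &mpage, 0);

	if (mpage != NULL)
		put_page(mpage);	/* drop the cached metadata page */
	if (err)
		return err;

	debugln("extent: m_la %llu m_pa %llu m_llen %llu m_plen %llu flags %u",
		map.m_la, map.m_pa, map.m_llen, map.m_plen, map.m_flags);
	return 0;
}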

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 4/7] erofs: add erofs_allocpage
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 3/7] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-02 14:53   ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 5/7] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index d327de2..6d9a927 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -320,5 +320,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
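
As a usage sketch (not part of the patch), the intended pattern is to keep a
local page pool alive across one read request: erofs_allocpage() recycles pages
from the pool first and only falls back to alloc_pages() when the pool is
empty, and whatever is still sitting in the pool is released in one go at the
end. The function name below is made up for illustration:

#include "internal.h"

static void erofs_pagepool_example(void)
{
	LIST_HEAD(pagepool);
	struct page *page;

	/* take a page: reused from the pool when possible */
	page = erofs_allocpage(&pagepool, GFP_KERNEL);

	/* ...use the page; once it is no longer needed, recycle it... */
	list_add(&page->lru, &pagepool);

	/* finally, free everything left in the pool */
	put_pages_list(&pagepool);
}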

* [WIP] [NOMERGE] [RFC PATCH v0.4 5/7] erofs: globalize prepare_bio and __submit_bio
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 4/7] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-02 14:53   ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 6/7] erofs: add a generic z_erofs VLE decompressor Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 7/7] erofs: introduce VLE decompression subsystem Gao Xiang
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9b30095..45ad829 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -43,33 +43,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -92,7 +65,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -233,6 +206,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -250,6 +225,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -257,8 +235,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -267,7 +244,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -291,7 +268,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6d9a927..c9482fe 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -233,6 +233,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
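
For context, a minimal sketch (not part of the patch) of how the two now-global
helpers fit together, following the erofs_get_meta_page() hunk above; the
wrapper name and the simplified error handling are for illustration only:

/* read one block at `blkaddr' into `page', completing through `endio' */
static int erofs_read_one_block(struct super_block *sb, erofs_blk_t blkaddr,
				struct page *page, bio_end_io_t endio)
{
	struct bio *bio = prepare_bio(sb, blkaddr, 1, endio);

	if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
		bio_put(bio);
		return -EIO;	/* simplified error handling */
	}

	__submit_bio(bio, REQ_OP_READ, 0);
	return 0;
}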

* [WIP] [NOMERGE] [RFC PATCH v0.4 6/7] erofs: add a generic z_erofs VLE decompressor
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (4 preceding siblings ...)
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 5/7] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-02 14:53   ` Gao Xiang
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 7/7] erofs: introduce VLE decompression subsystem Gao Xiang
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  15 +++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  34 +++++++++++
 fs/erofs/unzip_vle_lz4.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 3b34402..c7fea19 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -77,3 +77,18 @@ config EROFS_FS_ZIP
 	  Play at your own risk.
 
 	  If you don't want to use compression feature, say N.
+
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the hard limit of compressed pages per
+	  VLE compressed cluster.
+
+	  For example, if the files of an image are compressed
+	  into 8k units, the hard limit should not be less
+	  than 2. Otherwise, the image cannot be mounted
+	  correctly on this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c9482fe..b9db1c2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -104,6 +104,11 @@ struct erofs_sb_info {
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..143b6c3
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..bb5d830
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool backedup[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	for(i = 0; i < nr_pages; ++i) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL && !backedup[i-1])
+				kunmap_atomic(src);
+
+			src = NULL;
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(j = 0; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(backedup[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			backedup[j] = true;
+			break;
+		}
+
+		if (src == NULL && i) {
+			if (backedup[i-1])
+				src = percpu_data + (i - 1) * PAGE_SIZE;
+			else
+				src = kmap_atomic(compressed_pages[i-1]);
+		}
+
+		memcpy(dst, src + righthalf, pageofs);
+
+		if (!backedup[i-1])
+			kunmap_atomic(src);
+
+		if (i >= clusterpages) {
+			kunmap_atomic(dst);
+			break;
+		}
+
+		if (backedup[i])
+			src = percpu_data + i * PAGE_SIZE;
+		else
+			src = kmap_atomic(compressed_pages[i]);
+		memcpy(dst + pageofs, src, righthalf);
+		kunmap_atomic(dst);
+	}
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE * i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
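
The Kconfig help text above gives the 8k-cluster example; as a rough sketch
(not part of the patch; 4KiB pages assumed and the helper name is made up),
this is the relation that superblock_read() in the following patch checks at
mount time:

/* e.g. 8KiB clusters -> 2 pages per cluster, so the limit must be >= 2 */
static bool erofs_cluster_limit_ok(unsigned int clusterbits)
{
	unsigned int clusterpages = (1U << clusterbits) >> PAGE_SHIFT;

	return clusterpages <= Z_EROFS_CLUSTER_MAX_PAGES;
}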

* [WIP] [NOMERGE] [RFC PATCH v0.4 7/7] erofs: introduce VLE decompression subsystem
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (5 preceding siblings ...)
  2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 6/7] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-02 14:53   ` Gao Xiang
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-02 14:53 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |   6 +-
 fs/erofs/internal.h  |  24 ++
 fs/erofs/staging.h   |  38 +++
 fs/erofs/super.c     |  36 +-
 fs/erofs/unzip_vle.c | 947 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h | 202 +++++++++++
 6 files changed, 1248 insertions(+), 5 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 7391ef6..12f2e1c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -181,8 +181,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b9db1c2..f015e1d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* dedicated workspace for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} zwrksp;
 #endif
 
 	u32 build_time_nsec;
@@ -87,6 +95,16 @@ struct erofs_sb_info {
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define z_erofs_workspace_lock(sbi) spin_lock(&(sbi)->zwrksp.lock)
+#define z_erofs_workspace_unlock(sbi) spin_unlock(&(sbi)->zwrksp.lock)
+#else
+#define z_erofs_workspace_lock(sbi) xa_lock(&(sbi)->zwrksp.tree)
+#define z_erofs_workspace_unlock(sbi) xa_unlock(&(sbi)->zwrksp.tree)
+#endif
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
@@ -107,6 +125,9 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -190,6 +211,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,41 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b41613f..297dc78 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -111,6 +111,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le64_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -186,6 +193,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->zwrksp.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->zwrksp.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -301,6 +315,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -309,11 +329,18 @@ int __init erofs_module_init(void)
 
 	err = erofs_init_inode_cache();
 	if (!err) {
-		err = register_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+		err = z_erofs_init_zip_subsystem();
 		if (!err) {
-			infoln("Successfully to initialize erofs");
-			return 0;
+#endif
+			err = register_filesystem(&erofs_fs_type);
+			if (!err) {
+				infoln("Successfully to initialize erofs");
+				return 0;
+			}
+#ifdef CONFIG_EROFS_FS_ZIP
 		}
+#endif
 	}
 	return err;
 }
@@ -321,6 +348,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	infoln("Successfully finalize erofs");
 }
 
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 300f556..f94bbde 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,952 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads; limiting the number of
+	 * threads could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_handler {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (w->compressed_deficit) {
+		--w->compressed_deficit;
+		if (cmpxchg(w->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(w, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&w->vector,
+		page, type, &occupied);
+	w->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static struct z_erofs_vle_workgroup *
+z_erofs_vle_workgroup_find(struct super_block *sb,
+			   pgoff_t index,
+			   bool *cached)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+		void *ptr;
+	} u;
+
+repeat:
+	rcu_read_lock();
+	u.ptr = radix_tree_lookup(&sbi->zwrksp.tree, index);
+	if (u.ptr != NULL) {
+		*cached = radix_tree_exceptional_entry(u.ptr);
+		u.v &= ~RADIX_TREE_EXCEPTIONAL_ENTRY;
+
+		if (z_erofs_vle_workgroup_get(u.grp)) {
+			rcu_read_unlock();
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+	return u.grp;
+}
+
+static int z_erofs_vle_workgroup_register(struct super_block *sb,
+					  struct z_erofs_vle_workgroup *grp,
+					  bool cached)
+{
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+	} u;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	z_erofs_workspace_lock(sbi);
+	u.grp = grp;
+	u.v |= (unsigned)cached << RADIX_TREE_EXCEPTIONAL_SHIFT;
+
+	err = radix_tree_insert(&sbi->zwrksp.tree, grp->index, u.grp);
+	if (!err)
+		__z_erofs_vle_workgroup_get(grp);
+
+	z_erofs_workspace_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+	/* let's claim the following types of work */
+retry:
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+	int err;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	grp = z_erofs_vle_workgroup_find(sb, index, &cached);
+	if (grp != NULL) {
+		BUG_ON(index != grp->index);
+
+		if (!cached) {
+			work = z_erofs_vle_work_uncached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			w->compressed_pages =
+				z_erofs_vle_work_uncached_mux(work);
+			w->compressed_deficit = clusterpages;
+		} else {
+			work = z_erofs_vle_work_cached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			/* TODO! get cached pages before submitting io */
+			w->compressed_pages = NULL;
+			w->compressed_deficit = 0;
+		}
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		w->owner = false;
+
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			w->owner = true;
+
+		goto got_it;
+	}
+
+	/* no available workgroup, let's allocate one */
+	do {
+		grp = kmem_cache_zalloc(z_erofs_workgroup_cachep,
+			GFP_NOFS | __GFP_NOFAIL);
+
+		/* the allocation is not allowed to fail (no -ENOMEM / -EIO) */
+	} while (unlikely(grp == NULL));
+
+	/* fill general fields */
+	grp->index = index;
+	grp->llen = map->m_llen;
+	if (map->m_flags & EROFS_MAP_ZIPPED)
+		grp->flags |= Z_EROFS_WORK_FORMAT_LZ4;
+
+	/* for now, only uncached work is implemented */
+	cached = false;
+	work = z_erofs_vle_work_uncached(grp, 0);
+	work->pageofs = pageofs;
+	atomic_set(&work->refcount, 1);
+	w->compressed_pages = z_erofs_vle_work_uncached_mux(work);
+	w->compressed_deficit = clusterpages;
+
+	mutex_init(&work->lock);
+	/* new works have type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	err = z_erofs_vle_workgroup_register(sb, grp, cached);
+	if (err) {
+		kmem_cache_free(z_erofs_workgroup_cachep, grp);
+		goto repeat;
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	w->owner = true;
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+static void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *g)
+{
+	struct z_erofs_vle_work *work = &g->u.work;
+
+	if (!atomic_dec_return(&work->refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
+{
+	struct z_erofs_vle_work *zw = w->curr;
+
+	if (zw == NULL)
+		return;
+
+	z_erofs_pagevec_ctor_exit(&w->vector, false);
+	mutex_unlock(&zw->lock);
+	w->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_handler *h,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = h->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register the locked file page as an online page of the pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(h);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(h, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= h->owner;
+	work = h->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(h, page, page_type);
+	/* should allocate an additional page */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
+		newpage->mapping = NULL;
+		err = z_erofs_vle_work_add_page(h, newpage, page_type);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* TODO: design for pages for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+use_global_pagemap:
+		pages = z_pagemap_global;
+	else {
+		pages = kvmalloc(nr_pages * sizeof(struct page *),
+				 GFP_KERNEL | __GFP_NOFAIL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			mutex_lock(&z_pagemap_global_lock);
+			goto use_global_pagemap;
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for(i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* it is impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* it is impossible that 'owned' equals NULL */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_workgroup_put(z_erofs_vle_work_workgroup(work));
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_handler(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_handler(sb, io, &sync);
+
+	io = tagptr_unfold_ptr(tio);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' can never equal either of the following */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fill in all the compressed page slots */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+
+			page->mapping = NULL;
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page, &h, &m_iter, &owned_head);
+
+	z_erofs_vle_work_iter_end(&h);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+		err = z_erofs_do_read_page(page, &h, &m_iter, &owned_head);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&h);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 143b6c3..a74a4fc 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,211 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* avoid values that could be valid 32-bit kernel addresses */
+
+/* the chained works' I/O has not been submitted yet (still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works' I/O has already been submitted */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+	atomic_t refcount;
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+	erofs_blk_t index;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+static inline int z_erofs_vle_workgroup_get(struct z_erofs_vle_workgroup *g)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&g->u.work.refcount);
+	if (unlikely(o <= 0))
+		return -1;
+	if (unlikely(atomic_cmpxchg(&g->u.work.refcount, o, o + 1) != o))
+		goto repeat;
+	return 0;
+}
+
+#define __z_erofs_vle_workgroup_get(g)  atomic_inc(&(g)->u.work.refcount)
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep the page from being unlocked prematurely */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	(min(THREAD_SIZE >> 3, 96 * sizeof(struct page *)) / sizeof(struct page *))
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (3 preceding siblings ...)
  2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-03 16:12 ` Gao Xiang
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:12 UTC (permalink / raw)


change log v0.4:
 - bugfix (runnable now for small files)
 - split out one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compiling warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - split into several small patches; maybe more in a future patchset

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bugfix

Todo list:
 - split into more understandable patches
 - add missing functions and bug fixes


The patchset is temporarily based on
Revert "erofs: introduce VLE decompression support (experimental)"

The new unzip system is still _buggy_, not for _daily_ use!


Gao Xiang (7):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce VLE decompression subsystem

 fs/erofs/Kconfig         |   23 +
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |   77 +++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   36 +-
 fs/erofs/unzip_pagevec.h |  172 +++++++
 fs/erofs/unzip_vle.c     | 1176 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  236 ++++++++++
 fs/erofs/unzip_vle_lz4.c |  145 ++++++
 fs/erofs/utils.c         |   31 ++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 +++++
 15 files changed, 2084 insertions(+), 53 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND 0/7] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (4 preceding siblings ...)
  2018-07-03 16:12 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-03 16:13 ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (6 more replies)
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (4 subsequent siblings)
  10 siblings, 7 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


change log v0.4:
 - bugfix (runnable now for small files)
 - split out one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compiling warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - split into several small patches; maybe more in a future patchset

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bugfix

Todo list:
 - split into more understandable patches
 - add missing functions and bug fixes


The patchset is temporarily based on
Revert "erofs: introduce VLE decompression support (experimental)"

The new unzip system is still _buggy_, not for _daily_ use!


Gao Xiang (7):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce VLE decompression subsystem

 fs/erofs/Kconfig         |   23 +
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |   77 +++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   36 +-
 fs/erofs/unzip_pagevec.h |  172 +++++++
 fs/erofs/unzip_vle.c     | 1176 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  236 ++++++++++
 fs/erofs/unzip_vle_lz4.c |  145 ++++++
 fs/erofs/utils.c         |   31 ++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 +++++
 15 files changed, 2084 insertions(+), 53 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 2/7] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Currently, the kernel has scattered tagged pointer usages that are
hand-coded in place, without a common and portable set of helpers
to make the tagged pointer explicit and to wrap such code, leaving
meaningless magic masks spread all over the tree.

Therefore, this patch introduces simple generic helpers to fold
tags into a pointer integer. Tags currently occupy the last n bits
of the pointer, and n can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily uses the tagged pointer approach for high performance
and for reducing extra memory allocation.

Link: https://en.wikipedia.org/wiki/Tagged_pointer
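
For illustration only, here is a rough sketch of how these helpers are
meant to be used; struct foo, footagptr_t and the FOO_TAG_* values are
made-up examples rather than real kernel code:

/* illustration only: a made-up structure, at least 4-byte aligned */
struct foo { unsigned long bar; };

#define FOO_TAG_DIRTY	1	/* hypothetical tag values */
#define FOO_TAG_LOCKED	2

/* the last 2 bits of a struct foo pointer hold the tags */
typedef tagptr2_t footagptr_t;

static void tagptr_example(struct foo *f)
{
	/* fold the pointer and a tag into a single word */
	footagptr_t t = tagptr_fold(footagptr_t, f, FOO_TAG_DIRTY);

	/* unfold them again */
	struct foo *p = tagptr_unfold_ptr(t);
	unsigned long tags = tagptr_unfold_tags(t);

	/* here p == f and tags == FOO_TAG_DIRTY */
	(void)p; (void)tags;
}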

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the names of the tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 2/7] erofs: introduce pagevec for unzip subsystem
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 3/7] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h
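
A rough usage sketch of the pagevec collector below (illustration only;
it mirrors the way the VLE unzip code later in this series drives it,
and the inline array size of 3 is just an example):

static void pagevec_example(struct page *page)
{
	erofs_vtptr_t vecs[3];	/* small inline storage */
	struct z_erofs_pagevec_ctor ctor;
	enum z_erofs_page_type type;
	bool occupied;

	/* producer: record pages together with their page type */
	z_erofs_pagevec_ctor_init(&ctor, 3, vecs, 0);
	if (!z_erofs_pagevec_ctor_enqueue(&ctor, page,
					  Z_EROFS_PAGE_TYPE_EXCLUSIVE,
					  &occupied))
		return;	/* full: a spare page must be supplied first */
	z_erofs_pagevec_ctor_exit(&ctor, false);

	/* consumer: replay the recorded pages in order */
	z_erofs_pagevec_ctor_init(&ctor, 3, vecs, 0);
	page = z_erofs_pagevec_ctor_dequeue(&ctor, &type);
	z_erofs_pagevec_ctor_exit(&ctor, true);
}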

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that collector->next is never equal to 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that collector->next is never equal to 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 3/7] erofs: add erofs_map_blocks_iter
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 2/7] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 4/7] erofs: add erofs_allocpage Gao Xiang
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  10 +++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 246 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c
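
A rough sketch of the intended calling convention (illustration only,
mirroring how the later VLE read path in this series drives it): the
caller keeps mpage across calls so that adjacent lookups can reuse the
same metadata page, and drops it when done:

static int walk_extents_sketch(struct inode *inode, erofs_off_t pos,
			       erofs_off_t size)
{
	struct erofs_map_blocks map = { .m_la = pos };
	struct page *mpage = NULL;	/* cached metadata page, caller-owned */
	int err = 0;

	while (map.m_la < pos + size) {
		map.m_llen = 0;		/* ask for a fresh extent at m_la */
		err = erofs_map_blocks_iter(inode, &map, &mpage, 0);
		if (err)
			break;

		/* m_la/m_llen now describe one logical extent and
		 * m_pa/m_plen the physical cluster that backs it */
		map.m_la += map.m_llen;
	}

	if (mpage != NULL)
		put_page(mpage);
	return err;
}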

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9c8696e..ffbd5eb 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,3 +75,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e52252f..2377cf4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -73,6 +73,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..300f556
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a variable amount of logical
+ * data into a fixed-size physical cluster.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 4/7] erofs: add erofs_allocpage
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 3/7] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 5/7] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2377cf4..07bab28 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -381,5 +381,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 5/7] erofs: globalize prepare_bio and __submit_bio
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 4/7] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 6/7] erofs: add a generic z_erofs VLE decompressor Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 7/7] erofs: introduce VLE decompression subsystem Gao Xiang
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)
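
A minimal caller sketch once the helpers become global (illustration
only; it is essentially what erofs_get_meta_page does below; read_endio
stands for whatever completion handler the caller already has, and the
request op/flags here are only examples):

static int read_one_block_sketch(struct super_block *sb,
				 erofs_blk_t blkaddr, struct page *page)
{
	/* a single-page read bio targeting block `blkaddr` */
	struct bio *bio = prepare_bio(sb, blkaddr, 1, read_endio);

	if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
		bio_put(bio);
		return -EIO;
	}

	__submit_bio(bio, REQ_OP_READ, 0);
	return 0;
}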

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index b2a4d37..cebc097 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -47,33 +47,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -96,7 +69,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -237,6 +210,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -254,6 +229,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -261,8 +239,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -271,7 +248,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -295,7 +272,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 07bab28..e60f535 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -291,6 +291,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 6/7] erofs: add a generic z_erofs VLE decompressor
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
                     ` (4 preceding siblings ...)
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 5/7] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 7/7] erofs: introduce VLE decompression subsystem Gao Xiang
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  14 +++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  34 +++++++++++
 fs/erofs/unzip_vle_lz4.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
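
A rough sketch of how the vmap fallback below is expected to be called
(illustration only; it mirrors the call site added in the last patch of
this series, with erofs_vmap/erofs_vunmap coming from internal.h):

/* sketch: decompress one cluster into nr_pages destination pages */
static int unzip_cluster_sketch(struct page **compressed_pages,
				unsigned clusterpages,
				struct page **pages, unsigned nr_pages,
				unsigned llen, unsigned short pageofs,
				bool overlapped)
{
	void *vout = erofs_vmap(pages, nr_pages);
	int err = z_erofs_vle_unzip_vmap(compressed_pages, clusterpages,
					 vout, llen, pageofs, overlapped);

	erofs_vunmap(vout, nr_pages);
	return err;
}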

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index ffbd5eb..00e811c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -85,3 +85,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the hard limit of compressed pages per
+	  VLE compressed cluster.
+
+	  For example, if the files of an image are compressed
+	  in 8k units, the hard limit should not be less
+	  than 2. Otherwise, the image cannot be mounted
+	  correctly on this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e60f535..038d77b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -162,6 +162,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..143b6c3
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..bb5d830
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool backedup[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	for(i = 0; i < nr_pages; ++i) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL && !backedup[i-1])
+				kunmap_atomic(src);
+
+			src = NULL;
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(j = 0; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(backedup[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			backedup[j] = true;
+			break;
+		}
+
+		if (src == NULL && i) {
+			if (backedup[i-1])
+				src = percpu_data + (i - 1) * PAGE_SIZE;
+			else
+				src = kmap_atomic(compressed_pages[i-1]);
+		}
+
+		memcpy(dst, src + righthalf, pageofs);
+
+		if (!backedup[i-1])
+			kunmap_atomic(src);
+
+		if (i >= clusterpages) {
+			kunmap_atomic(dst);
+			break;
+		}
+
+		if (backedup[i])
+			src = percpu_data + i * PAGE_SIZE;
+		else
+			src = kmap_atomic(compressed_pages[i]);
+		memcpy(dst + pageofs, src, righthalf);
+		kunmap_atomic(dst);
+	}
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 7/7] erofs: introduce VLE decompression subsystem
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
                     ` (5 preceding siblings ...)
  2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 6/7] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-03 16:13   ` Gao Xiang
  6 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-03 16:13 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |   6 +-
 fs/erofs/internal.h  |  24 ++
 fs/erofs/staging.h   |  38 ++
 fs/erofs/super.c     |  36 +-
 fs/erofs/unzip_vle.c | 958 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h | 202 +++++++++++
 6 files changed, 1259 insertions(+), 5 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d3c1b29..dbe6a02 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -203,8 +203,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 038d77b..30c2b06 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -76,6 +76,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* dedicated workspace for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} zwrksp;
 #endif
 
 	u32 build_time_nsec;
@@ -145,6 +153,16 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define z_erofs_workspace_lock(sbi) spin_lock(&(sbi)->zwrksp.lock)
+#define z_erofs_workspace_unlock(sbi) spin_unlock(&(sbi)->zwrksp.lock)
+#else
+#define z_erofs_workspace_lock(sbi) xa_lock(&(sbi)->zwrksp.tree)
+#define z_erofs_workspace_unlock(sbi) xa_unlock(&(sbi)->zwrksp.tree)
+#endif
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
@@ -165,6 +183,9 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -248,6 +269,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,41 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a1826b9..232028b 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -112,6 +112,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -279,6 +286,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->zwrksp.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->zwrksp.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -394,6 +408,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -402,11 +422,18 @@ int __init erofs_module_init(void)
 
 	err = erofs_init_inode_cache();
 	if (!err) {
-		err = register_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+		err = z_erofs_init_zip_subsystem();
 		if (!err) {
-			infoln("Successfully to initialize erofs");
-			return 0;
+#endif
+			err = register_filesystem(&erofs_fs_type);
+			if (!err) {
+				infoln("Successfully to initialize erofs");
+				return 0;
+			}
+#ifdef CONFIG_EROFS_FS_ZIP
 		}
+#endif
 	}
 	return err;
 }
@@ -414,6 +441,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	infoln("Successfully finalize erofs");
 }
 
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 300f556..34371fa 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,963 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads; limiting the thread
+	 * count could even improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_handler {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (w->compressed_deficit) {
+		--w->compressed_deficit;
+		if (cmpxchg(w->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(w, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&w->vector,
+		page, type, &occupied);
+	w->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static struct z_erofs_vle_workgroup *
+z_erofs_vle_workgroup_find(struct super_block *sb,
+			   pgoff_t index,
+			   bool *cached)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+		void *ptr;
+	} u;
+
+repeat:
+	rcu_read_lock();
+	u.ptr = radix_tree_lookup(&sbi->zwrksp.tree, index);
+	if (u.ptr != NULL) {
+		*cached = radix_tree_exceptional_entry(u.ptr);
+		u.v &= ~RADIX_TREE_EXCEPTIONAL_ENTRY;
+
+		if (z_erofs_vle_workgroup_get(u.grp)) {
+			rcu_read_unlock();
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+	return u.grp;
+}
+
+static int z_erofs_vle_workgroup_register(struct super_block *sb,
+					  struct z_erofs_vle_workgroup *grp,
+					  bool cached)
+{
+	union {
+		struct z_erofs_vle_workgroup *grp;
+		uintptr_t v;
+	} u;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	z_erofs_workspace_lock(sbi);
+	u.grp = grp;
+	u.v |= (unsigned)cached << RADIX_TREE_EXCEPTIONAL_SHIFT;
+
+	err = radix_tree_insert(&sbi->zwrksp.tree, grp->index, u.grp);
+	if (!err)
+		__z_erofs_vle_workgroup_get(grp);
+
+	z_erofs_workspace_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+	/* let's try to claim the following types of work */
+retry:
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+	int err;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	grp = z_erofs_vle_workgroup_find(sb, index, &cached);
+	if (grp != NULL) {
+		BUG_ON(index != grp->index);
+
+		if (!cached) {
+			work = z_erofs_vle_work_uncached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			w->compressed_pages =
+				z_erofs_vle_work_uncached_mux(work);
+			w->compressed_deficit = clusterpages;
+		} else {
+			work = z_erofs_vle_work_cached(grp, pageofs);
+			/* currently, work will not be NULL */
+
+			/* TODO! get cached pages before submitting io */
+			w->compressed_pages = NULL;
+			w->compressed_deficit = 0;
+		}
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		w->owner = false;
+
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			w->owner = true;
+
+		goto got_it;
+	}
+
+	/* no available workgroup, let's allocate one */
+	do {
+		grp = kmem_cache_zalloc(z_erofs_workgroup_cachep,
+			GFP_NOFS | __GFP_NOFAIL);
+
+		/* it is not allowed to fail (-ENOMEM / -EIO, no...) */
+	} while (unlikely(grp == NULL));
+
+	/* fill general fields */
+	grp->index = index;
+	grp->llen = map->m_llen;
+	if (map->m_flags & EROFS_MAP_ZIPPED)
+		grp->flags |= Z_EROFS_WORK_FORMAT_LZ4;
+
+	/* currently, we implement uncached work at first */
+	cached = false;
+	work = z_erofs_vle_work_uncached(grp, 0);
+	work->pageofs = pageofs;
+	atomic_set(&work->refcount, 1);
+	w->compressed_pages = z_erofs_vle_work_uncached_mux(work);
+	w->compressed_deficit = clusterpages;
+
+	mutex_init(&work->lock);
+	/* newly created works have type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	err = z_erofs_vle_workgroup_register(sb, grp, cached);
+	if (err) {
+		kmem_cache_free(z_erofs_workgroup_cachep, grp);
+		goto repeat;
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	w->owner = true;
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+static void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *g)
+{
+	struct z_erofs_vle_work *work = &g->u.work;
+
+	if (!atomic_dec_return(&work->refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
+{
+	struct z_erofs_vle_work *zw = w->curr;
+
+	if (zw == NULL)
+		return;
+
+	z_erofs_pagevec_ctor_exit(&w->vector, false);
+	mutex_unlock(&zw->lock);
+	w->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_handler *h,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head,
+				struct list_head *page_pool)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = h->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* move on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(h);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(h, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= h->owner;
+	work = h->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(h, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(h, newpage,
+			Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* TODO: design for pages for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for (i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* 'owned' should never equal Z_EROFS_WORK_TPTR_TAIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* 'owned' should never be NIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_workgroup_put(z_erofs_vle_work_workgroup(work));
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_handler(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_handler(sb, io, &sync);
+
+	io = tagptr_unfold_ptr(tio);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' should never equal any of the following */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain at first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fulfill all compressed pages */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+			page->mapping = NULL;
+
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->index;
+		i = 0;
+
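+		/*
+		 * submit the in-flight bio if the next cluster is not
+		 * physically contiguous; submit_bio_retry is also taken
+		 * when the bio has no room left for another page
+		 */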
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page,
+		&h, &m_iter, &owned_head, &pagepool);
+
+	z_erofs_vle_work_iter_end(&h);
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(page,
+			&h, &m_iter, &owned_head, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&h);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* let's synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 143b6c3..a4f6910 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,211 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* the magic values below are chosen to avoid valid 32-bit kernel addresses */
+
+/* the chained works have not had their I/O submitted yet (still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works have already had their I/O submitted */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+	atomic_t refcount;
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+	erofs_blk_t index;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+static inline int z_erofs_vle_workgroup_get(struct z_erofs_vle_workgroup *g)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&g->u.work.refcount);
+	if (unlikely(o <= 0))
+		return -1;
+	if (unlikely(atomic_cmpxchg(&g->u.work.refcount, o, o + 1) != o))
+		goto repeat;
+	return 0;
+}
+
+#define __z_erofs_vle_workgroup_get(g)  atomic_inc(&(g)->u.work.refcount)
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (a.k.a. ongoing_packs): the number of completions still needed
+ *                                 before the page can be unlocked
+ * sub-index: 0 for the partial page, >= 1 for each full page
+ */
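+/*
+ * e.g. with Z_EROFS_ONLINEPAGE_COUNT_BITS == 2, the (illustrative) value
+ * ((3 << 2) | 1) encodes sub-index 3 with one completion still pending,
+ * so the next z_erofs_onlinepage_endio() call unlocks the page.
+ */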
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (5 preceding siblings ...)
  2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
@ 2018-07-05  8:41 ` Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (4 more replies)
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (3 subsequent siblings)
  10 siblings, 5 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:41 UTC (permalink / raw)


change log v0.5:
 - add reclaim path
 - almost working, still debugging

change log v0.4:
 - bugfix (runnable now for small files)
 - separate into one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compiling warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - separate to several small patches, maybe more in the future patchset

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bugfix

Todo list:
 - split into more understandable patches
 - add missing functions and bugfixes


The patchset is temporarily based on
"erofs: fix erofs_module_init & exit"

The new unzip system is still _buggy_, not for _daily_ use!

Gao Xiang (10):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce superblock registration
  erofs: introduce erofs shrinker
  erofs: introduce workstation for decompression
  erofs: introduce VLE decompression support

 fs/erofs/Kconfig         |   24 +
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |  133 +++++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   57 +++
 fs/erofs/unzip_pagevec.h |  172 +++++++
 fs/erofs/unzip_vle.c     | 1200 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  228 +++++++++
 fs/erofs/unzip_vle_lz4.c |  146 ++++++
 fs/erofs/utils.c         |  233 +++++++++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 +++++
 15 files changed, 2384 insertions(+), 50 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 01/10] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-05  8:41   ` Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:41 UTC (permalink / raw)


Currently the kernel has scattered, hand-rolled tagged pointer
usages in plain code, without a unified and portable function set
that makes the tagged pointer itself explicit and wraps the
open-coded logic, cleaning up the meaningless magic masks spread
all over the tree.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. It currently supports tags stored in
the last n bits of the pointer, where n can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily relies on the tagged pointer approach for high
performance and to reduce extra memory allocations.

Link: https://en.wikipedia.org/wiki/Tagged_pointer
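
A minimal usage sketch follows (illustrative only: "struct foo" and
FOO_TAG_DIRTY are invented for this example and are not part of the
patch itself):

	#include <linux/tagptr.h>

	struct foo { int x; };

	/* the tag must fit into the 2 tag bits reserved by tagptr2_t */
	#define FOO_TAG_DIRTY	1

	static bool foo_roundtrip(struct foo *p)
	{
		/* assumes p is at least 4-byte aligned so the tag bits are free */
		tagptr2_t t = tagptr_fold(tagptr2_t, p, FOO_TAG_DIRTY);

		/* unfold again without open-coded "& ~3" magic masks */
		return tagptr_unfold_ptr(t) == (void *)p &&
		       (tagptr_unfold_tags(t) & FOO_TAG_DIRTY);
	}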

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the names of the tagged pointer types are tagptr{1, 2, 3...}_t;
+ * avoid using the internal structs __tagptr{1, 2, 3...} directly
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 02/10] erofs: introduce pagevec for unzip subsystem
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-05  8:41   ` Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:41 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for (index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* ctor->next is never 1 or 2, so comparing with the type is safe */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* ctor->next is never 1 or 2, so comparing with the type is safe */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 03/10] erofs: add erofs_map_blocks_iter
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-05  8:41   ` Gao Xiang
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 04/10] erofs: add erofs_allocpage Gao Xiang
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:41 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  10 +++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 246 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9c8696e..ffbd5eb 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,3 +75,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e52252f..2377cf4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -73,6 +73,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..300f556
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch (vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a variable amount of logical
+ * data into a fixed-size physical cluster.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
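+/*
+ * In this mode each index describes one logical cluster: HEAD/PLAIN
+ * indexes record the physical block address and the in-cluster offset,
+ * while NONHEAD indexes only store delta[0], the distance back to their
+ * HEAD index (see vle_get_logical_extent_head() below).
+ */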
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch (vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
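+		/* fallthrough */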
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_la = inode->i_size - 1;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* m_(l,p)blk, m_(l,p)ofs has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch (vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
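+		/* fallthrough */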
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto unneed;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
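+		/* fallthrough */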
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+unneed:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags %u",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 04/10] erofs: add erofs_allocpage
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-05  8:41   ` Gao Xiang
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:41 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2377cf4..07bab28 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -381,5 +381,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 04/10] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-05  8:44   ` Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
                       ` (4 more replies)
  4 siblings, 5 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:44 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index de99217..2efe69f 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -47,33 +47,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -96,7 +69,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -237,6 +210,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -254,6 +229,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -261,8 +239,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -271,7 +248,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -295,7 +272,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 07bab28..e60f535 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -291,6 +291,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 06/10] erofs: add a generic z_erofs VLE decompressor
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-05  8:44     ` Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 07/10] erofs: introduce superblock registration Gao Xiang
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:44 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  14 +++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  34 +++++++++++
 fs/erofs/unzip_vle_lz4.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index ffbd5eb..00e811c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -85,3 +85,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the hard limit on the number of compressed pages
+	  per compressed cluster for VLE compression.
+
+	  For example, if the files of an image are compressed in
+	  8k units, the hard limit should not be less than 2.
+	  Otherwise, the image cannot be mounted correctly on this
+	  kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e60f535..038d77b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -162,6 +162,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..143b6c3
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..c8b22a2
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool backedup[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	for (i = 0; i < nr_pages; ++i) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL && !backedup[i-1])
+				kunmap_atomic(src);
+
+			src = NULL;
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for (j = 0; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(backedup[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			backedup[j] = true;
+			break;
+		}
+
+		if (src == NULL && i) {
+			if (backedup[i-1])
+				src = percpu_data + (i - 1) * PAGE_SIZE;
+			else
+				src = kmap_atomic(compressed_pages[i-1]);
+		}
+
+		memcpy(dst, src + righthalf, pageofs);
+
+		if (!backedup[i-1])
+			kunmap_atomic(src);
+
+		if (i >= clusterpages) {
+			kunmap_atomic(dst);
+			break;
+		}
+
+		if (backedup[i])
+			src = percpu_data + i * PAGE_SIZE;
+		else
+			src = kmap_atomic(compressed_pages[i]);
+		memcpy(dst + pageofs, src, righthalf);
+		kunmap_atomic(dst);
+	}
+	preempt_enable();
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for (i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE * i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1) {
+		vin = kmap_atomic(compressed_pages[0]);
+	} else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else
+			erofs_vunmap(vin, clusterpages);
+	} else {
+		preempt_enable();
+	}
+
+	return ret;
+}
+
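
A userspace model may make the pageofs handling above easier to follow. The snippet below is illustrative only (PSZ, plain_copy() and the sample data are stand-ins, not kernel code); it performs the same byte stitching as z_erofs_vle_plain_copy() for an uncompressed cluster: the data is stored page-aligned but has to show up at byte offset 'pageofs' of the first decompressed page, so each output page takes the tail of one source page and the head of the next.

#include <stdio.h>
#include <string.h>

#define PSZ 8	/* tiny stand-in for PAGE_SIZE */

/* stitch each output "page" from the tail of src[i - 1] and the head of src[i] */
static void plain_copy(char dst[][PSZ], unsigned nr_dst,
		       char src[][PSZ], unsigned nr_src,
		       unsigned short pageofs)
{
	const unsigned righthalf = PSZ - pageofs;
	unsigned i;

	for (i = 0; i < nr_dst; ++i) {
		if (i)			/* head comes from the previous source page */
			memcpy(dst[i], src[i - 1] + righthalf, pageofs);
		if (i < nr_src)		/* the rest comes from the current source page */
			memcpy(dst[i] + pageofs, src[i], righthalf);
	}
}

int main(void)
{
	char src[2][PSZ], dst[3][PSZ];
	unsigned i, j;

	for (i = 0; i < 2; ++i)
		for (j = 0; j < PSZ; ++j)
			src[i][j] = 'A' + i * PSZ + j;	/* bytes 'A'..'P' */

	memset(dst, '.', sizeof(dst));
	plain_copy(dst, 3, src, 2, 3);			/* pageofs = 3 */

	for (i = 0; i < 3; ++i)
		printf("dst[%u] = %.*s\n", i, PSZ, dst[i]);
	return 0;
}

With pageofs = 3 it prints "...ABCDE", "FGHIJKLM" and "NOP....." -- the stored bytes A..P end up contiguous starting at offset 3 of the first output page.
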
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 07/10] erofs: introduce superblock registration
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-05  8:44     ` Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 08/10] erofs: introduce erofs shrinker Gao Xiang
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:44 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  6 ++++++
 fs/erofs/super.c    |  4 ++++
 fs/erofs/utils.c    | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 038d77b..cc898b4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -65,6 +65,9 @@ struct erofs_fault_info {
 typedef u64 erofs_nid_t;
 
 struct erofs_sb_info {
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -430,5 +433,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 /* utils.c */
 extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
+extern void erofs_register_super(struct super_block *sb);
+extern void erofs_unregister_super(struct super_block *sb);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index d104d88..a91399f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -315,6 +315,8 @@ static int erofs_read_super(struct super_block *sb,
 	snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name);
 	sbi->dev_name[PATH_MAX - 1] = '\0';
 
+	erofs_register_super(sb);
+
 	/*
 	 * We already have a positive dentry, which was instantiated
 	 * by d_make_root. Just need to d_rehash it.
@@ -352,6 +354,8 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	erofs_unregister_super(sb);
+
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dce5177..78731c5 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,3 +29,20 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+static DEFINE_MUTEX(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_register_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_unregister_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_del(&EROFS_SB(sb)->list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 08/10] erofs: introduce erofs shrinker
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 07/10] erofs: introduce superblock registration Gao Xiang
@ 2018-07-05  8:44     ` Gao Xiang
  2018-07-05  9:09       ` [WIP] [NOMERGE] [RFC PATCH v0.5 RESEND " Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 09/10] erofs: introduce workstation for decompression Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 10/10] erofs: introduce VLE decompression support Gao Xiang
  4 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:44 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  7 +++++
 fs/erofs/super.c    | 15 ++++++++++
 fs/erofs/utils.c    | 79 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index cc898b4..e202ef3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -67,6 +67,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -94,6 +95,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -436,5 +438,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a91399f..9452a89 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -354,7 +354,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -396,6 +398,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -415,6 +423,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -423,6 +435,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -431,6 +445,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 78731c5..6b8c60d 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,20 +29,89 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 09/10] erofs: introduce workstation for decompression
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                       ` (2 preceding siblings ...)
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 08/10] erofs: introduce erofs shrinker Gao Xiang
@ 2018-07-05  8:44     ` Gao Xiang
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 10/10] erofs: introduce VLE decompression support Gao Xiang
  4 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:44 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h | 60 ++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/super.c    | 12 +++++++++
 fs/erofs/utils.c    | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e202ef3..489cd4e 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -80,6 +80,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* the dedicated workstation for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} workstn;
 #endif
 
 	u32 build_time_nsec;
@@ -150,6 +158,58 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define erofs_workstn_lock(sbi)         spin_lock(&(sbi)->workstn.lock)
+#define erofs_workstn_unlock(sbi)       spin_unlock(&(sbi)->workstn.lock)
+#else
+#define erofs_workstn_lock(sbi)         xa_lock(&(sbi)->workstn.tree)
+#define erofs_workstn_unlock(sbi)       xa_unlock(&(sbi)->workstn.tree)
+#endif
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+static inline bool erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&grp->refcount);
+
+	if (unlikely(o <= 0))
+		return -1;
+
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+
+	return 0;
+}
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+
+extern struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag);
+
+extern int erofs_register_workgroup(struct super_block *sb,
+	struct erofs_workgroup *grp, bool tag);
+
+extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+	unsigned long nr_shrink, bool cleanup);
+
+static inline void erofs_workstation_cleanup_all(struct super_block *sb)
+{
+	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
+}
+
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
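
erofs_workgroup_get() above is the usual lockless "take a reference unless the count already dropped to zero" idiom built on cmpxchg. The same loop, as a standalone C11 sketch using userspace <stdatomic.h> rather than the kernel atomic API (the function name and return convention are illustrative; the kernel helper reports success as 0 and failure as -1):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* bump 'refcount' only if it is still positive, i.e. the object is live */
static bool get_unless_zero(atomic_int *refcount)
{
	int o = atomic_load(refcount);

	while (o > 0) {
		/* on failure, 'o' is refreshed with the current value */
		if (atomic_compare_exchange_weak(refcount, &o, o + 1))
			return true;
	}
	return false;	/* already dying; the caller must retry the lookup */
}

int main(void)
{
	atomic_int ref = 1;

	printf("%d\n", get_unless_zero(&ref));	/* 1 -- ref is now 2 */
	atomic_store(&ref, 0);
	printf("%d\n", get_unless_zero(&ref));	/* 0 -- no new refs once it hit 0 */
	return 0;
}
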
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9452a89..d26eeed 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -285,6 +285,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->workstn.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -355,6 +362,11 @@ static void erofs_put_super(struct super_block *sb)
 	__putname(sbi->dev_name);
 
 	mutex_lock(&sbi->umount_mutex);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	erofs_workstation_cleanup_all(sb);
+#endif
+
 	erofs_unregister_super(sb);
 	mutex_unlock(&sbi->umount_mutex);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 6b8c60d..912be2a 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,6 +29,76 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+
+/* radix_tree and the future XArray both don't use tagptr_t yet */
+struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn.tree, index);
+	if (grp != NULL) {
+		*tag = radix_tree_exceptional_entry(grp);
+		grp = (void *)((unsigned long)grp &
+			~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		if (erofs_workgroup_get(grp)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		BUG_ON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	erofs_workstn_lock(sbi);
+
+	if (tag)
+		grp = (void *)((unsigned long)grp |
+			1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+	err = radix_tree_insert(&sbi->workstn.tree,
+		grp->index, grp);
+
+	if (!err) {
+		__erofs_workgroup_get(grp);
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	}
+
+	erofs_workstn_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+				       unsigned long nr_shrink,
+				       bool cleanup)
+{
+	return 0;
+}
+
+#endif
 
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -108,6 +178,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		list_move_tail(&sbi->list, &erofs_sb_list);
 		mutex_unlock(&sbi->umount_mutex);
 
+		freed += erofs_shrink_workstation(sbi, nr, false);
 		if (freed >= nr)
 			break;
 	}
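
Both the 'tag' handled by erofs_register_workgroup()/erofs_find_workgroup() above and the tagptr_t type used by the unzip code encode a boolean in the low bit of an aligned pointer, here by reusing the radix-tree exceptional-entry bit. A minimal userspace illustration of the trick, with made-up helper names:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TAG_BIT	1UL	/* usable because allocations are at least 2-byte aligned */

static void *tag_fold(void *ptr, bool tag)
{
	assert(((uintptr_t)ptr & TAG_BIT) == 0);
	return (void *)((uintptr_t)ptr | (tag ? TAG_BIT : 0));
}

static void *tag_unfold_ptr(void *entry)
{
	return (void *)((uintptr_t)entry & ~TAG_BIT);
}

static bool tag_unfold_tag(void *entry)
{
	return (uintptr_t)entry & TAG_BIT;
}

int main(void)
{
	int *obj = malloc(sizeof(*obj));
	void *entry;

	if (obj == NULL)
		return 1;

	*obj = 42;
	entry = tag_fold(obj, true);	/* store the pointer plus a "cached" flag */

	printf("value %d, tag %d\n",
	       *(int *)tag_unfold_ptr(entry), tag_unfold_tag(entry));
	free(tag_unfold_ptr(entry));
	return 0;
}
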
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 10/10] erofs: introduce VLE decompression support
  2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                       ` (3 preceding siblings ...)
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 09/10] erofs: introduce workstation for decompression Gao Xiang
@ 2018-07-05  8:44     ` Gao Xiang
  2018-07-05  9:37       ` [WIP] [NOMERGE] [RFC PATCH v0.5 RESEND " Gao Xiang
  4 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  8:44 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |   6 +-
 fs/erofs/internal.h  |   7 +
 fs/erofs/staging.h   |  38 ++
 fs/erofs/super.c     |  26 ++
 fs/erofs/unzip_vle.c | 971 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h | 194 ++++++++++
 fs/erofs/utils.c     |  47 ++-
 7 files changed, 1286 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 489cd4e..4c5b615 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -193,6 +193,7 @@ static inline bool erofs_workgroup_get(struct erofs_workgroup *grp)
 }
 
 #define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+extern void erofs_workgroup_put(struct erofs_workgroup *grp);
 
 extern struct erofs_workgroup *erofs_find_workgroup(
 	struct super_block *sb, pgoff_t index, bool *tag);
@@ -230,6 +231,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -313,6 +317,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,41 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index d26eeed..09614a7 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -424,6 +431,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -439,6 +452,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -447,6 +466,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -457,6 +480,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 300f556..cc2d892 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,976 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads; limiting the number of
+	 * threads could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_handler {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (w->compressed_deficit) {
+		--w->compressed_deficit;
+		if (cmpxchg(w->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must hold work->lock */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority to the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(w, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&w->vector,
+		page, type, &occupied);
+	w->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline
+struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
+	pgoff_t idx, unsigned pageofs,
+	bool *cached_ret,
+	struct z_erofs_vle_workgroup **grp_ret)
+{
+	bool cached;
+	struct erofs_workgroup *egrp = erofs_find_workgroup(sb, idx, &cached);
+	struct z_erofs_vle_workgroup *grp;
+
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+	*cached_ret = cached;
+
+	return cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+}
+
+static inline struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup *grp,
+			  bool cached,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  erofs_wtptr_t *owned_head)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_work_format(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_WORK_FORMAT_LZ4 :
+			Z_EROFS_WORK_FORMAT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently not implemented */
+	BUG();
+#else
+	work = cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+	/* new works have been claimed as type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, cached);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	return work;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+	/* let's claim the following types of work */
+retry:
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static inline void __reset_compressed_pages(
+	struct z_erofs_vle_work_handler *w,
+	struct z_erofs_vle_work *work, bool cached,
+	unsigned clusterpages)
+{
+	if (!cached) {
+		w->compressed_pages =
+			z_erofs_vle_work_uncached_mux(work);
+		w->compressed_deficit = clusterpages;
+		return;
+	}
+
+	/* TODO! get cached pages before submitting io */
+	w->compressed_pages = NULL;
+	w->compressed_deficit = 0;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	work = z_erofs_vle_work_find(sb, index,
+		pageofs, &cached, &grp);
+	if (work != NULL) {
+		BUG_ON(index != grp->obj.index);
+
+		__reset_compressed_pages(w, work, cached, clusterpages);
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		w->owner = false;
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			w->owner = true;
+
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, grp,
+		false, map, index, pageofs, owned_head);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+
+	__reset_compressed_pages(w, work, cached, clusterpages);
+	w->owner = true;
+
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *grp)
+{
+	struct z_erofs_vle_work *work = &grp->u.work;
+
+	if (!atomic_dec_return(&grp->obj.refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	z_erofs_vle_workgroup_put(container_of(grp,
+		struct z_erofs_vle_workgroup, obj));
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
+{
+	struct z_erofs_vle_work *zw = w->curr;
+
+	if (zw == NULL)
+		return;
+
+	z_erofs_pagevec_ctor_exit(&w->vector, false);
+	mutex_unlock(&zw->lock);
+	w->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_handler *h,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head,
+				struct list_head *page_pool)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = h->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(h);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(h, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= h->owner;
+	work = h->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(h, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(h, newpage,
+			Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handing cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* FIXME: the following snippets are for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for (i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* 'owned' can never be Z_EROFS_WORK_TPTR_TAIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* 'owned' can never be Z_EROFS_WORK_TPTR_NIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_workgroup_put(z_erofs_vle_work_workgroup(work));
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_handler(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate an extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_handler(sb, io, &sync);
+
+	io = tagptr_unfold_ptr(tio);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' can never equal the following sentinels */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fill in all the compressed page slots */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+			page->mapping = NULL;
+
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->obj.index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page,
+		&h, &m_iter, &owned_head, &pagepool);
+
+	z_erofs_vle_work_iter_end(&h);
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(page,
+			&h, &m_iter, &owned_head, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&h);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* let's do synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
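
The chaining in try_to_claim_work() and z_erofs_vle_submit_all() above leans on two sentinel values for work->next: NIL for a work that is in no chain and TAIL for the open end of a chain whose I/O has not been submitted. The single-threaded sketch below models only the claiming rules (userspace stdatomic instead of the kernel tagged-pointer cmpxchg, no 'cached' tag bit, made-up names):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct work {
	struct work * _Atomic next;
	int id;
};

#define WORK_NIL	((struct work *)0)
#define WORK_TAIL	((struct work *)(uintptr_t)0x5F0ECAFE)

/* returns true if 'w' is now part of the chain headed by *owned_head */
static bool try_to_claim(struct work *w, struct work **owned_head)
{
	struct work *expected;

	/* type 2: 'w' is the open end of another chain -- hook ours behind it */
	expected = WORK_TAIL;
	if (atomic_compare_exchange_strong(&w->next, &expected, *owned_head)) {
		*owned_head = WORK_TAIL;
		return true;
	}

	/* type 1: 'w' is in no chain yet -- it becomes our new chain head */
	expected = WORK_NIL;
	if (atomic_compare_exchange_strong(&w->next, &expected, *owned_head)) {
		*owned_head = w;
		return true;
	}

	return false;	/* someone else already owns it */
}

int main(void)
{
	struct work a = { WORK_NIL, 1 }, b = { WORK_TAIL, 2 };
	struct work *owned_head = WORK_TAIL;	/* start with an empty, open chain */

	printf("claim a: %d\n", try_to_claim(&a, &owned_head));	/* 1 */
	printf("claim b: %d\n", try_to_claim(&b, &owned_head));	/* 1 */
	return 0;
}

After the two claims the chain is b -> a -> TAIL; submitting the I/O later corresponds to swapping that trailing TAIL for TAIL_CLOSED.
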
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 143b6c3..6d0595d 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,203 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* let's avoid valid 32-bit kernel addresses */
+
+/* the chained works' I/O has not been submitted yet (chain still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works' I/O has already been submitted (chain closed) */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	atomic_t refcount;
+#endif
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_set_work_format(grp, fmt) \
+	((grp)->flags = ((grp)->flags & ~Z_EROFS_WORK_FORMAT_MASK) | (fmt))
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (a.k.a. ongoing_packs): number of completions needed to unlock the page
+ * sub-index: 0 for a partial page, >= 1 for the sub-index of a full page
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
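
The z_erofs_onlinepage_* helpers above pack two values into one atomic word stored in page->private: the page's sub-index within the decompressed extent in the high bits and the number of pending parts that still keep it locked in the low two bits. A simplified, runnable userspace model (a plain struct stands in for struct page, and the 'down'/repeated-fixup subtleties are left out):

#include <stdatomic.h>
#include <stdio.h>

#define COUNT_BITS	2
#define COUNT_MASK	((1u << COUNT_BITS) - 1)

struct fake_page {
	atomic_uint online;	/* sub-index << COUNT_BITS | pending parts */
};

static void onlinepage_init(struct fake_page *p)
{
	/* start with one pending part so the page cannot complete too early */
	atomic_store(&p->online, 1);
}

static void onlinepage_fixup(struct fake_page *p, unsigned int index)
{
	unsigned int o, v;

	do {
		o = atomic_load(&p->online);
		v = (index << COUNT_BITS) | ((o & COUNT_MASK) + 1);
	} while (!atomic_compare_exchange_weak(&p->online, &o, v));
}

static void onlinepage_endio(struct fake_page *p)
{
	unsigned int v = atomic_fetch_sub(&p->online, 1) - 1;

	if (!(v & COUNT_MASK))	/* last pending part: "unlock" the page */
		printf("page done, sub-index %u\n", v >> COUNT_BITS);
}

int main(void)
{
	struct fake_page p;

	onlinepage_init(&p);
	onlinepage_fixup(&p, 5);	/* one part of this page, sub-index 5 */
	onlinepage_endio(&p);		/* that part's I/O completed */
	onlinepage_endio(&p);		/* initial reference dropped: page done */
	return 0;
}
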
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 912be2a..eb87927 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -95,7 +96,51 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	if (found > nr_shrink)
+		found = nr_shrink;
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		erofs_workgroup_put(grp);
+
+		++freed;
+		--nr_shrink;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (found && nr_shrink) {
+		first_index += found;
+		goto repeat;
+	}
+
+	return freed;
 }
 
 #endif
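
erofs_shrink_workstation() above walks the radix tree in gang-lookup batches, but the reclaim rule itself is simply "drop the reference the index holds for every workgroup nobody else is using". An illustrative userspace model of that rule alone (an array stands in for the radix tree, a plain int for the kernel refcount, free() for erofs_workgroup_put()):

#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS	8

struct wg {
	int refcount;	/* 1 means only the index itself holds it */
	int id;
};

static struct wg *station[NR_SLOTS];	/* stand-in for workstn.tree */

static unsigned long shrink_station(unsigned long nr_shrink)
{
	unsigned long freed = 0;
	unsigned int i;

	for (i = 0; i < NR_SLOTS && nr_shrink; ++i) {
		struct wg *grp = station[i];

		if (grp == NULL || grp->refcount > 1)
			continue;		/* still in use elsewhere */

		station[i] = NULL;		/* delete the index entry ... */
		free(grp);			/* ... and drop the reference it held */
		++freed;
		--nr_shrink;
	}
	return freed;
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < NR_SLOTS; ++i) {
		station[i] = malloc(sizeof(**station));
		station[i]->id = i;
		station[i]->refcount = (i & 1) ? 2 : 1;	/* odd slots stay busy */
	}

	printf("freed %lu idle workgroups\n", shrink_station(~0UL));

	for (i = 0; i < NR_SLOTS; ++i)	/* release what the demo left behind */
		free(station[i]);
	return 0;
}
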
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.5 RESEND 08/10] erofs: introduce erofs shrinker
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 08/10] erofs: introduce erofs shrinker Gao Xiang
@ 2018-07-05  9:09       ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  9:09 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---

change log v0.5 RESEND:
 - add the missing mutex_init(&sbi->umount_mutex);

 fs/erofs/internal.h |  7 +++++
 fs/erofs/super.c    | 15 ++++++++++
 fs/erofs/utils.c    | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index cc898b4..e202ef3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -67,6 +67,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -94,6 +95,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -436,5 +438,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a91399f..9452a89 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -354,7 +354,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -396,6 +398,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -415,6 +423,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -423,6 +435,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -431,6 +445,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 78731c5..685e885 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,20 +29,93 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
-	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
 
-- 
1.9.1


* [WIP] [NOMERGE] [RFC PATCH v0.5 RESEND 10/10] erofs: introduce VLE decompression support
  2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 10/10] erofs: introduce VLE decompression support Gao Xiang
@ 2018-07-05  9:37       ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-05  9:37 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
change log v0.5 RESEND:
 - fix erofs_global_shrink_cnt

 fs/erofs/inode.c     |   6 +-
 fs/erofs/internal.h  |   7 +
 fs/erofs/staging.h   |  38 ++
 fs/erofs/super.c     |  26 ++
 fs/erofs/unzip_vle.c | 971 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h | 194 ++++++++++
 fs/erofs/utils.c     |  51 ++-
 7 files changed, 1287 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 489cd4e..4c5b615 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -193,6 +193,7 @@ static inline bool erofs_workgroup_get(struct erofs_workgroup *grp)
 }
 
 #define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+extern void erofs_workgroup_put(struct erofs_workgroup *grp);
 
 extern struct erofs_workgroup *erofs_find_workgroup(
 	struct super_block *sb, pgoff_t index, bool *tag);
@@ -230,6 +231,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -313,6 +317,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,41 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index d26eeed..09614a7 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -424,6 +431,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -439,6 +452,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -447,6 +466,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -457,6 +480,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 300f556..cc2d892 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,976 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, since limiting the number
+	 * of threads could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_handler {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (w->compressed_deficit) {
+		--w->compressed_deficit;
+		if (cmpxchg(w->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(w, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&w->vector,
+		page, type, &occupied);
+	w->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline
+struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
+	pgoff_t idx, unsigned pageofs,
+	bool *cached_ret,
+	struct z_erofs_vle_workgroup **grp_ret)
+{
+	bool cached;
+	struct erofs_workgroup *egrp = erofs_find_workgroup(sb, idx, &cached);
+	struct z_erofs_vle_workgroup *grp;
+
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+	*cached_ret = cached;
+
+	return cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+}
+
+static inline struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup *grp,
+			  bool cached,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  erofs_wtptr_t *owned_head)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_work_format(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_WORK_FORMAT_LZ4 :
+			Z_EROFS_WORK_FORMAT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently not implemented */
+	BUG();
+#else
+	work = cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+	/* new works have been claimed as type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, cached);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	return work;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+	/* let's claim the following types of work */
+retry:
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static inline void __reset_compressed_pages(
+	struct z_erofs_vle_work_handler *w,
+	struct z_erofs_vle_work *work, bool cached,
+	unsigned clusterpages)
+{
+	if (!cached) {
+		w->compressed_pages =
+			z_erofs_vle_work_uncached_mux(work);
+		w->compressed_deficit = clusterpages;
+		return;
+	}
+
+	/* TODO! get cached pages before submitting io */
+	w->compressed_pages = NULL;
+	w->compressed_deficit = 0;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	work = z_erofs_vle_work_find(sb, index,
+		pageofs, &cached, &grp);
+	if (work != NULL) {
+		BUG_ON(index != grp->obj.index);
+
+		__reset_compressed_pages(w, work, cached, clusterpages);
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		w->owner = false;
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			w->owner = true;
+
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, grp,
+		false, map, index, pageofs, owned_head);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+
+	__reset_compressed_pages(w, work, cached, clusterpages);
+	w->owner = true;
+
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void z_erofs_vle_workgroup_put(struct z_erofs_vle_workgroup *grp)
+{
+	struct z_erofs_vle_work *work = &grp->u.work;
+
+	if (!atomic_dec_return(&grp->obj.refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	z_erofs_vle_workgroup_put(container_of(grp,
+		struct z_erofs_vle_workgroup, obj));
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
+{
+	struct z_erofs_vle_work *zw = w->curr;
+
+	if (zw == NULL)
+		return;
+
+	z_erofs_pagevec_ctor_exit(&w->vector, false);
+	mutex_unlock(&zw->lock);
+	w->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_handler *h,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head,
+				struct list_head *page_pool)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = h->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+	    offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(h);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(h, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= h->owner;
+	work = h->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(h, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(h, newpage,
+			Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* FIXME: the following snippets are for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for (i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* it is impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* it is impossible that 'owned' equals NULL */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_workgroup_put(z_erofs_vle_work_workgroup(work));
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_handler(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate an extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_handler(sb, io, &sync);
+
+	io = tagptr_unfold_ptr(tio);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' can never equal the following values */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain at first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fulfill all compressed pages */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+			page->mapping = NULL;
+
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->obj.index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page,
+		&h, &m_iter, &owned_head, &pagepool);
+
+	z_erofs_vle_work_iter_end(&h);
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(page,
+			&h, &m_iter, &owned_head, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&h);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* let's do synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 143b6c3..6d0595d 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,203 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* let's avoid values that are valid 32-bit kernel addresses */
+
+/* the chained works haven't io submitted (still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works have already io submitted */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	atomic_t refcount;
+#endif
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_set_work_format(grp, fmt) \
+	((grp)->flags = ((grp)->flags & ~Z_EROFS_WORK_FORMAT_MASK) | (fmt))
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index fac60f6..ff2df6c 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -95,7 +96,52 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	if (found > nr_shrink)
+		found = nr_shrink;
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_put(grp);
+
+		++freed;
+		--nr_shrink;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (found && nr_shrink) {
+		first_index += found;
+		goto repeat;
+	}
+
+	return freed;
 }
 
 #endif
@@ -107,9 +153,6 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
 void erofs_register_super(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
-- 
1.9.1


* [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (6 preceding siblings ...)
  2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-06 16:50 ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (9 more replies)
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (2 subsequent siblings)
  10 siblings, 10 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


change log v0.6:
 - preliminarily works (could boot into launcher)
 - still has minor bugs to fix

change log v0.5:
 - add reclaim path
 - almost works, still debugging

change log v0.4:
 - bugfix (runnable now for small files)
 - separate into one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compiling warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - separate into several small patches, maybe more in future patchsets

change log v0.2:
 - use the recent introduced tagptr_t type to manage tagged pointers.
 - bugfix

The patchset is temporarily based on
"erofs: fix erofs_module_init & exit"

The new unzip system is still _buggy_, not for _daily_ use!

Gao Xiang (10):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce superblock registration
  erofs: introduce erofs shrinker
  erofs: introduce workstation for decompression
  erofs: introduce VLE decompression support

 fs/erofs/Kconfig         |   24 +
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |  133 +++++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   57 +++
 fs/erofs/unzip_pagevec.h |  172 +++++++
 fs/erofs/unzip_vle.c     | 1200 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  228 +++++++++
 fs/erofs/unzip_vle_lz4.c |  146 ++++++
 fs/erofs/utils.c         |  233 +++++++++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 +++++
 15 files changed, 2384 insertions(+), 50 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1


* [WIP] [NOMERGE] [RFC PATCH v0.6 01/10] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (8 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Currently, the kernel has scattered tagged-pointer usages hacked
by hand in plain code, without a unified and portable helper set
to make the tagged pointer itself explicit and to wrap such
hand-rolled code, cleaning up the meaningless magic masks spread
all over.

Therefore, this patch introduces simple generic helpers to fold
tags into a pointer value. It currently supports tags in the last
n bits of the pointer, where n can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily relies on the tagged-pointer approach for high
performance and to reduce extra memory allocations.

Link: https://en.wikipedia.org/wiki/Tagged_pointer
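
A minimal usage sketch for illustration only (struct demo, tagptr_demo()
and the tag values below are made up; only the tagptr_* helpers come
from this patch):

#include <linux/bug.h>
#include <linux/tagptr.h>

/* a hypothetical 4-byte-aligned object, so its low 2 bits are free */
struct demo { int payload; };

static void tagptr_demo(struct demo *p)
{
	/* fold a 2-bit tag (here: 2) into the low bits of the pointer */
	tagptr2_t t = tagptr_fold(tagptr2_t, p, 2);

	/* both parts can be recovered independently */
	BUG_ON(tagptr_unfold_ptr(t) != (void *)p);
	BUG_ON(tagptr_unfold_tags(t) != 2);

	/* tags can also be updated in place */
	tagptr_set_tags(&t, 1);		/* tags are now 2 | 1 == 3 */
	tagptr_clear_tags(&t, 2);	/* tags are now 1 */
}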

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagger pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1


* [WIP] [NOMERGE] [RFC PATCH v0.6 02/10] erofs: introduce pagevec for unzip subsystem
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for (index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that the collector's 'next' can never equal 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that the collector's 'next' can never equal 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1


* [WIP] [NOMERGE] [RFC PATCH v0.6 03/10] erofs: add erofs_map_blocks_iter
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 04/10] erofs: add erofs_allocpage Gao Xiang
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  10 +++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9c8696e..ffbd5eb 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,3 +75,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently, only VLE compression is supported.
+	  Enable it at your own risk.
+
+	  If you don't want to use compression feature, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e52252f..2377cf4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -73,6 +73,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..b1e8bbe
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch (vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a number of logical data into
+ * a physical cluster with a fixed size.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch (vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size - 1;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* [walking mode] 'map' has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto unmap_out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch (vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto exact_hitted;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+exact_hitted:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+unmap_out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+out:
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+	return 0;
+}
+
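As a quick sanity check of the arithmetic above, a hedged user-space sketch of
what erofs_map_blocks_iter computes for a HEAD cluster whose logical start lies
before the requested offset, assuming clusterbits = 12 (4KiB clusters) and a
made-up cluster offset; only the fields shown here are modelled:

#include <stdio.h>

int main(void)
{
	const unsigned clustersize = 1U << 12;	/* sbi->clusterbits = 12 */
	unsigned long long la = 10000;		/* requested logical address */
	unsigned clusterofs = 128;		/* hypothetical di_clusterofs */

	unsigned lcn = la / clustersize;	/* logical cluster number: 2 */
	unsigned long long end = (unsigned long long)(lcn + 1) * clustersize;

	/* la points past the cluster head, so the extent starts at the head */
	unsigned long long m_la =
		(unsigned long long)lcn * clustersize | clusterofs;

	printf("lcn=%u m_la=%llu m_llen=%llu m_plen=%u\n",
	       lcn, m_la, end - m_la, clustersize);	/* 2 8320 3968 4096 */
	return 0;
}
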
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 04/10] erofs: add erofs_allocpage
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2377cf4..07bab28 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -381,5 +381,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
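A hedged user-space illustration of the pool-first allocation pattern above
(plain malloc and a hand-rolled singly-linked list stand in for the page
allocator and struct list_head): callers keep spare pages on a local list and
erofs_allocpage only falls back to the real allocator when that list runs dry.

#include <stdio.h>
#include <stdlib.h>

struct upage { struct upage *next; char data[4096]; };

/* pop a spare page from the pool if there is one, otherwise allocate */
static struct upage *allocpage(struct upage **pool)
{
	struct upage *p = *pool;

	if (p) {
		*pool = p->next;
		return p;
	}
	return malloc(sizeof(*p));
}

int main(void)
{
	struct upage *pool = NULL;
	struct upage *a = allocpage(&pool);	/* falls back to malloc */

	a->next = pool;				/* give it back to the pool */
	pool = a;

	struct upage *b = allocpage(&pool);	/* reused from the pool */
	printf("reused: %d\n", a == b);		/* prints 1 */

	free(b);
	return 0;
}
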
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 05/10] erofs: globalize prepare_bio and __submit_bio
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 04/10] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index de99217..2efe69f 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -47,33 +47,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -96,7 +69,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -237,6 +210,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -254,6 +229,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -261,8 +239,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -271,7 +248,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -295,7 +272,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 07bab28..e60f535 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -291,6 +291,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
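
The retry logic in prepare_bio() boils down to "ask for a large bio first,
halve the request on failure, and finally fall back to a single-page bio that
is guaranteed to succeed" (the downscaling only kicks in for PF_MEMALLOC, i.e.
reclaim, contexts in this patch). A user-space sketch of that downscaling idea,
with malloc standing in for bio_alloc and the final allocation pretending to be
__GFP_NOFAIL:

#include <stdio.h>
#include <stdlib.h>

/* try a big allocation first, halve it on failure, guarantee one unit */
static void *alloc_scaled(size_t unit, unsigned nr, unsigned *got)
{
	void *p;

	while (!(p = malloc((size_t)nr * unit))) {
		nr /= 2;
		if (!nr) {
			p = malloc(unit);	/* stands in for __GFP_NOFAIL */
			nr = 1;
			break;
		}
	}
	*got = nr;
	return p;
}

int main(void)
{
	unsigned got;
	void *p = alloc_scaled(4096, 256, &got);

	printf("got %u units of 4096 bytes\n", got);
	free(p);
	return 0;
}
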
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 06/10] erofs: add a generic z_erofs VLE decompressor
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (4 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 07/10] erofs: introduce superblock registration Gao Xiang
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  14 +++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  34 +++++++++++
 fs/erofs/unzip_vle_lz4.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index ffbd5eb..00e811c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -85,3 +85,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the maximum number of physical pages
+	  that a compressed cluster can span.
+
+	  For example, if the files of an image are compressed
+	  in 8KiB units (i.e. two 4KiB pages), this hard limit
+	  must be at least 2; otherwise, the image cannot be
+	  mounted correctly on this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e60f535..038d77b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -162,6 +162,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..143b6c3
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned llen, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..1d51edd
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	j = 0;
+	for (i = 0; i < nr_pages; j = i++) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL) {
+				if (!mirrored[j])
+					kunmap_atomic(src);
+				src = NULL;
+			}
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for (; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(mirrored[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			mirrored[j] = true;
+			break;
+		}
+
+		if (i) {
+			if (src == NULL)
+				src = mirrored[i-1] ?
+					percpu_data + (i-1) * PAGE_SIZE :
+					kmap_atomic(compressed_pages[i-1]);
+
+			memcpy(dst, src + righthalf, pageofs);
+
+			if (!mirrored[i-1])
+				kunmap_atomic(src);
+
+			if (unlikely(i >= clusterpages)) {
+				kunmap_atomic(dst);
+				break;
+			}
+		}
+
+		if (!righthalf)
+			src = NULL;
+		else {
+			src = mirrored[i] ? percpu_data + i * PAGE_SIZE :
+				kmap_atomic(compressed_pages[i]);
+
+			memcpy(dst + pageofs, src, righthalf);
+		}
+
+		kunmap_atomic(dst);
+	}
+
+	if (src != NULL && !mirrored[j])
+		kunmap_atomic(src);
+
+	preempt_enable();
+	return 0;
+}
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned llen,
+				  unsigned short pageofs)
+{
+	return -ENOTSUPP;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for (i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE * i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
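For a PLAIN (uncompressed) cluster the data sits page-aligned on disk but has
to land in the page cache starting at pageofs, so z_erofs_vle_plain_copy()
effectively shifts the byte stream forward: destination page i is built from
the tail of source page i-1 plus the first "righthalf" bytes of source page i
(the per-CPU mirroring handles sources that are also destinations). A toy
user-space sketch of just that shift, with made-up 4-byte "pages":

#include <stdio.h>
#include <string.h>

#define PG 4	/* toy page size */

int main(void)
{
	const unsigned pageofs = 1, righthalf = PG - pageofs;
	char src[2][PG] = { {'a', 'b', 'c', 'd'}, {'e', 'f', 'g', 'h'} };
	char dst[2][PG];
	unsigned i;

	memset(dst, '.', sizeof(dst));
	for (i = 0; i < 2; i++) {
		if (i)	/* head of dst[i] comes from the tail of src[i-1] */
			memcpy(dst[i], src[i - 1] + righthalf, pageofs);
		/* the rest of dst[i] comes from src[i], shifted by pageofs */
		memcpy(dst[i] + pageofs, src[i], righthalf);
	}
	printf("%.4s %.4s\n", dst[0], dst[1]);	/* prints ".abc defg" */
	return 0;
}
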
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 07/10] erofs: introduce superblock registration
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (5 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 08/10] erofs: introduce erofs shrinker Gao Xiang
                     ` (2 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  6 ++++++
 fs/erofs/super.c    |  4 ++++
 fs/erofs/utils.c    | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 038d77b..cc898b4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -65,6 +65,9 @@ struct erofs_fault_info {
 typedef u64 erofs_nid_t;
 
 struct erofs_sb_info {
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -430,5 +433,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 /* utils.c */
 extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
+extern void erofs_register_super(struct super_block *sb);
+extern void erofs_unregister_super(struct super_block *sb);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0dcf9c7..3d286f4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -315,6 +315,8 @@ static int erofs_read_super(struct super_block *sb,
 	snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name);
 	sbi->dev_name[PATH_MAX - 1] = '\0';
 
+	erofs_register_super(sb);
+
 	/*
 	 * We already have a positive dentry, which was instantiated
 	 * by d_make_root. Just need to d_rehash it.
@@ -352,6 +354,8 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	erofs_unregister_super(sb);
+
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dce5177..78731c5 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,3 +29,20 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+static DEFINE_MUTEX(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_register_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_unregister_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_del(&EROFS_SB(sb)->list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 08/10] erofs: introduce erofs shrinker
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (6 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 07/10] erofs: introduce superblock registration Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 09/10] erofs: introduce workstation for decompression Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 10/10] erofs: introduce VLE decompression support Gao Xiang
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  7 +++++
 fs/erofs/super.c    | 15 ++++++++++
 fs/erofs/utils.c    | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index cc898b4..e202ef3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -67,6 +67,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -94,6 +95,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -436,5 +438,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3d286f4..00ec621 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -354,7 +354,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -396,6 +398,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -415,6 +423,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -423,6 +435,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -431,6 +445,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 78731c5..685e885 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,20 +29,93 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
-	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
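
The scan loop above is a simple round-robin over all mounted instances: every
pass takes a fresh run number, each superblock that gets shrunk is stamped with
it and moved to the list tail, and the walk stops as soon as it meets a
superblock already stamped in this pass. A compact user-space sketch of that
fairness scheme (array indices stand in for the superblock list; the names and
budget are made up):

#include <stdio.h>

#define N 3

struct sbi { unsigned run_no; const char *name; };

static struct sbi sb[N] = { {0, "A"}, {0, "B"}, {0, "C"} };
static int order[N] = { 0, 1, 2 };	/* list order, head .. tail */
static unsigned run_no;

/* shrink at most 'budget' superblocks, then stop early */
static void scan(unsigned budget)
{
	unsigned done = 0;
	int i;

	++run_no;
	for (i = 0; i < N && done < budget; i++) {
		struct sbi *s = &sb[order[0]];
		int head = order[0], j;

		if (s->run_no == run_no)	/* already seen this pass */
			break;
		s->run_no = run_no;
		printf("pass %u: shrinking %s\n", run_no, s->name);
		++done;

		/* move it to the tail so the next pass starts elsewhere */
		for (j = 0; j < N - 1; j++)
			order[j] = order[j + 1];
		order[N - 1] = head;
	}
}

int main(void)
{
	scan(2);	/* shrinks A and B, leaving order C, A, B */
	scan(2);	/* starts with C instead of hammering A again */
	return 0;
}
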
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 09/10] erofs: introduce workstation for decompression
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (7 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 08/10] erofs: introduce erofs shrinker Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 10/10] erofs: introduce VLE decompression support Gao Xiang
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h | 60 ++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/super.c    | 12 +++++++++
 fs/erofs/utils.c    | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e202ef3..489cd4e 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -80,6 +80,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* the dedicated workstation for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} workstn;
 #endif
 
 	u32 build_time_nsec;
@@ -150,6 +158,58 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define erofs_workstn_lock(sbi)         spin_lock(&(sbi)->workstn.lock)
+#define erofs_workstn_unlock(sbi)       spin_unlock(&(sbi)->workstn.lock)
+#else
+#define erofs_workstn_lock(sbi)         xa_lock(&(sbi)->workstn.tree)
+#define erofs_workstn_unlock(sbi)       xa_unlock(&(sbi)->workstn.tree)
+#endif
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+static inline int erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&grp->refcount);
+
+	if (unlikely(o <= 0))
+		return -1;
+
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+
+	return 0;
+}
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+
+extern struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag);
+
+extern int erofs_register_workgroup(struct super_block *sb,
+	struct erofs_workgroup *grp, bool tag);
+
+extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+	unsigned long nr_shrink, bool cleanup);
+
+static inline void erofs_workstation_cleanup_all(struct super_block *sb)
+{
+	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
+}
+
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 00ec621..a631ffe 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -285,6 +285,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->workstn.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -355,6 +362,11 @@ static void erofs_put_super(struct super_block *sb)
 	__putname(sbi->dev_name);
 
 	mutex_lock(&sbi->umount_mutex);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	erofs_workstation_cleanup_all(sb);
+#endif
+
 	erofs_unregister_super(sb);
 	mutex_unlock(&sbi->umount_mutex);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 685e885..fac60f6 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,6 +29,76 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+
+/* radix_tree and the future XArray both don't use tagptr_t yet */
+struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn.tree, index);
+	if (grp != NULL) {
+		*tag = radix_tree_exceptional_entry(grp);
+		grp = (void *)((unsigned long)grp &
+			~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		if (erofs_workgroup_get(grp)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		BUG_ON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err = radix_tree_preload(GFP_NOFS);
+
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	erofs_workstn_lock(sbi);
+
+	if (tag)
+		grp = (void *)((unsigned long)grp |
+			1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+	err = radix_tree_insert(&sbi->workstn.tree,
+		grp->index, grp);
+
+	if (!err) {
+		__erofs_workgroup_get(grp);
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	}
+
+	erofs_workstn_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+				       unsigned long nr_shrink,
+				       bool cleanup)
+{
+	return 0;
+}
+
+#endif
 
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -112,6 +182,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		list_move_tail(&sbi->list, &erofs_sb_list);
 		mutex_unlock(&sbi->umount_mutex);
 
+		freed += erofs_shrink_workstation(sbi, nr, false);
 		if (freed >= nr)
 			break;
 	}
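
erofs_workgroup_get() (patch 09) is the usual "take a reference only while the
count is still positive" pattern: read the counter, bail out if it has already
dropped to zero (the group is being freed under RCU), otherwise cmpxchg it one
higher and retry on races. A hedged user-space sketch of the same idea using
C11 atomics (the struct and function names here are illustrative):

#include <stdatomic.h>
#include <stdio.h>

struct workgroup { atomic_int refcount; };

/* returns 0 on success, -1 if the group is already on its way out */
static int workgroup_get(struct workgroup *grp)
{
	int o = atomic_load(&grp->refcount);

	while (o > 0) {
		/* on failure, 'o' is reloaded with the current value */
		if (atomic_compare_exchange_weak(&grp->refcount, &o, o + 1))
			return 0;
	}
	return -1;
}

int main(void)
{
	struct workgroup live, dying;

	atomic_init(&live.refcount, 1);
	atomic_init(&dying.refcount, 0);
	printf("live: %d, dying: %d\n",
	       workgroup_get(&live), workgroup_get(&dying));	/* 0, -1 */
	return 0;
}
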
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.6 10/10] erofs: introduce VLE decompression support
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (8 preceding siblings ...)
  2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 09/10] erofs: introduce workstation for decompression Gao Xiang
@ 2018-07-06 16:50   ` Gao Xiang
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-06 16:50 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |   6 +-
 fs/erofs/internal.h  |   7 +
 fs/erofs/staging.h   |  38 ++
 fs/erofs/super.c     |  26 ++
 fs/erofs/unzip_vle.c | 982 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h | 194 ++++++++++
 fs/erofs/utils.c     |  48 ++-
 7 files changed, 1295 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 489cd4e..4c5b615 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -193,6 +193,7 @@ static inline bool erofs_workgroup_get(struct erofs_workgroup *grp)
 }
 
 #define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+extern void erofs_workgroup_put(struct erofs_workgroup *grp);
 
 extern struct erofs_workgroup *erofs_find_workgroup(
 	struct super_block *sb, pgoff_t index, bool *tag);
@@ -230,6 +231,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -313,6 +317,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..c9cd542 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,41 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a631ffe..546a308 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -424,6 +431,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -439,6 +452,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -447,6 +466,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -457,6 +480,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index b1e8bbe..447c37b 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,987 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads; limiting the number of
+	 * threads can improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_handler {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (w->compressed_deficit) {
+		--w->compressed_deficit;
+		if (cmpxchg(w->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(w, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&w->vector,
+		page, type, &occupied);
+	w->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline
+struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
+	pgoff_t idx, unsigned pageofs,
+	bool *cached_ret,
+	struct z_erofs_vle_workgroup **grp_ret)
+{
+	bool cached;
+	struct erofs_workgroup *egrp = erofs_find_workgroup(sb, idx, &cached);
+	struct z_erofs_vle_workgroup *grp;
+
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+	*cached_ret = cached;
+
+	return cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+}
+
+static inline struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup *grp,
+			  bool cached,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  erofs_wtptr_t *owned_head)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_work_format(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_WORK_FORMAT_LZ4 :
+			Z_EROFS_WORK_FORMAT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently not implemented */
+	BUG();
+#else
+	work = cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+	/* new works have been claimed as type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, cached);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	return work;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+	/* let's claim these following types of work */
+retry:
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static inline void __reset_compressed_pages(
+	struct z_erofs_vle_work_handler *w,
+	struct z_erofs_vle_work *work, bool cached,
+	unsigned clusterpages)
+{
+	if (!cached) {
+		w->compressed_pages =
+			z_erofs_vle_work_uncached_mux(work);
+		w->compressed_deficit = clusterpages;
+		return;
+	}
+
+	/* TODO! get cached pages before submitting io */
+	w->compressed_pages = NULL;
+	w->compressed_deficit = 0;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	work = z_erofs_vle_work_find(sb, index,
+		pageofs, &cached, &grp);
+	if (work != NULL) {
+		BUG_ON(index != grp->obj.index);
+
+		__reset_compressed_pages(w, work, cached, clusterpages);
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		w->owner = false;
+		/* claim the work if it can */
+		if (try_to_claim_work(work, owned_head, cached))
+			w->owner = true;
+
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, grp,
+		false, map, index, pageofs, owned_head);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+
+	__reset_compressed_pages(w, work, cached, clusterpages);
+	w->owner = true;
+
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->u.work;
+
+	if (!atomic_dec_return(&vgrp->obj.refcount))
+		call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work);
+
+	erofs_workgroup_put(&grp->obj);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
+{
+	struct z_erofs_vle_work *work = w->curr;
+
+	if (work == NULL)
+		return;
+
+	/*
+	 * if all pending pages are added, don't hold work reference
+	 * any longer if the current handler is not the owner.
+	 */
+	if (!w->owner)
+		z_erofs_vle_work_release(work);
+
+	z_erofs_pagevec_ctor_exit(&w->vector, false);
+	mutex_unlock(&work->lock);
+	w->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_handler *h,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head,
+				struct list_head *page_pool)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = true;
+	struct z_erofs_vle_work *work = h->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+            offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(h);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(h, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= h->owner;
+	work = h->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(h, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(h, newpage,
+			Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handing cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* FIXME: the following snippets are for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for(i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs);
+	if (err != -ENOTSUPP)
+		goto out;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (work->vcnt == nr_pages)
+		goto skip_allocpage;
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* 'owned' can never equal Z_EROFS_WORK_TPTR_TAIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* 'owned' can never be NIL here */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_work_release(work);
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_handler(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate an extra io descriptor for async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_handler(sb, io, &sync);
+
+	io = tagptr_unfold_ptr(tio);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' can never equal either of the following */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain at first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fill in all compressed page slots */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+			page->mapping = NULL;
+
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->obj.index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page,
+		&h, &m_iter, &owned_head, &pagepool);
+
+	z_erofs_vle_work_iter_end(&h);
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(page,
+			&h, &m_iter, &owned_head, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&h);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* do the decompression synchronously */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 143b6c3..6d0595d 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,203 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* magic values chosen to avoid valid (even 32-bit) kernel addresses */
+
+/* the chained works have not had their I/O submitted yet (chain still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* the chained works have already had their I/O submitted */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	atomic_t refcount;
+#endif
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-purpose: used for both compressed and decompressed pages */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_set_work_format(grp, fmt) \
+	((grp)->flags = ((grp)->flags & ~Z_EROFS_WORK_FORMAT_MASK) | (fmt))
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * the atomic value packs two fields:
+ *  - waiters (aka. ongoing_packs): completions still needed to unlock
+ *    the page (kept in the low COUNT bits)
+ *  - sub-index: 0 for a partial page, >= 1 for a full page (high bits)
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index fac60f6..083a07f 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -95,7 +96,49 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink) {
+		first_index += i;
+		goto repeat;
+	}
+	return freed;
 }
 
 #endif
@@ -107,9 +150,6 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
 void erofs_register_super(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
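
A side note on the z_erofs_onlinepage_* helpers added in the patch above:
page_private() holds one atomic value that packs the page sub-index in
the high bits and a small count of outstanding completions (the
"waiters") in the low Z_EROFS_ONLINEPAGE_COUNT_BITS bits; the page is
unlocked only once that count drops to zero. A rough, self-contained
userspace sketch of the same packing scheme (illustration only; every
name below is made up for this example):

#include <stdatomic.h>
#include <stdio.h>

#define COUNT_BITS	2
#define COUNT_MASK	((1u << COUNT_BITS) - 1)

/* pack a page sub-index together with a pending-completion count */
static unsigned int pack(unsigned int sub_index, unsigned int pending)
{
	return (sub_index << COUNT_BITS) | (pending & COUNT_MASK);
}

int main(void)
{
	atomic_uint state;
	unsigned int i, v;

	/* sub-index 3, two I/Os still outstanding on this page */
	atomic_init(&state, pack(3, 2));

	for (i = 0; i < 2; ++i) {
		/* each completed I/O drops the count by one */
		v = atomic_fetch_sub(&state, 1) - 1;

		if (!(v & COUNT_MASK))
			printf("all I/O done for sub-index %u, unlock the page\n",
			       v >> COUNT_BITS);
	}
	return 0;
}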

* [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (7 preceding siblings ...)
  2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-09 19:17 ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (9 more replies)
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  10 siblings, 10 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


change log v0.7:
 - several bugfixes (buffer overflow, shrinker, ownership, etc.)
 - all features available
 - it works now, but random read still needs more work compared
   with the old decompression version.

change log v0.6:
 - preliminary work (could boot into launcher)
 - still have minor bugs to fix

change log v0.5:
 - add reclaim path
 - almost works, still debugging

change log v0.4:
 - bugfix (runnable now for small files)
 - separate into one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compile warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - split into several small patches, maybe more in future patchsets

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bugfix

The patchset is temporarily based on
"erofs: fix erofs_module_init & exit"

Gao Xiang (10):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce superblock registration
  erofs: introduce erofs shrinker
  erofs: introduce workstation for decompression
  erofs: introduce VLE decompression support

 fs/erofs/Kconfig         |   24 +
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |  133 +++++
 fs/erofs/staging.h       |   42 ++
 fs/erofs/super.c         |   57 +++
 fs/erofs/unzip_pagevec.h |  172 +++++++
 fs/erofs/unzip_vle.c     | 1200 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  228 +++++++++
 fs/erofs/unzip_vle_lz4.c |  146 ++++++
 fs/erofs/utils.c         |  233 +++++++++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 +++++
 15 files changed, 2384 insertions(+), 50 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 01/10] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (8 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Currently the kernel has scattered tagged-pointer usages hacked
by hand in plain code, without a unified and portable set of
helpers to make the tagged pointer explicit and to wrap such code,
so meaningless magic masks end up spread all over the tree.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. Tags currently occupy the last n low
bits of the pointer, where n is selected by the user.

In addition, it will also be used by the upcoming EROFS filesystem,
which relies heavily on the tagged-pointer approach for high
performance and to avoid extra memory allocation.

Link: https://en.wikipedia.org/wiki/Tagged_pointer
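
To make the idea concrete, here is a minimal, self-contained userspace
sketch of the concept (illustration only; it is not part of this patch
and not the tagptr API itself). The low bits of a sufficiently aligned
pointer carry a small tag and are masked off again before the pointer
is dereferenced; the tagptr{1..4}_t helpers introduced below do the
same with compile-time checking of the tag width.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* borrow the low 2 bits of a 4-byte-aligned pointer for a tag */
#define TAG_MASK	((uintptr_t)3)

struct tagged_ptr { uintptr_t v; };

static struct tagged_ptr fold(void *ptr, unsigned int tag)
{
	/* the pointer must be aligned enough to leave the tag bits free */
	assert(!((uintptr_t)ptr & TAG_MASK) && tag <= TAG_MASK);
	return (struct tagged_ptr){ (uintptr_t)ptr | tag };
}

static void *unfold_ptr(struct tagged_ptr t)
{
	return (void *)(t.v & ~TAG_MASK);
}

static unsigned int unfold_tag(struct tagged_ptr t)
{
	return (unsigned int)(t.v & TAG_MASK);
}

int main(void)
{
	static int obj;	/* ints are at least 4-byte aligned on common ABIs */
	struct tagged_ptr t = fold(&obj, 2);

	printf("pointer matches: %d, tag: %u\n",
	       unfold_ptr(t) == (void *)&obj, unfold_tag(t));
	return 0;
}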

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* catch broken usage such as "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 02/10] erofs: introduce pagevec for unzip subsystem
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that ctor->next never equals 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that ctor->next never equals 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 03/10] erofs: add erofs_map_blocks_iter
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 04/10] erofs: add erofs_allocpage Gao Xiang
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  10 +++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9c8696e..ffbd5eb 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,3 +75,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compresssion Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e52252f..2377cf4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -73,6 +73,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..b1e8bbe
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a number of logical data into
+ * a physical cluster with a fixed size.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size - 1;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* [walking mode] 'map' has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto unmap_out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto exact_hitted;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+exact_hitted:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+unmap_out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+out:
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 04/10] erofs: add erofs_allocpage
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2377cf4..07bab28 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -381,5 +381,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 05/10] erofs: globalize prepare_bio and __submit_bio
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 04/10] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index de99217..2efe69f 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -47,33 +47,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -96,7 +69,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -237,6 +210,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -254,6 +229,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -261,8 +239,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -271,7 +248,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -295,7 +272,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 07bab28..e60f535 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -291,6 +291,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 06/10] erofs: add a generic z_erofs VLE decompressor
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (4 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 07/10] erofs: introduce superblock registration Gao Xiang
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  14 ++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  35 ++++++++
 fs/erofs/unzip_vle_lz4.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index ffbd5eb..00e811c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -85,3 +85,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the maximum number of pages of a VLE
+	  compressed cluster.
+
+	  For example, if the files of an image are compressed
+	  in 8k units, the hard limit must not be less than 2
+	  (e.g. two 4k pages per cluster). Otherwise, the image
+	  cannot be mounted correctly by this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e60f535..038d77b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -162,6 +162,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..8e23e44
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned outlen, unsigned short pageofs,
+	void (*endio)(struct page *));
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..fda8e6d
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	j = 0;
+	for (i = 0; i < nr_pages; j = i++) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL) {
+				if (!mirrored[j])
+					kunmap_atomic(src);
+				src = NULL;
+			}
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for (; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(mirrored[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			mirrored[j] = true;
+			break;
+		}
+
+		if (i) {
+			if (src == NULL)
+				src = mirrored[i-1] ?
+					percpu_data + (i-1) * PAGE_SIZE :
+					kmap_atomic(compressed_pages[i-1]);
+
+			memcpy(dst, src + righthalf, pageofs);
+
+			if (!mirrored[i-1])
+				kunmap_atomic(src);
+
+			if (unlikely(i >= clusterpages)) {
+				kunmap_atomic(dst);
+				break;
+			}
+		}
+
+		if (!righthalf)
+			src = NULL;
+		else {
+			src = mirrored[i] ? percpu_data + i * PAGE_SIZE :
+				kmap_atomic(compressed_pages[i]);
+
+			memcpy(dst + pageofs, src, righthalf);
+		}
+
+		kunmap_atomic(dst);
+	}
+
+	if (src != NULL && !mirrored[j])
+		kunmap_atomic(src);
+
+	preempt_enable();
+	return 0;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned outlen,
+				  unsigned short pageofs,
+				  void (*endio)(struct page *))
+{
+	void *vin, *vout;
+	unsigned nr_pages, i, j;
+	int ret;
+
+	if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE)
+		return -ENOTSUPP;
+
+	nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE);
+
+	if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else
+		vin = erofs_vmap(compressed_pages, clusterpages);
+
+	preempt_disable();
+	vout = erofs_pcpubuf[smp_processor_id()].data;
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, outlen);
+
+	if (ret >= 0) {
+		outlen = ret;
+		ret = 0;
+	}
+
+	for (i = 0; i < nr_pages; ++i) {
+		j = min((unsigned)PAGE_SIZE - pageofs, outlen);
+
+		if (pages[i] != NULL) {
+			if (ret < 0)
+				SetPageError(pages[i]);
+			else if (clusterpages == 1 && pages[i] == compressed_pages[0])
+				memcpy(vin + pageofs, vout + pageofs, j);
+			else {
+				void *dst = kmap_atomic(pages[i]);
+
+				memcpy(dst + pageofs, vout + pageofs, j);
+				kunmap_atomic(dst);
+			}
+			endio(pages[i]);
+		}
+		vout += PAGE_SIZE;
+		outlen -= j;
+		pageofs = 0;
+	}
+	preempt_enable();
+
+	if (clusterpages == 1)
+		kunmap_atomic(vin);
+	else
+		erofs_vunmap(vin, clusterpages);
+
+	return ret;
+}
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for (i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE * i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1) {
+		vin = kmap_atomic(compressed_pages[0]);
+	} else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else
+			erofs_vunmap(vin, clusterpages);
+	} else {
+		preempt_enable();
+	}
+
+	return ret;
+}
+
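
Both decompressors above rely on the same idiom: a fixed-size per-CPU bounce buffer that is only valid while preemption is disabled. A stripped-down sketch of that idiom follows (kernel-context sketch only; fill_scratch() is a hypothetical consumer, not an erofs function):

/* same idiom as erofs_pcpubuf[] above: one scratch slot per CPU */
static struct {
	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
} scratch[NR_CPUS];

static void with_pcpu_scratch(void (*fill_scratch)(void *buf, size_t len))
{
	char *buf;

	/*
	 * preemption must stay disabled while the buffer is in use;
	 * otherwise the task could migrate and then race for the same
	 * slot with whatever runs next on the original CPU.
	 */
	preempt_disable();
	buf = scratch[smp_processor_id()].data;
	fill_scratch(buf, sizeof(scratch[0].data));
	preempt_enable();
}
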
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 07/10] erofs: introduce superblock registration
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (5 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 08/10] erofs: introduce erofs shrinker Gao Xiang
                     ` (2 subsequent siblings)
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  6 ++++++
 fs/erofs/super.c    |  4 ++++
 fs/erofs/utils.c    | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 038d77b..cc898b4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -65,6 +65,9 @@ struct erofs_fault_info {
 typedef u64 erofs_nid_t;
 
 struct erofs_sb_info {
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -430,5 +433,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 /* utils.c */
 extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
+extern void erofs_register_super(struct super_block *sb);
+extern void erofs_unregister_super(struct super_block *sb);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0dcf9c7..3d286f4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -315,6 +315,8 @@ static int erofs_read_super(struct super_block *sb,
 	snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name);
 	sbi->dev_name[PATH_MAX - 1] = '\0';
 
+	erofs_register_super(sb);
+
 	/*
 	 * We already have a positive dentry, which was instantiated
 	 * by d_make_root. Just need to d_rehash it.
@@ -352,6 +354,8 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	erofs_unregister_super(sb);
+
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dce5177..78731c5 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,3 +29,20 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+static DEFINE_MUTEX(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_register_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_unregister_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_del(&EROFS_SB(sb)->list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 08/10] erofs: introduce erofs shrinker
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (6 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 07/10] erofs: introduce superblock registration Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 09/10] erofs: introduce workstation for decompression Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 10/10] erofs: introduce VLE decompression support Gao Xiang
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  7 +++++
 fs/erofs/super.c    | 15 ++++++++++
 fs/erofs/utils.c    | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index cc898b4..e202ef3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -67,6 +67,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -94,6 +95,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -436,5 +438,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3d286f4..00ec621 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -354,7 +354,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -396,6 +398,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -415,6 +423,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -423,6 +435,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -431,6 +445,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 78731c5..685e885 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,20 +29,93 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
-	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 09/10] erofs: introduce workstation for decompression
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (7 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 08/10] erofs: introduce erofs shrinker Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 10/10] erofs: introduce VLE decompression support Gao Xiang
  9 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h | 61 ++++++++++++++++++++++++++++++++++++++++
 fs/erofs/super.c    | 12 ++++++++
 fs/erofs/utils.c    | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 151 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e202ef3..2c20492 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -80,6 +80,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* the dedicated workstation for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} workstn;
 #endif
 
 	u32 build_time_nsec;
@@ -150,6 +158,59 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define erofs_workstn_lock(sbi)         spin_lock(&(sbi)->workstn.lock)
+#define erofs_workstn_unlock(sbi)       spin_unlock(&(sbi)->workstn.lock)
+#else
+#define erofs_workstn_lock(sbi)         xa_lock(&(sbi)->workstn.tree)
+#define erofs_workstn_unlock(sbi)       xa_unlock(&(sbi)->workstn.tree)
+#endif
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
+{
+	int o;
+
+repeat:
+	o = atomic_read(&grp->refcount);
+
+	if (unlikely(o <= 0))
+		return -1;
+
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+
+	*ocnt = o;
+	return 0;
+}
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+
+extern struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag);
+
+extern int erofs_register_workgroup(struct super_block *sb,
+	struct erofs_workgroup *grp, bool tag);
+
+extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+	unsigned long nr_shrink, bool cleanup);
+
+static inline void erofs_workstation_cleanup_all(struct super_block *sb)
+{
+	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
+}
+
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
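
erofs_workgroup_get() above is a lockless "take a reference only if the object is still alive" primitive. A compact userspace model of the same cmpxchg loop, written with C11 atomics instead of the kernel's atomic_t (and returning true/false rather than 0/-1), could look like this:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcount;	/* <= 0 means the object is going away */
};

static bool obj_tryget(struct obj *o, int *oldcount)
{
	int cnt = atomic_load(&o->refcount);

	while (cnt > 0) {
		/*
		 * only bump the count if nobody changed it in the
		 * meantime; a failed CAS reloads 'cnt' and retries.
		 */
		if (atomic_compare_exchange_weak(&o->refcount, &cnt, cnt + 1)) {
			*oldcount = cnt;
			return true;
		}
	}
	return false;	/* already dying: the caller must redo the lookup */
}
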
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 00ec621..a631ffe 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -285,6 +285,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->workstn.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -355,6 +362,11 @@ static void erofs_put_super(struct super_block *sb)
 	__putname(sbi->dev_name);
 
 	mutex_lock(&sbi->umount_mutex);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	erofs_workstation_cleanup_all(sb);
+#endif
+
 	erofs_unregister_super(sb);
 	mutex_unlock(&sbi->umount_mutex);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 685e885..ab37072 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,6 +29,83 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+
+/* neither the radix tree nor the future XArray uses tagptr_t yet */
+struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+	int oldcount;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn.tree, index);
+	if (grp != NULL) {
+		*tag = radix_tree_exceptional_entry(grp);
+		grp = (void *)((unsigned long)grp &
+			~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		if (erofs_workgroup_get(grp, &oldcount)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		/* drop the shrink count added by erofs_workgroup_put */
+		if (unlikely(oldcount == 1))
+			atomic_long_dec(&erofs_global_shrink_cnt);
+		BUG_ON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp->refcount should not be less than 1 */
+	BUG_ON(!atomic_read(&grp->refcount));
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	erofs_workstn_lock(sbi);
+
+	if (tag)
+		grp = (void *)((unsigned long)grp |
+			RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+	err = radix_tree_insert(&sbi->workstn.tree,
+		grp->index, grp);
+
+	if (!err)
+		__erofs_workgroup_get(grp);
+
+	erofs_workstn_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+				       unsigned long nr_shrink,
+				       bool cleanup)
+{
+	return 0;
+}
+
+#endif
 
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -37,9 +114,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
 void erofs_register_super(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -112,6 +186,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		list_move_tail(&sbi->list, &erofs_sb_list);
 		mutex_unlock(&sbi->umount_mutex);
 
+		freed += erofs_shrink_workstation(sbi, nr, false);
 		if (freed >= nr)
 			break;
 	}
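
The tag handled by erofs_find_workgroup()/erofs_register_workgroup() above travels inside the stored pointer itself, in a low bit that aligned pointers never use. Stripped of the radix-tree details, the encode/decode step amounts to the following sketch (TAG_BIT and the helper names are illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdint.h>

/* aligned objects leave their low pointer bits clear, so borrow one */
#define TAG_BIT		1UL	/* assumes >= 2-byte object alignment */

static inline void *tag_ptr(void *ptr, bool tag)
{
	return (void *)((uintptr_t)ptr | (tag ? TAG_BIT : 0));
}

static inline bool ptr_tag(const void *tagged)
{
	return (uintptr_t)tagged & TAG_BIT;
}

static inline void *untag_ptr(const void *tagged)
{
	return (void *)((uintptr_t)tagged & ~TAG_BIT);
}
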
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [WIP] [NOMERGE] [RFC PATCH v0.7 10/10] erofs: introduce VLE decompression support
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (8 preceding siblings ...)
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 09/10] erofs: introduce workstation for decompression Gao Xiang
@ 2018-07-09 19:17   ` Gao Xiang
  2018-07-13 13:17     ` [PATCH 1/2] temp commit 1 Gao Xiang
  9 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-09 19:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |   6 +-
 fs/erofs/internal.h  |   8 +
 fs/erofs/staging.h   |  46 +++
 fs/erofs/super.c     |  26 ++
 fs/erofs/unzip_vle.c | 991 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h | 194 ++++++++++
 fs/erofs/utils.c     |  59 ++-
 7 files changed, 1327 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2c20492..8d7b6ab 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -195,6 +195,8 @@ static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
 
 #define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
 
+extern int erofs_workgroup_put(struct erofs_workgroup *grp);
+
 extern struct erofs_workgroup *erofs_find_workgroup(
 	struct super_block *sb, pgoff_t index, bool *tag);
 
@@ -231,6 +233,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -314,6 +319,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..47c9708d 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,49 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+
+	return kvmalloc(n * size, flags);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a631ffe..546a308 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -424,6 +431,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -439,6 +452,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -447,6 +466,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -457,6 +480,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index b1e8bbe..04c50cd 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,996 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads; limiting the number of
+	 * threads could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+struct z_erofs_vle_work_handler {
+	bool owner;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page)
+{
+	/* the following is a lockless approach */
+	while (w->compressed_deficit) {
+		--w->compressed_deficit;
+		if (cmpxchg(w->compressed_pages++, NULL, page) == NULL)
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must hold work->lock */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_handler *w,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority to the compressed data storage */
+	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(w, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&w->vector,
+		page, type, &occupied);
+	w->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline
+struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
+	pgoff_t idx, unsigned pageofs,
+	bool *cached_ret,
+	struct z_erofs_vle_workgroup **grp_ret)
+{
+	bool cached;
+	struct erofs_workgroup *egrp = erofs_find_workgroup(sb, idx, &cached);
+	struct z_erofs_vle_workgroup *grp;
+
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+	*cached_ret = cached;
+
+	return cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+}
+
+static inline struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup *grp,
+			  bool cached,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  erofs_wtptr_t *owned_head)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_work_format(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_WORK_FORMAT_LZ4 :
+			Z_EROFS_WORK_FORMAT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently not implemented */
+	BUG();
+#else
+	work = cached ? z_erofs_vle_work_cached(grp, pageofs) :
+		z_erofs_vle_work_uncached(grp, pageofs);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+	/* new works have been claimed as type 1 */
+	WRITE_ONCE(work->next, *owned_head);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, cached);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	return work;
+}
+
+static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
+     erofs_wtptr_t *owned_head, bool cached)
+{
+	/* let's claim the following types of work */
+retry:
+	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
+		/* type 2, link to an existing chain */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
+			Z_EROFS_WORK_TPTR_TAIL))
+			goto retry;
+
+		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
+		/* type 1 */
+		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_NIL, *owned_head),
+			Z_EROFS_WORK_TPTR_NIL))
+			goto retry;
+
+		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static inline void __reset_compressed_pages(
+	struct z_erofs_vle_work_handler *w,
+	struct z_erofs_vle_work *work, bool cached,
+	unsigned clusterpages)
+{
+	if (!cached) {
+		w->compressed_pages =
+			z_erofs_vle_work_uncached_mux(work);
+		w->compressed_deficit = clusterpages;
+		return;
+	}
+
+	/* TODO! get cached pages before submitting io */
+	w->compressed_pages = NULL;
+	w->compressed_deficit = 0;
+}
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       erofs_wtptr_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	bool cached;
+	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(map->m_pa % EROFS_BLKSIZ);
+
+repeat:
+	work = z_erofs_vle_work_find(sb, index,
+		pageofs, &cached, &grp);
+	if (work != NULL) {
+		BUG_ON(index != grp->obj.index);
+
+		__reset_compressed_pages(w, work, cached, clusterpages);
+		BUG_ON(work->pageofs != pageofs);
+
+		mutex_lock(&work->lock);
+
+		if (grp->llen < map->m_llen)
+			grp->llen = map->m_llen;
+
+		w->owner = false;
+		/* claim the work if possible */
+		if (try_to_claim_work(work, owned_head, cached))
+			w->owner = true;
+
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, grp,
+		false, map, index, pageofs, owned_head);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+
+	__reset_compressed_pages(w, work, cached, clusterpages);
+	w->owner = true;
+
+	mutex_lock(&work->lock);
+
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->u.work;
+
+	call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work);
+
+	erofs_workgroup_put(&grp->obj);
+}
+
+static inline void
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
+{
+	struct z_erofs_vle_work *work = w->curr;
+
+	if (work == NULL)
+		return;
+
+	/*
+	 * once all pending pages are added, don't hold the work reference
+	 * any longer if the current handler is not the owner.
+	 */
+	if (!w->owner)
+		z_erofs_vle_work_release(work);
+
+	z_erofs_pagevec_ctor_exit(&w->vector, false);
+	mutex_unlock(&work->lock);
+	w->curr = NULL;
+}
+
+static int z_erofs_do_read_page(struct page *page,
+				struct z_erofs_vle_work_handler *h,
+				struct erofs_map_blocks_iter *m,
+				erofs_wtptr_t *owned_head,
+				struct list_head *page_pool)
+{
+	struct inode *const inode = page->mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	const loff_t offset = page_offset(page);
+	bool owned = h->owner;
+	struct z_erofs_vle_work *work = h->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, split, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	split = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= m->map.m_la &&
+	    offset + cur < m->map.m_la + m->map.m_llen)
+		goto hitted;
+
+	/* go on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	z_erofs_vle_work_iter_end(h);
+
+	m->map.m_la = offset + cur;
+	m->map.m_llen = 0;
+	err = erofs_map_blocks_iter(inode, &m->map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(m->map.m_plen != 1 << EROFS_SB(sb)->clusterbits);
+	BUG_ON(m->map.m_pa % EROFS_BLKSIZ);
+
+	err = z_erofs_vle_work_iter_begin(h, sb, &m->map, owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= h->owner;
+	work = h->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
+	if (unlikely(!(m->map.m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!split ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(h, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(h, newpage,
+			Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - m->map.m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++split;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	m->map.m_llen = offset + cur - m->map.m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK split: %u map->m_llen %llu",
+		__func__, page, split, m->map.m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool async = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (async)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		if (unlikely(err))
+			SetPageError(page);
+
+		/* FIXME: the following snippets are for cached work */
+		else if (0)
+			SetPageUptodate(page);
+
+		if (0)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_work *work,
+	bool cached, struct list_head *page_pool)
+{
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	unsigned sparsemem_pages = 0;
+#endif
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_workgroup *grp;
+	void *vout;
+	int err;
+
+	BUG_ON(!READ_ONCE(work->nr_pages));
+	might_sleep();
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc_array(nr_pages,
+			sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	if (cached) {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_cached_managed(grp);
+	} else {
+		grp = z_erofs_vle_work_workgroup(work);
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+
+		for (i = 0; i < clusterpages; ++i) {
+			unsigned pagenr;
+
+			BUG_ON(compressed_pages[i] == NULL);
+			page = compressed_pages[i];
+
+			if (page->mapping == NULL)
+				continue;
+
+			pagenr = z_erofs_onlinepage_index(page);
+
+			BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+			BUG_ON(pages[pagenr] != NULL);
+			++sparsemem_pages;
+#endif
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs,
+		z_erofs_onlinepage_endio);
+	if (err != -ENOTSUPP)
+		goto out_percpu;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (sparsemem_pages >= nr_pages) {
+		BUG_ON(sparsemem_pages > nr_pages);
+		goto skip_allocpage;
+	}
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+out_percpu:
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		if (!cached)
+			WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	mutex_unlock(&work->lock);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	erofs_wtptr_t owned = io->head;
+	struct z_erofs_vle_work *work;
+	bool cached;
+
+	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	do {
+		/* it's impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+
+		/* it's impossible that 'owned' equals Z_EROFS_WORK_TPTR_NIL */
+		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned);
+		cached = tagptr_unfold_tags(owned);
+
+		owned = READ_ONCE(work->next);
+		z_erofs_vle_unzip(sb, work, cached, page_pool);
+
+		z_erofs_vle_work_release(work);
+	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline tagptr1_t prepare_io_handler(
+	struct super_block *sb,
+	struct z_erofs_vle_unzip_io *io,
+	bool *sync)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	/* use the existing on-stack dummy descriptor for sync mode */
+	if (io != NULL) {
+		*sync = true;
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+
+		return tagptr_fold(tagptr1_t, io, 0);
+	}
+
+	/* allocate an extra io descriptor in async mode */
+	*sync = false;
+
+	iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+		GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(iosb == NULL);
+
+	iosb->sb = sb;
+	io = &iosb->io;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+
+	return tagptr_fold(tagptr1_t, io, 1);
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   erofs_wtptr_t owned_head,
+				   struct list_head *page_pool,
+				   struct z_erofs_vle_unzip_io *io)
+{
+	struct bio *bio = NULL;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	pgoff_t last_page;
+	bool sync;
+	unsigned bios_submitted;
+	tagptr1_t tio;
+
+	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+		return false;
+
+	tio = prepare_io_handler(sb, io, &sync);
+
+	io = tagptr_unfold_ptr(tio);
+	io->head = owned_head;
+
+	bios_submitted = 0;
+
+	do {
+		struct z_erofs_vle_work *work;
+		struct z_erofs_vle_workgroup *grp;
+		bool cached, locked;
+		struct page **compressed_pages;
+		pgoff_t current_page;
+		unsigned i;
+		int err;
+
+		/* 'owned_head' must never equal either of the following */
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+
+		work = tagptr_unfold_ptr(owned_head);
+		cached = tagptr_unfold_tags(owned_head);
+
+		/* close the owned chain first */
+		owned_head = tagptr_cmpxchg(&work->next,
+			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
+
+		grp = z_erofs_vle_work_workgroup(work);
+
+		BUG_ON(cached);
+
+		locked = false;
+		if (unlikely(mutex_is_locked(&work->lock))) {
+			mutex_lock(&work->lock);
+			locked = true;
+		}
+
+		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		/* fill in all the compressed page slots */
+		for (i = 0; i < clusterpages; ++i) {
+			struct page *page;
+
+			if (READ_ONCE(compressed_pages[i]) != NULL)
+				continue;
+
+			page = erofs_allocpage(page_pool, GFP_KERNEL);
+			page->mapping = NULL;
+
+			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+				list_add(&page->lru, page_pool);
+		}
+
+		if (unlikely(locked))
+			mutex_unlock(&work->lock);
+
+		current_page = grp->obj.index;
+		i = 0;
+
+		if (bio != NULL && last_page + 1 != current_page) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+repeat:
+		if (bio == NULL) {
+			bio = prepare_bio(sb, current_page,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(tio);
+
+			++bios_submitted;
+		}
+
+		err = bio_add_page(bio, compressed_pages[i], PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		last_page = current_page;
+		++current_page;
+
+		if (++i < clusterpages)
+			goto repeat;
+	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(tio), bios_submitted);
+	return true;
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL, .owner = true };
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct super_block *sb;
+	struct z_erofs_vle_unzip_io io;
+	LIST_HEAD(pagepool);
+
+	int err = z_erofs_do_read_page(page,
+		&h, &m_iter, &owned_head, &pagepool);
+
+	z_erofs_vle_work_iter_end(&h);
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	sb = page->mapping->host->i_sb;
+
+	if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+		goto out;
+
+	/* wait until all bios are completed */
+	wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+	/* synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io, &pagepool);
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct erofs_map_blocks_iter m_iter = {
+		.map = { .m_llen = 0, .m_plen = 0 },
+		.mpage = NULL
+	};
+	struct z_erofs_vle_work_handler h = { .curr = NULL, .owner = true };
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traverse in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(page,
+			&h, &m_iter, &owned_head, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+	z_erofs_vle_work_iter_end(&h);
+
+	if (!sync)
+		z_erofs_vle_submit_all(sb, owned_head, &pagepool, NULL);
+	else {
+		struct z_erofs_vle_unzip_io io;
+
+		if (!z_erofs_vle_submit_all(sb, owned_head, &pagepool, &io))
+			goto out;
+
+		/* wait until all bios are completed */
+		wait_event(io.u.wait, !atomic_read(&io.pending_bios));
+
+		/* synchronous decompression */
+		z_erofs_vle_unzip_all(sb, &io, &pagepool);
+	}
+
+out:
+	if (m_iter.mpage != NULL)
+		put_page(m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 8e23e44..7542aa8 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,203 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/* (uncached/cached) work tagged pointer */
+typedef tagptr1_t       erofs_wtptr_t;
+
+/* avoid values that could be valid 32-bit kernel addresses */
+
+/* I/O for the chained works has not been submitted yet (chain still open) */
+#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
+/* I/O for the chained works has already been submitted */
+#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
+
+
+#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
+#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
+	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
+
+#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ * L: Field should be protected by the work lock 'lock'.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+	struct mutex lock;
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	atomic_t refcount;
+#endif
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	/* L: the next owned work */
+	erofs_wtptr_t next;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_WORK_FORMAT_PLAIN       0
+#define Z_EROFS_WORK_FORMAT_LZ4         1
+#define Z_EROFS_WORK_FORMAT_MASK        1
+
+struct z_erofs_vle_work_uncached {
+	struct z_erofs_vle_work work;
+
+	/* multi-usage (both used for decompressed / compressed pages) */
+	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_cached_header {
+	struct z_erofs_vle_work work;
+
+	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
+};
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	union {
+		struct z_erofs_vle_work work;
+		struct z_erofs_vle_work_uncached uncached;
+		struct z_erofs_vle_cached_header cached;
+	} u;
+
+	unsigned int llen, flags;
+};
+
+#define z_erofs_vle_workgroup_fmt(grp)	\
+	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+
+#define z_erofs_vle_set_work_format(grp, fmt) \
+	((grp)->flags = ((grp)->flags & ~Z_EROFS_WORK_FORMAT_MASK) | (fmt))
+
+#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
+#define z_erofs_vle_work_uncached_mux(wrk)      \
+	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
+#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
+#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
+#define z_erofs_vle_work_workgroup(wrk) \
+	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	erofs_wtptr_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (a.k.a. ongoing_packs): the number of references that must
+ * be dropped before the page can be unlocked
+ * sub-index: 0 for a partial page, >= 1 for a full-page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
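
A minimal userspace-style sketch of the "online page" encoding above,
assuming the layout given by the Z_EROFS_ONLINEPAGE_* macros (sub-index
in the high bits, a 2-bit pending count in the low bits); the helper
names here are illustrative only:

#include <assert.h>
#include <stdio.h>

/* mirrors Z_EROFS_ONLINEPAGE_COUNT_BITS/_MASK above (illustrative) */
#define ONLINEPAGE_COUNT_BITS	2
#define ONLINEPAGE_COUNT_MASK	((1UL << ONLINEPAGE_COUNT_BITS) - 1)

/* pack a sub-index and a pending completion count into one word,
 * as z_erofs_onlinepage_init/fixup do with page_private(page) */
static unsigned long onlinepage_pack(unsigned long index,
				     unsigned long pending)
{
	assert(pending <= ONLINEPAGE_COUNT_MASK);
	return (index << ONLINEPAGE_COUNT_BITS) | pending;
}

int main(void)
{
	/* e.g. sub-index 5 of a work, one pending completion */
	unsigned long v = onlinepage_pack(5, 1);

	printf("index=%lu pending=%lu\n",
	       v >> ONLINEPAGE_COUNT_BITS, v & ONLINEPAGE_COUNT_MASK);
	return 0;
}
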
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ab37072..a46a8b7 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,67 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
 
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+	}
+	return count;
+}
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+		first_index = grp->index + 1;
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
 }
 
 #endif
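
A rough model of the workgroup refcount convention used by
erofs_workgroup_put() and the shrink path above; this is an
illustrative sketch rather than the kernel code, and the names
below are made up for the example:

#include <stdio.h>

static long shrink_cnt;	/* stands in for erofs_global_shrink_cnt */

struct workgroup { int refcount; };

/* refcount > 1: still referenced by readers, not reclaimable;
 * refcount == 1: only the workstation holds it, counted as shrinkable;
 * refcount == 0: last reference gone, the object is freed (via RCU
 *                in the real code). */
static void workgroup_put(struct workgroup *grp)
{
	int count = --grp->refcount;

	if (count == 1)
		shrink_cnt++;		/* became shrinkable */
	else if (count == 0) {
		shrink_cnt--;		/* reclaimed, no longer counted */
		printf("freeing workgroup\n");
	}
}

int main(void)
{
	struct workgroup grp = { .refcount = 2 };

	workgroup_put(&grp);	/* a reader drops its reference */
	workgroup_put(&grp);	/* the shrinker drops the last one */
	printf("shrinkable workgroups: %ld\n", shrink_cnt);
	return 0;
}
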
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [PATCH 1/2] temp commit 1
  2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 10/10] erofs: introduce VLE decompression support Gao Xiang
@ 2018-07-13 13:17     ` Gao Xiang
  2018-07-13 13:17       ` [PATCH 2/2] temp commit 2 Gao Xiang
  0 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-13 13:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  38 ++++++
 fs/erofs/Makefile    |   2 +-
 fs/erofs/unzip_vle.c | 374 +++++++++++++++++++++++++++++----------------------
 fs/erofs/unzip_vle.h | 103 +++++++-------
 4 files changed, 308 insertions(+), 209 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 00e811c..583a7b3 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -99,3 +99,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT
 	  than 2. Otherwise, the image cannot be mounted
 	  correctly on this kernel.
 
+choice
+	prompt "EROFS VLE Data Decompression mode"
+	depends on EROFS_FS_ZIP
+	help
+	  EROFS supports three options for VLE decompression.
+	  "In-place Decompression Only" consumes the minimum memory
+	  but has the lowest random read performance.
+
+	  "Bidirectional Cached Decompression" consumes the maximum memory
+	  and delivers the highest random read performance.
+
+	  If unsure, select "Bidirectional Cached Decompression".
+
+config EROFS_FS_ZIP_0
+	bool "In-place Decompression Only"
+	help
+	  Read compressed data into page cache and do in-place
+	  decompression directly.
+
+config EROFS_FS_ZIP_UNIDIRECTIONAL
+	bool "Unidirectional Cached Decompression"
+	help
+	  For each request, it caches the last compressed page
+	  for further reading.
+	  The remaining compressed pages are still decompressed in place.
+
+config EROFS_FS_ZIP_BIDIRECTIONAL
+	bool "Bidirectional Cached Decompression"
+	default y
+	help
+	  For each request, it caches the compressed pages at both ends
+	  for further reading.
+	  The remaining compressed pages are still decompressed in place.
+
+	  Recommended when performance is the priority.
+
+endchoice
+
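
As a sketch of how this choice is expected to be consumed, the mapping
below mirrors the EROFS_FS_ZIP_CACHE_LVL definition added to
fs/erofs/unzip_vle.h later in this patch; the per-mode comments are an
interpretation of the help text above and the small test program is
illustrative only:

#include <stdio.h>

#ifdef CONFIG_EROFS_FS_ZIP_BIDIRECTIONAL
#define EROFS_FS_ZIP_CACHE_LVL	(2)	/* cache pages at both ends of a request */
#elif defined(CONFIG_EROFS_FS_ZIP_UNIDIRECTIONAL)
#define EROFS_FS_ZIP_CACHE_LVL	(1)	/* cache only the last compressed page */
#else
#define EROFS_FS_ZIP_CACHE_LVL	(0)	/* in-place decompression only */
#endif

int main(void)
{
	printf("compressed page cache level: %d\n", EROFS_FS_ZIP_CACHE_LVL);
	return 0;
}
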
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index fa9d179..0c34265 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,6 +1,6 @@
 EROFS_VERSION = "1.0"
 
-EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+EXTRA_CFLAGS += -g -O1 -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 04c50cd..7ca1d5d 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -56,8 +56,16 @@ int z_erofs_init_zip_subsystem(void)
 	return -ENOMEM;
 }
 
+enum z_erofs_vle_workrole {
+	Z_EROFS_VLE_WORK_SECONDARY,
+	Z_EROFS_VLE_WORK_PRIMARY,
+	Z_EROFS_VLE_WORK_PRIMARY_OWNER,
+	Z_EROFS_VLE_WORK_MAX
+};
+
 struct z_erofs_vle_work_handler {
-	bool owner;
+	enum z_erofs_vle_workrole role;
+
 	struct z_erofs_vle_work *curr;
 	struct z_erofs_pagevec_ctor vector;
 
@@ -66,6 +74,9 @@ struct z_erofs_vle_work_handler {
 	unsigned compressed_deficit;
 };
 
+#define VLE_WORK_HANDLER_INIT()	\
+	{ .curr = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_OWNER }
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
 	struct z_erofs_vle_work_handler *w,
@@ -91,7 +102,8 @@ static int z_erofs_vle_work_add_page(
 	bool occupied;
 
 	/* give priority for the compressed data storage */
-	if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+	if (w->role >= Z_EROFS_VLE_WORK_PRIMARY &&
+		type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
 		try_to_reuse_as_compressed_page(w, page))
 		return 0;
 
@@ -102,16 +114,45 @@ static int z_erofs_vle_work_add_page(
 	return ret ? 0 : -EAGAIN;
 }
 
-static inline
-struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
+static inline bool try_to_claim_workgroup(
+	struct z_erofs_vle_workgroup *grp,
+	z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	/* let's claim the following types of workgroups */
+retry:
+	if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) {
+		/* type 1, nil workgroup */
+		if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_NIL, *owned_head))
+			goto retry;
+
+		*owned_head = grp;
+	} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
+		/* type 2, link to the end of an existing chain */
+		if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_TAIL, *owned_head))
+			goto retry;
+
+		*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+struct z_erofs_vle_work *
+z_erofs_vle_work_lookup(struct super_block *sb,
 	pgoff_t idx, unsigned pageofs,
-	bool *cached_ret,
-	struct z_erofs_vle_workgroup **grp_ret)
+	struct z_erofs_vle_workgroup **grp_ret,
+	enum z_erofs_vle_workrole *role,
+	z_erofs_vle_owned_workgrp_t *owned_head)
 {
-	bool cached;
-	struct erofs_workgroup *egrp = erofs_find_workgroup(sb, idx, &cached);
+	bool tag, primary;
+	struct erofs_workgroup *egrp;
 	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
 
+	egrp = erofs_find_workgroup(sb, idx, &tag);
 	if (egrp == NULL) {
 		*grp_ret = NULL;
 		return NULL;
@@ -119,21 +160,73 @@ struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
 
 	*grp_ret = grp = container_of(egrp,
 		struct z_erofs_vle_workgroup, obj);
-	*cached_ret = cached;
 
-	return cached ? z_erofs_vle_work_cached(grp, pageofs) :
-		z_erofs_vle_work_uncached(grp, pageofs);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_work(grp, pageofs);
+	primary = true;
+#else
+	BUG();
+#endif
+
+	/*
+	 * lock must be taken first to avoid grp->next == NIL between
+	 * claiming workgroup and adding pages:
+	 *                        grp->next != NIL
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                        mutex_lock(&work->lock)
+	 *                        add all pages to pagevec
+	 *
+	 * [correct locking case 1]:
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[b])     mutex_lock(grp->work[c])
+	 *   ...                          *role = SECONDARY
+	 *                                add all pages to pagevec
+	 *                                ...
+	 *                                mutex_unlock(grp->work[c])
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *
+	 * [correct locking case 2]:
+	 *   mutex_lock(grp->work[b])
+	 *   ...
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                                mutex_lock(grp->work[a])
+	 *                                *role = PRIMARY_OWNER
+	 *                                add all pages to pagevec
+	 *                                ...
+	 */
+	mutex_lock(&work->lock);
+
+	if (!primary)
+		*role = Z_EROFS_VLE_WORK_SECONDARY;
+	/* claim the workgroup if possible */
+	else if (try_to_claim_workgroup(grp, owned_head))
+		*role = Z_EROFS_VLE_WORK_PRIMARY_OWNER;
+	else
+		*role = Z_EROFS_VLE_WORK_PRIMARY;
+
+	return work;
 }
 
-static inline struct z_erofs_vle_work *
+struct z_erofs_vle_work *
 z_erofs_vle_work_register(struct super_block *sb,
-			  struct z_erofs_vle_workgroup *grp,
-			  bool cached,
+			  struct z_erofs_vle_workgroup **grp_ret,
 			  struct erofs_map_blocks *map,
 			  pgoff_t index, unsigned pageofs,
-			  erofs_wtptr_t *owned_head)
+			  enum z_erofs_vle_workrole *role,
+			  z_erofs_vle_owned_workgrp_t *owned_head)
 {
 	bool newgrp = false;
+	struct z_erofs_vle_workgroup *grp = *grp_ret;
 	struct z_erofs_vle_work *work;
 
 #ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
@@ -150,29 +243,31 @@ struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
 	grp->obj.index = index;
 	grp->llen = map->m_llen;
 
-	z_erofs_vle_set_work_format(grp,
+	z_erofs_vle_set_workgrp_fmt(grp,
 		(map->m_flags & EROFS_MAP_ZIPPED) ?
-			Z_EROFS_WORK_FORMAT_LZ4 :
-			Z_EROFS_WORK_FORMAT_PLAIN);
+			Z_EROFS_VLE_WORKGRP_FMT_LZ4 :
+			Z_EROFS_VLE_WORKGRP_FMT_PLAIN);
 	atomic_set(&grp->obj.refcount, 1);
 
+	/* new workgrps have been claimed as type 1 */
+	WRITE_ONCE(grp->next, *owned_head);
+	/* primary & owner work role for new workgrps */
+	*role = Z_EROFS_VLE_WORK_PRIMARY_OWNER;
+
 	newgrp = true;
 #ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
 skip:
-	/* currently not implemented */
+	/* currently unimplemented */
 	BUG();
 #else
-	work = cached ? z_erofs_vle_work_cached(grp, pageofs) :
-		z_erofs_vle_work_uncached(grp, pageofs);
+	work = z_erofs_vle_grab_primary_work(grp);
 #endif
 	work->pageofs = pageofs;
 
 	mutex_init(&work->lock);
-	/* new works have been claimed as type 1 */
-	WRITE_ONCE(work->next, *owned_head);
 
 	if (newgrp) {
-		int err = erofs_register_workgroup(sb, &grp->obj, cached);
+		int err = erofs_register_workgroup(sb, &grp->obj, 0);
 
 		if (err) {
 			kmem_cache_free(z_erofs_workgroup_cachep, grp);
@@ -180,61 +275,45 @@ struct z_erofs_vle_work *z_erofs_vle_work_find(struct super_block *sb,
 		}
 	}
 
-	*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
-	return work;
-}
-
-static inline bool try_to_claim_work(struct z_erofs_vle_work *work,
-     erofs_wtptr_t *owned_head, bool cached)
-{
-	/* let's claim these following types of work */
-retry:
-	if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_TAIL)) {
-		/* type 2, link to a existing chain */
-		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
-			Z_EROFS_WORK_TPTR_TAIL, *owned_head),
-			Z_EROFS_WORK_TPTR_TAIL))
-			goto retry;
-
-		*owned_head = Z_EROFS_WORK_TPTR_TAIL;
-	} else if (tagptr_eq(work->next, Z_EROFS_WORK_TPTR_NIL)) {
-		/* type 1 */
-		if (!tagptr_eq(tagptr_cmpxchg(&work->next,
-			Z_EROFS_WORK_TPTR_NIL, *owned_head),
-			Z_EROFS_WORK_TPTR_NIL))
-			goto retry;
-
-		*owned_head = tagptr_fold(erofs_wtptr_t, work, cached);
-	} else
-		return false;	/* :( better luck next time */
+	*owned_head = *grp_ret = grp;
 
-	return true;	/* lucky, I am the owner :) */
+	mutex_lock(&work->lock);
+	return work;
 }
 
 static inline void __reset_compressed_pages(
 	struct z_erofs_vle_work_handler *w,
-	struct z_erofs_vle_work *work, bool cached,
+	struct z_erofs_vle_workgroup *grp, bool page_reuse,
 	unsigned clusterpages)
 {
-	if (!cached) {
-		w->compressed_pages =
-			z_erofs_vle_work_uncached_mux(work);
+	if (page_reuse) {
+		w->compressed_pages = grp->compressed_pages;
 		w->compressed_deficit = clusterpages;
 		return;
 	}
 
-	/* TODO! get cached pages before submitting io */
 	w->compressed_pages = NULL;
 	w->compressed_deficit = 0;
 }
 
+static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
+					 unsigned int llen)
+{
+	while(1) {
+		unsigned int orig_llen = grp->llen;
+
+		if (orig_llen >= llen || cmpxchg(&grp->llen,
+			orig_llen, llen) == orig_llen)
+			break;
+	}
+}
+
 static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
 				       struct super_block *sb,
 				       struct erofs_map_blocks *map,
-				       erofs_wtptr_t *owned_head)
+				       z_erofs_vle_owned_workgrp_t *owned_head)
 {
 	struct z_erofs_vle_workgroup *grp;
-	bool cached;
 	pgoff_t index = map->m_pa / EROFS_BLKSIZ;
 	struct z_erofs_vle_work *work;
 	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
@@ -243,48 +322,39 @@ static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
 	BUG_ON(w->curr != NULL);
 
 	/* must be Z_EROFS_WORK_TAIL or the next chained work */
-	BUG_ON(tagptr_cast_ptr(*owned_head) == NULL);
+	BUG_ON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+	BUG_ON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
 	BUG_ON(map->m_pa % EROFS_BLKSIZ);
 
 repeat:
-	work = z_erofs_vle_work_find(sb, index,
-		pageofs, &cached, &grp);
+	work = z_erofs_vle_work_lookup(sb, index,
+		pageofs, &grp, &w->role, owned_head);
 	if (work != NULL) {
 		BUG_ON(index != grp->obj.index);
-
-		__reset_compressed_pages(w, work, cached, clusterpages);
 		BUG_ON(work->pageofs != pageofs);
 
-		mutex_lock(&work->lock);
-
-		if (grp->llen < map->m_llen)
-			grp->llen = map->m_llen;
-
-		w->owner = false;
-		/* claim the work if it can */
-		if (try_to_claim_work(work, owned_head, cached))
-			w->owner = true;
-
+		__update_workgrp_llen(grp, map->m_llen);
 		goto got_it;
 	}
 
-	work = z_erofs_vle_work_register(sb, grp,
-		false, map, index, pageofs, owned_head);
+	work = z_erofs_vle_work_register(sb, &grp,
+		map, index, pageofs, &w->role, owned_head);
 
 	if (unlikely(work == ERR_PTR(-EAGAIN)))
 		goto repeat;
 
 	if (unlikely(IS_ERR(work)))
 		return PTR_ERR(work);
-
-	__reset_compressed_pages(w, work, cached, clusterpages);
-	w->owner = true;
-
-	mutex_lock(&work->lock);
-
 got_it:
 	z_erofs_pagevec_ctor_init(&w->vector,
 		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+
+	if (w->role >= Z_EROFS_VLE_WORK_PRIMARY)
+		__reset_compressed_pages(w, grp, true, clusterpages);
+	else
+		__reset_compressed_pages(w, grp, false, 0);
+
 	w->curr = work;
 	return 0;
 }
@@ -293,7 +363,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
 {
 	struct z_erofs_vle_work *work =	container_of(head,
 		struct z_erofs_vle_work, rcu);
-	struct z_erofs_vle_workgroup *grp = z_erofs_vle_work_workgroup(work);
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
 
 	kmem_cache_free(z_erofs_workgroup_cachep, grp);
 }
@@ -302,7 +373,7 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
 {
 	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
 		struct z_erofs_vle_workgroup, obj);
-	struct z_erofs_vle_work *const work = &vgrp->u.work;
+	struct z_erofs_vle_work *const work = &vgrp->work;
 
 	call_rcu(&work->rcu, z_erofs_rcu_callback);
 }
@@ -310,11 +381,13 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
 void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 {
 	struct z_erofs_vle_workgroup *grp =
-		z_erofs_vle_work_workgroup(work);
+		z_erofs_vle_work_workgroup(work, true);
 
 	erofs_workgroup_put(&grp->obj);
 }
 
+#define handler_is_owner(w) ((w)->role >= Z_EROFS_VLE_WORK_PRIMARY_OWNER)
+
 static inline void
 z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
 {
@@ -327,7 +400,7 @@ void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 	 * if all pending pages are added, don't hold work reference
 	 * any longer if the current handler is not the owner.
 	 */
-	if (!w->owner)
+	if (!handler_is_owner(w))
 		z_erofs_vle_work_release(work);
 
 	z_erofs_pagevec_ctor_exit(&w->vector, false);
@@ -338,13 +411,13 @@ void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 static int z_erofs_do_read_page(struct page *page,
 				struct z_erofs_vle_work_handler *h,
 				struct erofs_map_blocks_iter *m,
-				erofs_wtptr_t *owned_head,
+				z_erofs_vle_owned_workgrp_t *owned_head,
 				struct list_head *page_pool)
 {
 	struct inode *const inode = page->mapping->host;
 	struct super_block *const sb = inode->i_sb;
 	const loff_t offset = page_offset(page);
-	bool owned = h->owner;
+	bool owned = handler_is_owner(h);
 	struct z_erofs_vle_work *work = h->curr;
 	enum z_erofs_page_type page_type;
 	unsigned cur, end, spiltted, index;
@@ -385,7 +458,7 @@ static int z_erofs_do_read_page(struct page *page,
 	if (unlikely(err))
 		goto err_out;
 
-	owned &= h->owner;
+	owned &= handler_is_owner(h);
 	work = h->curr;
 hitted:
 	cur = end - min_t(unsigned, offset + end - m->map.m_la, end);
@@ -498,8 +571,8 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 static DEFINE_MUTEX(z_pagemap_global_lock);
 
 static int z_erofs_vle_unzip(struct super_block *sb,
-	struct z_erofs_vle_work *work,
-	bool cached, struct list_head *page_pool)
+	struct z_erofs_vle_workgroup *grp,
+	struct list_head *page_pool)
 {
 	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
 	struct z_erofs_pagevec_ctor ctor;
@@ -513,12 +586,17 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
 	enum z_erofs_page_type page_type;
 	bool overlapped;
-	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
 	void *vout;
 	int err;
 
-	BUG_ON(!READ_ONCE(work->nr_pages));
 	might_sleep();
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_primary_work(grp);
+#else
+	BUG();
+#endif
+	BUG_ON(!READ_ONCE(work->nr_pages));
 
 	mutex_lock(&work->lock);
 	nr_pages = work->nr_pages;
@@ -578,38 +656,32 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	z_erofs_pagevec_ctor_exit(&ctor, true);
 
 	overlapped = false;
-	if (cached) {
-		grp = z_erofs_vle_work_workgroup(work);
-		compressed_pages = z_erofs_vle_cached_managed(grp);
-	} else {
-		grp = z_erofs_vle_work_workgroup(work);
-		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+	compressed_pages = grp->compressed_pages;
 
-		for(i = 0; i < clusterpages; ++i) {
-			unsigned pagenr;
+	for(i = 0; i < clusterpages; ++i) {
+		unsigned pagenr;
 
-			BUG_ON(compressed_pages[i] == NULL);
-			page = compressed_pages[i];
+		BUG_ON(compressed_pages[i] == NULL);
+		page = compressed_pages[i];
 
-			if (page->mapping == NULL)
-				continue;
+		if (page->mapping == NULL)
+			continue;
 
-			pagenr = z_erofs_onlinepage_index(page);
+		pagenr = z_erofs_onlinepage_index(page);
 
-			BUG_ON(pagenr >= nr_pages);
+		BUG_ON(pagenr >= nr_pages);
 #ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
-			BUG_ON(pages[pagenr] != NULL);
-			++sparsemem_pages;
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
 #endif
-			pages[pagenr] = page;
+		pages[pagenr] = page;
 
-			overlapped = true;
-		}
+		overlapped = true;
 	}
 
 	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
 
-	if (z_erofs_vle_workgroup_fmt(grp) == Z_EROFS_WORK_FORMAT_PLAIN) {
+	if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) {
 		BUG_ON(grp->llen != llen);
 
 		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
@@ -675,8 +747,7 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 		if (page->mapping == NULL)
 			list_add(&page->lru, page_pool);
 
-		if (!cached)
-			WRITE_ONCE(compressed_pages[i], NULL);
+		WRITE_ONCE(compressed_pages[i], NULL);
 	}
 
 	if (pages == z_pagemap_global)
@@ -687,8 +758,14 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	work->nr_pages = 0;
 	work->vcnt = 0;
 
-	WRITE_ONCE(work->next, Z_EROFS_WORK_TPTR_NIL);
+	/* all work locks MUST be taken before */
+
+	WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL);
+
+	/* all work locks SHOULD be released right now */
 	mutex_unlock(&work->lock);
+
+	z_erofs_vle_work_release(work);
 	return err;
 }
 
@@ -696,26 +773,23 @@ static void z_erofs_vle_unzip_all(struct super_block *sb,
 				  struct z_erofs_vle_unzip_io *io,
 				  struct list_head *page_pool)
 {
-	erofs_wtptr_t owned = io->head;
-	struct z_erofs_vle_work *work;
-	bool cached;
+	z_erofs_vle_owned_workgrp_t owned = io->head;
 
-	BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+	BUG_ON(owned == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
 	do {
+		struct z_erofs_vle_workgroup *grp;
+
 		/* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
-		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL));
+		BUG_ON(owned == Z_EROFS_VLE_WORKGRP_TAIL);
 
 		/* no possible that 'owned' equals NULL */
-		BUG_ON(tagptr_eq(owned, Z_EROFS_WORK_TPTR_NIL));
-
-		work = tagptr_unfold_ptr(owned);
-		cached = tagptr_unfold_tags(owned);
+		BUG_ON(owned == Z_EROFS_VLE_WORKGRP_NIL);
 
-		owned = READ_ONCE(work->next);
-		z_erofs_vle_unzip(sb, work, cached, page_pool);
+		grp = owned;
+		owned = READ_ONCE(grp->next);
 
-		z_erofs_vle_work_release(work);
-	} while (!tagptr_eq(owned, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
+		z_erofs_vle_unzip(sb, grp, page_pool);
+	} while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
 }
 
 static void z_erofs_vle_unzip_wq(struct work_struct *work)
@@ -762,7 +836,7 @@ static inline tagptr1_t prepare_io_handler(
 }
 
 static bool z_erofs_vle_submit_all(struct super_block *sb,
-				   erofs_wtptr_t owned_head,
+				   z_erofs_vle_owned_workgrp_t owned_head,
 				   struct list_head *page_pool,
 				   struct z_erofs_vle_unzip_io *io)
 {
@@ -773,7 +847,7 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	unsigned bios_submitted;
 	tagptr1_t tio;
 
-	if (unlikely(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL)))
+	if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL))
 		return false;
 
 	tio = prepare_io_handler(sb, io, &sync);
@@ -784,36 +858,23 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	bios_submitted = 0;
 
 	do {
-		struct z_erofs_vle_work *work;
 		struct z_erofs_vle_workgroup *grp;
-		bool cached, locked;
 		struct page **compressed_pages;
 		pgoff_t current_page;
 		unsigned i;
 		int err;
 
 		/* no possible 'owned_head' equals the following */
-		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL_CLOSED));
-		BUG_ON(tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_NIL));
+		BUG_ON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+		BUG_ON(owned_head == Z_EROFS_VLE_WORKGRP_NIL);
 
-		work = tagptr_unfold_ptr(owned_head);
-		cached = tagptr_unfold_tags(owned_head);
+		grp = owned_head;
 
 		/* close the owned chain at first */
-		owned_head = tagptr_cmpxchg(&work->next,
-			Z_EROFS_WORK_TPTR_TAIL, Z_EROFS_WORK_TPTR_TAIL_CLOSED);
-
-		grp = z_erofs_vle_work_workgroup(work);
-
-		BUG_ON(cached);
+		owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
+			Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
 
-		locked = false;
-		if (unlikely(mutex_is_locked(&work->lock))) {
-			mutex_lock(&work->lock);
-			locked = true;
-		}
-
-		compressed_pages = z_erofs_vle_work_uncached_mux(work);
+		compressed_pages = grp->compressed_pages;
 		/* fulfill all compressed pages */
 		for (i = 0; i < clusterpages; ++i) {
 			struct page *page;
@@ -828,9 +889,6 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 				list_add(&page->lru, page_pool);
 		}
 
-		if (unlikely(locked))
-			mutex_unlock(&work->lock);
-
 		current_page = grp->obj.index;
 		i = 0;
 
@@ -857,7 +915,7 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 
 		if (++i < clusterpages)
 			goto repeat;
-	} while (!tagptr_eq(owned_head, Z_EROFS_WORK_TPTR_TAIL));
+	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
 
 	if (bio != NULL)
 		__submit_bio(bio, REQ_OP_READ, 0);
@@ -873,8 +931,8 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
 		.map = { .m_llen = 0, .m_plen = 0 },
 		.mpage = NULL
 	};
-	struct z_erofs_vle_work_handler h = { .curr = NULL, .owner = true };
-	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
+	struct z_erofs_vle_work_handler h = VLE_WORK_HANDLER_INIT();
+	z_erofs_vle_owned_workgrp_t owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
 	struct super_block *sb;
 	struct z_erofs_vle_unzip_io io;
 	LIST_HEAD(pagepool);
@@ -917,12 +975,12 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 		.map = { .m_llen = 0, .m_plen = 0 },
 		.mpage = NULL
 	};
-	struct z_erofs_vle_work_handler h = { .curr = NULL, .owner = true };
+	struct z_erofs_vle_work_handler h = VLE_WORK_HANDLER_INIT();
+	z_erofs_vle_owned_workgrp_t owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
 	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
 	struct page *head = NULL;
 	struct inode *inode = mapping->host;
 	struct super_block *sb = inode->i_sb;
-	erofs_wtptr_t owned_head = Z_EROFS_WORK_TPTR_TAIL;
 	LIST_HEAD(pagepool);
 
 	for (; nr_pages; --nr_pages) {
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 7542aa8..2a446f9 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -16,22 +16,13 @@
 #include "internal.h"
 #include "unzip_pagevec.h"
 
-/* (uncached/cached) work tagged pointer */
-typedef tagptr1_t       erofs_wtptr_t;
-
-/* let's avoid the 32-bit valid kernel address */
-
-/* the chained works haven't io submitted (still open) */
-#define Z_EROFS_WORK_TAIL               0x5F0ECAFE
-/* the chained works have already io submitted */
-#define Z_EROFS_WORK_TAIL_CLOSED        0x5F0EDEAD
-
-
-#define Z_EROFS_WORK_TPTR_TAIL  tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL)
-#define Z_EROFS_WORK_TPTR_TAIL_CLOSED \
-	tagptr_init(erofs_wtptr_t, Z_EROFS_WORK_TAIL_CLOSED)
-
-#define Z_EROFS_WORK_TPTR_NIL   tagptr_init(erofs_wtptr_t, NULL)
+#ifdef CONFIG_EROFS_FS_ZIP_BIDIRECTIONAL
+#define EROFS_FS_ZIP_CACHE_LVL	(2)
+#elif defined(CONFIG_EROFS_FS_ZIP_UNIDIRECTIONAL)
+#define EROFS_FS_ZIP_CACHE_LVL	(1)
+#else
+#define EROFS_FS_ZIP_CACHE_LVL	(0)
+#endif
 
 /*
  * Structure fields follow one of the following exclusion rules.
@@ -45,11 +36,13 @@
 
 struct z_erofs_vle_work {
 	/* struct z_erofs_vle_work *left, *right; */
-	struct mutex lock;
 
 #ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;
+
 	atomic_t refcount;
 #endif
+	struct mutex lock;
 
 	/* I: decompression offset in page */
 	unsigned short pageofs;
@@ -57,8 +50,6 @@ struct z_erofs_vle_work {
 
 	/* L: queued pages in pagevec[] */
 	unsigned vcnt;
-	/* L: the next owned work */
-	erofs_wtptr_t next;
 
 	union {
 		/* L: pagevec */
@@ -67,54 +58,66 @@ struct z_erofs_vle_work {
 	};
 };
 
-#define Z_EROFS_WORK_FORMAT_PLAIN       0
-#define Z_EROFS_WORK_FORMAT_LZ4         1
-#define Z_EROFS_WORK_FORMAT_MASK        1
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
 
-struct z_erofs_vle_work_uncached {
-	struct z_erofs_vle_work work;
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
 
-	/* multi-usage (both used for decompressed / compressed pages) */
-	struct page *mux[Z_EROFS_CLUSTER_MAX_PAGES];
-};
-
-struct z_erofs_vle_cached_header {
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
 	struct z_erofs_vle_work work;
 
-	struct page *managed[Z_EROFS_CLUSTER_MAX_PAGES];
-};
+#if (EROFS_FS_ZIP_CACHE_LVL > 0)
+	/* used for cached compressed pages reclaim serialization */
+	rwlock_t reclaim_lock;
+#endif
 
-struct z_erofs_vle_workgroup {
-	struct erofs_workgroup obj;
-	union {
-		struct z_erofs_vle_work work;
-		struct z_erofs_vle_work_uncached uncached;
-		struct z_erofs_vle_cached_header cached;
-	} u;
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
 
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
 	unsigned int llen, flags;
 };
 
-#define z_erofs_vle_workgroup_fmt(grp)	\
-	((grp)->flags & Z_EROFS_WORK_FORMAT_MASK)
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted its I/O yet (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted its I/O */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
 
-#define z_erofs_vle_set_work_format(grp, fmt) \
-	((grp)->flags = ((grp)->flags & ~Z_EROFS_WORK_FORMAT_MASK) | (fmt))
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
 
-#define z_erofs_vle_work_uncached(grp, pageofs) (&(grp)->u.uncached.work)
-#define z_erofs_vle_work_uncached_mux(wrk)      \
-	(container_of(wrk, struct z_erofs_vle_work_uncached, work)->mux)
-#define z_erofs_vle_work_cached(grp, pageofs)   (&(grp)->u.cached.work)
-#define z_erofs_vle_cached_managed(grp)         ((grp)->u.cached.managed)
-#define z_erofs_vle_work_workgroup(wrk) \
-	container_of(wrk, struct z_erofs_vle_workgroup, u.work)
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
 
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK);
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is not implemented yet
+#else
+
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+
+#endif
 
 #define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
 
 struct z_erofs_vle_unzip_io {
 	atomic_t pending_bios;
-	erofs_wtptr_t head;
+	z_erofs_vle_owned_workgrp_t head;
 
 	union {
 		wait_queue_head_t wait;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [PATCH 2/2] temp commit 2
  2018-07-13 13:17     ` [PATCH 1/2] temp commit 1 Gao Xiang
@ 2018-07-13 13:17       ` Gao Xiang
  0 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-13 13:17 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h  |  61 ++++++++++++++++
 fs/erofs/super.c     |  79 ++++++++++++++++++++-
 fs/erofs/unzip_vle.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++----
 fs/erofs/unzip_vle.h |   8 ---
 4 files changed, 323 insertions(+), 22 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 8d7b6ab..edba0da 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,22 @@ struct erofs_fault_info {
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_ZIP_BIDIRECTIONAL
+#define EROFS_FS_ZIP_CACHE_LVL	(2)
+#elif defined(CONFIG_EROFS_FS_ZIP_UNIDIRECTIONAL)
+#define EROFS_FS_ZIP_CACHE_LVL	(1)
+#else
+#define EROFS_FS_ZIP_CACHE_LVL	(0)
+#endif
+
+#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0))
+#define EROFS_FS_HAS_MANAGED_CACHE
+
+#define EROFS_UNALLOCATED_CACHED_PAGE	((void *)0x5F0EF00D)
+
+extern int try_to_free_cached_page(struct address_space *, struct page *);
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -88,6 +104,11 @@ struct erofs_sb_info {
 		spinlock_t lock;
 #endif
 	} workstn;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct inode *managed_cache;
+#endif
+
 #endif
 
 	u32 build_time_nsec;
@@ -176,13 +197,53 @@ struct erofs_workgroup {
 	atomic_t refcount;
 };
 
+#define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
+
+static inline bool erofs_workgroup_try_to_freeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	if (v != atomic_cmpxchg(&grp->refcount,
+		v, EROFS_LOCKED_MAGIC))
+		return false;
+	preempt_disable();
+#else
+	preempt_disable();
+	if (atomic_read(&grp->refcount) != v) {
+		preempt_enable();
+		return false;
+	}
+#endif
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	atomic_set(&grp->refcount, v);
+#endif
+	preempt_enable();
+}
+
 static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
 {
+	const int locked = (int)EROFS_LOCKED_MAGIC;
 	int o;
 
 repeat:
 	o = atomic_read(&grp->refcount);
 
+	/* spin if it is temporarily locked at the reclaim path */
+	if (unlikely(o == locked)) {
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+		do
+			cpu_relax();
+		while (atomic_read(&grp->refcount) == locked);
+#endif
+		goto repeat;
+	}
+
 	if (unlikely(o <= 0))
 		return -1;
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 546a308..fc4d750 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -245,6 +245,67 @@ static int parse_options(struct super_block *sb, char *options)
 	return 0;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static const struct address_space_operations managed_cache_aops;
+
+static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	int ret = 1;	/* 0 - busy */
+	struct address_space *const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping->a_ops != &managed_cache_aops);
+
+	if (PagePrivate(page))
+		ret = try_to_free_cached_page(mapping, page);
+
+	return ret;
+}
+
+static void managed_cache_invalidatepage(struct page *page,
+	unsigned int offset, unsigned int length)
+{
+	const unsigned int stop = length + offset;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Check for overflow */
+	BUG_ON(stop > PAGE_SIZE || stop < length);
+
+	if (offset == 0 && stop == PAGE_SIZE)
+		while(!managed_cache_releasepage(page, GFP_NOFS))
+			cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+	.releasepage = managed_cache_releasepage,
+	.invalidatepage = managed_cache_invalidatepage,
+};
+
+struct inode *erofs_init_managed_cache(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (unlikely(inode == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &managed_cache_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+	                     GFP_NOFS | __GFP_HIGHMEM |
+	                     __GFP_MOVABLE |  __GFP_NOFAIL
+#if defined(CONFIG_CMA) && defined(___GFP_CMA)
+	                     | ___GFP_CMA
+#endif
+	                    );
+	return inode;
+}
+
+#endif
+
 static int erofs_read_super(struct super_block *sb,
 	const char *dev_name, void *data, int silent)
 {
@@ -299,11 +360,19 @@ static int erofs_read_super(struct super_block *sb,
 #endif
 #endif
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	sbi->managed_cache = erofs_init_managed_cache(sb);
+	if (IS_ERR(sbi->managed_cache)) {
+		err = PTR_ERR(sbi->managed_cache);
+		goto err_sbi;
+	}
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		goto err_sbi;
+		goto iget_err;
 	}
 
 	if (!S_ISDIR(inode->i_mode)) {
@@ -346,6 +415,10 @@ static int erofs_read_super(struct super_block *sb,
 err_iput:
 	if (sb->s_root == NULL)
 		iput(inode);
+iget_err:
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
 err_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -368,6 +441,10 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
+
 	mutex_lock(&sbi->umount_mutex);
 
 #ifdef CONFIG_EROFS_FS_ZIP
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 7ca1d5d..0920851 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -77,6 +77,107 @@ struct z_erofs_vle_work_handler {
 #define VLE_WORK_HANDLER_INIT()	\
 	{ .curr = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_OWNER }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+int try_to_free_cached_page(struct address_space *mapping, struct page *page)
+{
+	struct inode *inode = mapping->host;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(inode->i_sb));
+	struct z_erofs_vle_workgroup *grp = (void *)page_private(page);
+	int i, ret = 0;
+
+	/* prevent the workgroup from being freed */
+	rcu_read_lock();
+
+	if (!erofs_workgroup_try_to_freeze(&grp->obj, 1))
+		goto out;
+
+	for(i = 0; i < clusterpages; ++i) {
+		if (grp->compressed_pages[i] == page) {
+			WRITE_ONCE(grp->compressed_pages[i], NULL);
+			ret = 1;
+			break;
+		}
+	}
+	erofs_workgroup_unfreeze(&grp->obj, 1);
+out:
+	rcu_read_unlock();
+
+	if (ret) {
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+	return ret;
+}
+
+static inline void grab_compressed_pages_in_managed_cache(
+	struct super_block *sb,
+	pgoff_t start,
+	struct page **compressed_pages,
+	int clusterpages, bool alloc_reserve)
+{
+	pgoff_t cur;
+#if 0
+	struct page *pages[clusterpages];
+#endif
+	unsigned i, j, found;
+
+	for (i = 0; i < clusterpages; ++i) {
+		if (READ_ONCE(compressed_pages[i]) == NULL) {
+			cur = (start += i);
+			compressed_pages += i;
+			clusterpages -= i;		
+			goto hitted;
+		}
+	}
+
+	return;
+hitted:
+#if 0
+	found = find_get_pages_range(EROFS_SB(sb)->managed_cache->i_mapping,
+		&cur, start + clusterpages, clusterpages, pages);
+#endif
+
+	i = 0;
+	for (j = 0; i < clusterpages && j < found; ++i) {
+		struct page *page;
+
+#if 0
+		cur = pages[j]->index - start;
+		if (cur != i) {
+			if (!alloc_reserve)
+				continue;
+			page = EROFS_UNALLOCATED_CACHED_PAGE;
+		} else
+			page = pages[j++];
+#else
+		page = find_get_page(EROFS_SB(sb)->managed_cache->i_mapping,
+			start + j);
+		if (page == NULL) {
+			if (!alloc_reserve)
+				continue;
+			page = EROFS_UNALLOCATED_CACHED_PAGE;
+		}
+#endif
+
+		if (cmpxchg(compressed_pages + cur, NULL, page) == NULL)
+			continue;
+
+		if (cur == i)
+			put_page(page);
+	}
+
+	if (!alloc_reserve)
+		return;
+
+	while (i < clusterpages) {
+		cmpxchg(compressed_pages + i, NULL,
+			EROFS_UNALLOCATED_CACHED_PAGE);
+		++i;
+	}
+}
+
+#endif
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
 	struct z_erofs_vle_work_handler *w,
@@ -308,6 +409,13 @@ static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
 	}
 }
 
+#define handler_is_owner(w) ((w)->role >= Z_EROFS_VLE_WORK_PRIMARY_OWNER)
+
+struct z_erofs_vle_frontend {
+	bool initial;
+	erofs_off_t cachedzone_la;
+};
+
 static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_handler *w,
 				       struct super_block *sb,
 				       struct erofs_map_blocks *map,
@@ -386,8 +494,6 @@ void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 	erofs_workgroup_put(&grp->obj);
 }
 
-#define handler_is_owner(w) ((w)->role >= Z_EROFS_VLE_WORK_PRIMARY_OWNER)
-
 static inline void
 z_erofs_vle_work_iter_end(struct z_erofs_vle_work_handler *w)
 {
@@ -408,7 +514,8 @@ void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 	w->curr = NULL;
 }
 
-static int z_erofs_do_read_page(struct page *page,
+static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
+				struct page *page,
 				struct z_erofs_vle_work_handler *h,
 				struct erofs_map_blocks_iter *m,
 				z_erofs_vle_owned_workgrp_t *owned_head,
@@ -458,6 +565,14 @@ static int z_erofs_do_read_page(struct page *page,
 	if (unlikely(err))
 		goto err_out;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	grab_compressed_pages_in_managed_cache(sb,
+		m->map.m_pa / EROFS_BLKSIZ, h->compressed_pages,
+		h->compressed_deficit, fe->initial || m->map.m_la <= fe->cachedzone_la);
+
+	fe->initial = false;
+#endif
+
 	owned &= handler_is_owner(h);
 	work = h->curr;
 hitted:
@@ -542,6 +657,7 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 {
 	unsigned i;
 	struct bio_vec *bvec;
+
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
 	const int err = bio->bi_status;
 #elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
@@ -550,16 +666,25 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
+		bool cachedpage = false;
 
 		DBG_BUGON(PageUptodate(page));
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping != NULL) {
+			struct inode *inode = page->mapping->host;
+
+			cachedpage = (inode ==
+				EROFS_SB(inode->i_sb)->managed_cache);
+		}
+#endif
+
 		if (unlikely(err))
 			SetPageError(page);
-
-		/* FIXME: the following snippets are for cached work */
-		else if (0)
+		else if (cachedpage)
 			SetPageUptodate(page);
 
-		if (0)
+		if (cachedpage)
 			unlock_page(page);
 	}
 
@@ -574,7 +699,8 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	struct z_erofs_vle_workgroup *grp,
 	struct list_head *page_pool)
 {
-	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	unsigned clusterpages = erofs_clusterpages(sbi);
 	struct z_erofs_pagevec_ctor ctor;
 	unsigned nr_pages;
 #ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
@@ -666,6 +792,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
 		if (page->mapping == NULL)
 			continue;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping->host == sbi->managed_cache)
+			continue;
+#endif
 
 		pagenr = z_erofs_onlinepage_index(page);
 
@@ -747,6 +877,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 		if (page->mapping == NULL)
 			list_add(&page->lru, page_pool);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		else if (page->mapping->host == sbi->managed_cache)
+			continue;
+#endif
 		WRITE_ONCE(compressed_pages[i], NULL);
 	}
 
@@ -841,7 +975,8 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 				   struct z_erofs_vle_unzip_io *io)
 {
 	struct bio *bio = NULL;
-	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	unsigned clusterpages = erofs_clusterpages(sbi);
 	pgoff_t last_page;
 	bool sync;
 	unsigned bios_submitted;
@@ -878,15 +1013,34 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		/* fulfill all compressed pages */
 		for (i = 0; i < clusterpages; ++i) {
 			struct page *page;
+			struct page *old = READ_ONCE(compressed_pages[i]);
+			bool cached = (old == EROFS_UNALLOCATED_CACHED_PAGE);
 
-			if (READ_ONCE(compressed_pages[i]) != NULL)
+			if (old != NULL && !cached)
 				continue;
 
 			page = erofs_allocpage(page_pool, GFP_KERNEL);
 			page->mapping = NULL;
+			if (cached) {
+				if (add_to_page_cache_lru(page,
+					sbi->managed_cache->i_mapping,
+					grp->obj.index + i, GFP_KERNEL))
+					cached = false;
+				else {
+					if (cmpxchg(compressed_pages + i,
+						old, page) == old) {
+						set_page_private(page, (unsigned long)grp);
+						SetPagePrivate(page);
+					} else {
+						put_page(page);
+					}
+					continue;
+				}
+			}
 
-			if (cmpxchg(compressed_pages + i, NULL, page) != NULL)
+			if (cmpxchg(compressed_pages + i, old, page) != old) {
 				list_add(&page->lru, page_pool);
+			}
 		}
 
 		current_page = grp->obj.index;
@@ -898,6 +1052,19 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 			bio = NULL;
 		}
 repeat:
+		if (compressed_pages[i]->mapping == sbi->managed_cache->i_mapping) {
+			if (PageUptodate(compressed_pages[i])) {
+				if (++i < clusterpages) {
+					if (bio != NULL)
+						goto submit_bio_retry;
+					else
+						goto repeat;
+				}
+				continue;
+			}
+			BUG_ON(!PageLocked(compressed_pages[i]));
+		}
+
 		if (bio == NULL) {
 			bio = prepare_bio(sb, current_page,
 				BIO_MAX_PAGES, z_erofs_vle_read_endio);
@@ -927,6 +1094,8 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 static int z_erofs_vle_normalaccess_readpage(struct file *file,
                                              struct page *page)
 {
+	struct z_erofs_vle_frontend fe = { .initial = true,
+		.cachedzone_la = page->index << PAGE_SHIFT };
 	struct erofs_map_blocks_iter m_iter = {
 		.map = { .m_llen = 0, .m_plen = 0 },
 		.mpage = NULL
@@ -937,7 +1106,7 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
 	struct z_erofs_vle_unzip_io io;
 	LIST_HEAD(pagepool);
 
-	int err = z_erofs_do_read_page(page,
+	int err = z_erofs_do_read_page(&fe, page,
 		&h, &m_iter, &owned_head, &pagepool);
 
 	z_erofs_vle_work_iter_end(&h);
@@ -971,6 +1140,8 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 	struct address_space *mapping,
 	struct list_head *pages, unsigned nr_pages, bool sync)
 {
+	struct z_erofs_vle_frontend fe = { .initial = true,
+		.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT };
 	struct erofs_map_blocks_iter m_iter = {
 		.map = { .m_llen = 0, .m_plen = 0 },
 		.mpage = NULL
@@ -1006,7 +1177,7 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 		/* traversal in reverse order */
 		head = (void *)page_private(page);
 
-		err = z_erofs_do_read_page(page,
+		err = z_erofs_do_read_page(&fe, page,
 			&h, &m_iter, &owned_head, &pagepool);
 		if (err) {
 			struct erofs_vnode *vi = EROFS_V(inode);
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 2a446f9..91e4a80 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -16,14 +16,6 @@
 #include "internal.h"
 #include "unzip_pagevec.h"
 
-#ifdef CONFIG_EROFS_FS_ZIP_BIDIRECTIONAL
-#define EROFS_FS_ZIP_CACHE_LVL	(2)
-#elif defined(CONFIG_EROFS_FS_ZIP_UNIDIRECTIONAL)
-#define EROFS_FS_ZIP_CACHE_LVL	(1)
-#else
-#define EROFS_FS_ZIP_CACHE_LVL	(0)
-#endif
-
 /*
  * Structure fields follow one of the following exclusion rules.
  *
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (8 preceding siblings ...)
  2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-17 14:18 ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (10 more replies)
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  10 siblings, 11 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


TODO List:
 - further minor cleanup
 - bugfix
 - will be stable this week

change log v1:
 - introduce cached decompression in order to boost random read
 - several bugfixes
 *******************************************************************
 * In this version, we have an amazing seq & rand read performance *
 *******************************************************************
change log v0.7:
 - several bugfixes (buffer overflow, shrinker, ownership, etc.)
 - all features available
 - it works now, but more work is needed on random read compared
   with the old decompression version.

change log v0.6:
 - preliminary work (could boot into the launcher)
 - still has minor bugs to fix

change log v0.5:
 - add reclaim path
 - almost works, still debugging

change log v0.4:
 - bugfixes (runnable now for small files)
 - separated into one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compiling warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - separated into several small patches, maybe more in a future patchset

change log v0.2:
 - use the recently introduced tagptr_t type to manage tagged pointers.
 - bugfixes


Gao Xiang (11):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce superblock registration
  erofs: introduce erofs shrinker
  erofs: introduce workstation for decompression
  erofs: introduce VLE decompression support
  erofs: introduce cached decompression

 fs/erofs/Kconfig         |   62 ++
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |  200 ++++++
 fs/erofs/staging.h       |   50 ++
 fs/erofs/super.c         |  132 +++-
 fs/erofs/unzip_pagevec.h |  172 +++++
 fs/erofs/unzip_vle.c     | 1572 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  219 +++++++
 fs/erofs/unzip_vle_lz4.c |  209 ++++++
 fs/erofs/utils.c         |  270 ++++++++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 ++++
 15 files changed, 3034 insertions(+), 51 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [RFC PATCH v1 01/11] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 02/11] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (9 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Currently the kernel has scattered tagged-pointer usages hand-coded
in plain code, without a unified and portable set of functions to
make tagged pointers stand out and to wrap such hand-rolled code, so
that the meaningless magic masks spread all over can be cleaned up.

Therefore, this patch introduces simple generic methods to fold tags
into a pointer integer. It currently supports storing tags in the
last n bits of the pointer, where n can be selected by users.

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily uses the tagged-pointer approach for high performance
and to reduce extra memory allocation.

Link: https://en.wikipedia.org/wiki/Tagged_pointer
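
A usage sketch of the helpers introduced below: struct foo and the
encode/decode wrappers are illustrative only, and the example assumes
the folded pointer is at least 2-byte aligned so that the low tag bit
is free.

/* usage sketch: tagptr1_t carries one tag bit alongside the pointer */
#include <linux/tagptr.h>

struct foo { int bar; };	/* illustrative structure */

static tagptr1_t foo_encode(struct foo *p, bool dirty)
{
	/* fold the "dirty" flag into the unused low bit of the pointer */
	return tagptr_fold(tagptr1_t, p, dirty);
}

static struct foo *foo_decode(tagptr1_t t, bool *dirty)
{
	/* recover the flag and the original pointer */
	*dirty = tagptr_unfold_tags(t);
	return tagptr_unfold_ptr(t);
}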

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the names of tagged pointer types are tagptr{1, 2, 3...}_t;
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
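
Here is a minimal sketch of how the <linux/tagptr.h> helpers above can be
used; the struct, flag and function names are made up for illustration and do
not appear in the patch:

/* hypothetical caller of <linux/tagptr.h>; all names are illustrative */
#define MY_FLAG_DIRTY	1	/* must fit into the 2 tag bits of tagptr2_t */

struct my_obj { int payload; };

static tagptr2_t pack_obj(struct my_obj *obj, bool dirty)
{
	/* fold the pointer and its tag bits into a single word */
	return tagptr_fold(tagptr2_t, obj, dirty ? MY_FLAG_DIRTY : 0);
}

static struct my_obj *unpack_obj(tagptr2_t t, bool *dirty)
{
	*dirty = tagptr_unfold_tags(t) & MY_FLAG_DIRTY;
	/* tagptr_unfold_ptr() masks the tag bits off again */
	return tagptr_unfold_ptr(t);
}

Since tagptr2_t keeps its tags in the two low bits, the folded pointer must be
at least 4-byte aligned, which kmalloc'ed objects and struct page pointers
always are.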

* [RFC PATCH v1 02/11] erofs: introduce pagevec for unzip subsystem
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 03/11] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (8 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that ctor->next can never be equal to 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that ctor->next can never be equal to 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
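
Here is a rough sketch of the calling pattern the collector above is designed
for; the 3-slot on-stack vector, the "page" variable and the bookkeeping are
illustrative and not taken from the series:

/* illustrative only: collect pages into an inline vector, then drain it */
erofs_vtptr_t pagevec[3];
struct z_erofs_pagevec_ctor ctor;
unsigned nr_enqueued = 0, i;
bool occupied;

z_erofs_pagevec_ctor_init(&ctor, ARRAY_SIZE(pagevec), pagevec, 0);
if (z_erofs_pagevec_ctor_enqueue(&ctor, page,
				 Z_EROFS_PAGE_TYPE_EXCLUSIVE, &occupied))
	++nr_enqueued;
z_erofs_pagevec_ctor_exit(&ctor, false);

/* ... later, typically on the decompression side ... */
z_erofs_pagevec_ctor_init(&ctor, ARRAY_SIZE(pagevec), pagevec, 0);
for (i = 0; i < nr_enqueued; ++i) {
	enum z_erofs_page_type type;
	struct page *p = z_erofs_pagevec_ctor_dequeue(&ctor, &type);

	pr_debug("dequeued page %pK, type %d\n", p, type);
}
z_erofs_pagevec_ctor_exit(&ctor, true);

When the inline slots run out, z_erofs_pagevec_ctor_pagedown() reuses one of
the already collected exclusive pages as storage for further entries, which is
why the exclusive page type has to be distinguishable from the other types.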

* [RFC PATCH v1 03/11] erofs: add erofs_map_blocks_iter
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 02/11] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 04/11] erofs: add erofs_allocpage Gao Xiang
                     ` (7 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  10 +++
 fs/erofs/Makefile    |   1 +
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9c8696e..ffbd5eb 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,3 +75,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use the compression feature, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e52252f..2377cf4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -73,6 +73,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..456ef4a
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a variable amount of logical
+ * (uncompressed) data into a fixed-size physical cluster.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = (unsigned long long)lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size - 1;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* [walking mode] 'map' has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto unmap_out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto exact_hitted;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+exact_hitted:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+unmap_out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+out:
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
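
For orientation, a hedged sketch of how a caller could walk the logical
extents of a VLE-compressed inode with erofs_map_blocks_iter(); "inode" is
assumed to be such an inode and error handling is kept to a minimum:

/* illustrative walking loop; not taken from the series */
struct erofs_map_blocks map = { .m_la = 0, .m_llen = 0 };
struct page *mpage = NULL;	/* meta page cached across iterations */
erofs_off_t pos = 0;
int err;

while (pos < inode->i_size) {
	map.m_la = pos;
	map.m_llen = 0;	/* 0 requests a fresh lookup at m_la */

	err = erofs_map_blocks_iter(inode, &map, &mpage, 0);
	if (err)
		break;

	if (map.m_flags & EROFS_MAP_MAPPED)
		pr_debug("la %llu -> pa %llu (%s)\n",
			 map.m_la, map.m_pa,
			 map.m_flags & EROFS_MAP_ZIPPED ?
				"compressed" : "plain");

	pos = map.m_la + map.m_llen;
}
if (mpage != NULL)
	put_page(mpage);

Caching the last meta page in *mpage_ret is what keeps the iterator cheap:
consecutive logical clusters usually live in the same index block, so the same
page does not have to be looked up again for every call.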

* [RFC PATCH v1 04/11] erofs: add erofs_allocpage
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 03/11] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 05/11] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                     ` (6 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2377cf4..07bab28 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -381,5 +381,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
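
A small, hypothetical sketch of the calling pattern erofs_allocpage() is
designed for: the caller keeps a private page pool threaded through page->lru,
draws pages from it and drains it at the end (GFP_KERNEL and the bounce-buffer
use are assumptions of this example):

/* illustrative only */
LIST_HEAD(pagepool);
struct page *page;

page = erofs_allocpage(&pagepool, GFP_KERNEL);
/* ... use the page, e.g. as a temporary bounce buffer ... */

/* recycle it into the pool instead of freeing it immediately */
list_add(&page->lru, &pagepool);

/* finally, release whatever is left in the pool */
while (!list_empty(&pagepool)) {
	page = lru_to_page(&pagepool);
	list_del(&page->lru);
	put_page(page);
}

Going through the pool is meant to avoid repeated alloc_pages()/put_page()
round trips on the hot decompression path.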

* [RFC PATCH v1 05/11] erofs: globalize prepare_bio and __submit_bio
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 04/11] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 06/11] erofs: add a generic z_erofs VLE decompressor Gao Xiang
                     ` (5 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index de99217..2efe69f 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -47,33 +47,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -96,7 +69,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -237,6 +210,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -254,6 +229,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -261,8 +239,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -271,7 +248,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -295,7 +272,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 07bab28..e60f535 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -291,6 +291,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
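
A condensed sketch of the call sequence the now-global helpers are intended
for; sb, blkaddr, nr_blocks and the pages[] array are placeholders, and
REQ_OP_READ assumes a reasonably recent kernel:

/* illustrative multi-page read; error handling trimmed */
struct bio *bio = prepare_bio(sb, blkaddr, nr_blocks, read_endio);
unsigned i;

for (i = 0; i < nr_blocks; ++i)
	if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) != PAGE_SIZE)
		break;	/* submit what fits; the rest needs a new bio */

__submit_bio(bio, REQ_OP_READ, 0);

Exporting prepare_bio() with an explicit end_io argument lets the upcoming
unzip code install its own completion handler instead of the raw read_endio()
used by the uncompressed path.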

* [RFC PATCH v1 06/11] erofs: add a generic z_erofs VLE decompressor
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (4 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 05/11] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 07/11] erofs: introduce superblock registration Gao Xiang
                     ` (4 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  14 ++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  35 ++++++++
 fs/erofs/unzip_vle_lz4.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index ffbd5eb..00e811c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -85,3 +85,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates the maximum number of physical pages that
+	  a VLE compressed cluster can contain.
+
+	  For example, if the files of an image are compressed
+	  in 8k units, the hard limit must not be less than 2;
+	  otherwise the image cannot be mounted correctly on
+	  this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e60f535..038d77b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -162,6 +162,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..8e23e44
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned outlen, unsigned short pageofs,
+	void (*endio)(struct page *));
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..fda8e6d
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	j = 0;
+	for(i = 0; i < nr_pages; j = i++) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL) {
+				if (!mirrored[j])
+					kunmap_atomic(src);
+				src = NULL;
+			}
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(mirrored[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			mirrored[j] = true;
+			break;
+		}
+
+		if (i) {
+			if (src == NULL)
+				src = mirrored[i-1] ?
+					percpu_data + (i-1) * PAGE_SIZE :
+					kmap_atomic(compressed_pages[i-1]);
+
+			memcpy(dst, src + righthalf, pageofs);
+
+			if (!mirrored[i-1])
+				kunmap_atomic(src);
+
+			if (unlikely(i >= clusterpages)) {
+				kunmap_atomic(dst);
+				break;
+			}
+		}
+
+		if (!righthalf)
+			src = NULL;
+		else {
+			src = mirrored[i] ? percpu_data + i * PAGE_SIZE :
+				kmap_atomic(compressed_pages[i]);
+
+			memcpy(dst + pageofs, src, righthalf);
+		}
+
+		kunmap_atomic(dst);
+	}
+
+	if (src != NULL && !mirrored[j])
+		kunmap_atomic(src);
+
+	preempt_enable();
+	return 0;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned outlen,
+				  unsigned short pageofs,
+				  void (*endio)(struct page *))
+{
+	void *vin, *vout;
+	unsigned nr_pages, i, j;
+	int ret;
+
+	if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE)
+		return -ENOTSUPP;
+
+	nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE);
+
+	if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else
+		vin = erofs_vmap(compressed_pages, clusterpages);
+
+	preempt_disable();
+	vout = erofs_pcpubuf[smp_processor_id()].data;
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, outlen);
+
+	if (ret >= 0) {
+		outlen = ret;
+		ret = 0;
+	}
+
+	for(i = 0; i < nr_pages; ++i) {
+		j = min((unsigned)PAGE_SIZE - pageofs, outlen);
+
+		if (pages[i] != NULL) {
+			if (ret < 0)
+				SetPageError(pages[i]);
+			else if (clusterpages == 1 && pages[i] == compressed_pages[0])
+				memcpy(vin + pageofs, vout + pageofs, j);
+			else {
+				void *dst = kmap_atomic(pages[i]);
+
+				memcpy(dst + pageofs, vout + pageofs, j);
+				kunmap_atomic(dst);
+			}
+			endio(pages[i]);
+		}
+		vout += PAGE_SIZE;
+		outlen -= j;
+		pageofs = 0;
+	}
+	preempt_enable();
+
+	if (clusterpages == 1)
+		kunmap_atomic(vin);
+	else
+		erofs_vunmap(vin, clusterpages);
+
+	return ret;
+}
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE * i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
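
A hedged sketch of driving the vmap fallback decompressor above for a
single-page compressed cluster; compressed_pages, output_pages,
nr_output_pages and output_len are placeholders for the caller's buffers:

/* illustrative only: decompress one cluster into a vmapped buffer */
unsigned clusterpages = 1;
void *vout = erofs_vmap(output_pages, nr_output_pages);
int err;

err = z_erofs_vle_unzip_vmap(compressed_pages, clusterpages,
			     vout, output_len,
			     0 /* pageofs */, false /* not overlapped */);
erofs_vunmap(vout, nr_output_pages);
if (err)
	pr_err("erofs: decompression failed (%d)\n", err);

The per-CPU fast path is preferred whenever outlen + pageofs fits into
EROFS_PERCPU_NR_PAGES pages; z_erofs_vle_unzip_vmap() is the fallback for
larger extents and for overlapped in-place buffers.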

* [RFC PATCH v1 07/11] erofs: introduce superblock registration
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (5 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 06/11] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 08/11] erofs: introduce erofs shrinker Gao Xiang
                     ` (3 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  6 ++++++
 fs/erofs/super.c    |  4 ++++
 fs/erofs/utils.c    | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 038d77b..cc898b4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -65,6 +65,9 @@ struct erofs_fault_info {
 typedef u64 erofs_nid_t;
 
 struct erofs_sb_info {
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -430,5 +433,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 /* utils.c */
 extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
+extern void erofs_register_super(struct super_block *sb);
+extern void erofs_unregister_super(struct super_block *sb);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0dcf9c7..3d286f4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -315,6 +315,8 @@ static int erofs_read_super(struct super_block *sb,
 	snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name);
 	sbi->dev_name[PATH_MAX - 1] = '\0';
 
+	erofs_register_super(sb);
+
 	/*
 	 * We already have a positive dentry, which was instantiated
 	 * by d_make_root. Just need to d_rehash it.
@@ -352,6 +354,8 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	erofs_unregister_super(sb);
+
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dce5177..78731c5 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,3 +29,20 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+static DEFINE_MUTEX(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_register_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_unregister_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_del(&EROFS_SB(sb)->list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v1 08/11] erofs: introduce erofs shrinker
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (6 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 07/11] erofs: introduce superblock registration Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 09/11] erofs: introduce workstation for decompression Gao Xiang
                     ` (2 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  7 +++++
 fs/erofs/super.c    | 15 ++++++++++
 fs/erofs/utils.c    | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index cc898b4..e202ef3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -67,6 +67,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -94,6 +95,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -436,5 +438,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3d286f4..00ec621 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -354,7 +354,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -396,6 +398,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -415,6 +423,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -423,6 +435,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -431,6 +445,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 78731c5..685e885 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,20 +29,93 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
-	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
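
To make the callback contract explicit, here is a hedged sketch of roughly
what the VFS shrinker core does with the two hooks above; the budget value is
made up, and NULL is passed because both callbacks ignore their struct
shrinker argument:

/* illustrative only: what the shrinker core roughly does */
struct shrink_control sc = {
	.gfp_mask   = GFP_KERNEL,
	.nr_to_scan = 128,	/* reclaim budget chosen by the core */
};
unsigned long reclaimable, freed = 0;

reclaimable = erofs_shrink_count(NULL, &sc);
if (reclaimable)
	freed = erofs_shrink_scan(NULL, &sc);
pr_debug("erofs shrinker: %lu reclaimable, %lu freed\n",
	 reclaimable, freed);

The per-superblock run number together with list_move_tail() makes one scan
pass visit every mounted instance at most once while still rotating the
starting point for fairness.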

* [RFC PATCH v1 09/11] erofs: introduce workstation for decompression
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (7 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 08/11] erofs: introduce erofs shrinker Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 10/11] erofs: introduce VLE decompression support Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 11/11] erofs: introduce cached decompression Gao Xiang
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/super.c    |  12 ++++++
 fs/erofs/utils.c    |  81 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index e202ef3..ed2e701 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -80,6 +80,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* the dedicated workstation for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} workstn;
 #endif
 
 	u32 build_time_nsec;
@@ -150,6 +158,101 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define erofs_workstn_lock(sbi)         spin_lock(&(sbi)->workstn.lock)
+#define erofs_workstn_unlock(sbi)       spin_unlock(&(sbi)->workstn.lock)
+#else
+#define erofs_workstn_lock(sbi)         xa_lock(&(sbi)->workstn.tree)
+#define erofs_workstn_unlock(sbi)       xa_unlock(&(sbi)->workstn.tree)
+#endif
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+#define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
+
+static inline bool erofs_workgroup_try_to_freeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	if (v != atomic_cmpxchg(&grp->refcount,
+		v, EROFS_LOCKED_MAGIC))
+		return false;
+	preempt_disable();
+#else
+	preempt_disable();
+	if (atomic_read(&grp->refcount) != v) {
+		preempt_enable();
+		return false;
+	}
+#endif
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	atomic_set(&grp->refcount, v);
+#endif
+	preempt_enable();
+}
+
+static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
+{
+	const int locked = (int)EROFS_LOCKED_MAGIC;
+	int o;
+
+repeat:
+	o = atomic_read(&grp->refcount);
+
+	/* spin if it is temporarily locked in the reclaim path */
+	if (unlikely(o == locked)) {
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+		do
+			cpu_relax();
+		while (atomic_read(&grp->refcount) == locked);
+#endif
+		goto repeat;
+	}
+
+	if (unlikely(o <= 0))
+		return -1;
+
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+
+	*ocnt = o;
+	return 0;
+}
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+
+extern int erofs_workgroup_put(struct erofs_workgroup *grp);
+
+extern struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag);
+
+extern int erofs_register_workgroup(struct super_block *sb,
+	struct erofs_workgroup *grp, bool tag);
+
+extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+	unsigned long nr_shrink, bool cleanup);
+
+static inline void erofs_workstation_cleanup_all(struct super_block *sb)
+{
+	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
+}
+
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 00ec621..a631ffe 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -285,6 +285,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->workstn.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -355,6 +362,11 @@ static void erofs_put_super(struct super_block *sb)
 	__putname(sbi->dev_name);
 
 	mutex_lock(&sbi->umount_mutex);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	erofs_workstation_cleanup_all(sb);
+#endif
+
 	erofs_unregister_super(sb);
 	mutex_unlock(&sbi->umount_mutex);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 685e885..ab37072 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,6 +29,83 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+
+/* neither radix_tree nor the upcoming XArray uses tagptr_t yet */
+struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+	int oldcount;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn.tree, index);
+	if (grp != NULL) {
+		*tag = radix_tree_exceptional_entry(grp);
+		grp = (void *)((unsigned long)grp &
+			~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		if (erofs_workgroup_get(grp, &oldcount)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		/* decrease refcount added by erofs_workgroup_put */
+		if (unlikely(oldcount == 1))
+			atomic_long_dec(&erofs_global_shrink_cnt);
+		BUG_ON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp->refcount should never be less than 1 */
+	BUG_ON(!atomic_read(&grp->refcount));
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	erofs_workstn_lock(sbi);
+
+	if (tag)
+		grp = (void *)((unsigned long)grp |
+			1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+	err = radix_tree_insert(&sbi->workstn.tree,
+		grp->index, grp);
+
+	if (!err) {
+		__erofs_workgroup_get(grp);
+	}
+
+	erofs_workstn_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+				       unsigned long nr_shrink,
+				       bool cleanup)
+{
+	return 0;
+}
+
+#endif
 
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -37,9 +114,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
 void erofs_register_super(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -112,6 +186,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		list_move_tail(&sbi->list, &erofs_sb_list);
 		mutex_unlock(&sbi->umount_mutex);
 
+		freed += erofs_shrink_workstation(sbi, nr, false);
 		if (freed >= nr)
 			break;
 	}
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread
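
The refcount-as-lock trick above can be hard to read at first, so here is a
hedged sketch of the pattern a reclaim path is expected to follow; "grp" is a
placeholder, and erofs_shrink_workstation() is still a stub at this point, so
only the locking pattern itself is shown:

/*
 * illustrative only: briefly "freeze" a workgroup by swapping its
 * refcount with EROFS_LOCKED_MAGIC, do work that must not race with
 * erofs_workgroup_get(), then restore the observed count.
 */
int cnt = atomic_read(&grp->refcount);

if (cnt > 0 && erofs_workgroup_try_to_freeze(grp, cnt)) {
	/* concurrent erofs_workgroup_get() callers spin here */

	/* e.g. check whether the workgroup is idle and can be detached */

	erofs_workgroup_unfreeze(grp, cnt);
}

Because EROFS_LOCKED_MAGIC is a negative value that never occurs as a real
reference count, erofs_workgroup_get() can tell "temporarily frozen" apart
from "already released" without a separate lock word.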

* [RFC PATCH v1 10/11] erofs: introduce VLE decompression support
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (8 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 09/11] erofs: introduce workstation for decompression Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  2018-07-17 14:18   ` [RFC PATCH v1 11/11] erofs: introduce cached decompression Gao Xiang
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |    6 +-
 fs/erofs/internal.h  |    6 +
 fs/erofs/staging.h   |   46 +++
 fs/erofs/super.c     |   26 ++
 fs/erofs/unzip_vle.c | 1083 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h |  184 +++++++++
 fs/erofs/utils.c     |   61 ++-
 7 files changed, 1409 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index ed2e701..8dd674f 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -273,6 +273,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -356,6 +359,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..47c9708d 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,49 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+
+	return kvmalloc(n * size, flags);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a631ffe..546a308 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -424,6 +431,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -439,6 +452,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -447,6 +466,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -457,6 +480,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 456ef4a..4966a9d 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,1088 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads; limiting the number of
+	 * threads can improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_vle_work_role {
+	Z_EROFS_VLE_WORK_SECONDARY,
+	Z_EROFS_VLE_WORK_PRIMARY,
+	Z_EROFS_VLE_WORK_PRIMARY_OWNER,
+	Z_EROFS_VLE_WORK_MAX
+};
+
+struct z_erofs_vle_work_builder {
+	enum z_erofs_vle_work_role role;
+
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *curr;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+#define VLE_WORK_BUILDER_INIT()	\
+	{ .curr = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_OWNER }
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_builder *b,
+	struct page *page)
+{
+	while (b->compressed_deficit) {
+		--b->compressed_deficit;
+		if (NULL == cmpxchg(b->compressed_pages++, NULL, page))
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_builder *b,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority to the compressed data storage */
+	if (b->role >= Z_EROFS_VLE_WORK_PRIMARY &&
+		type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(b, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&b->vector,
+		page, type, &occupied);
+	b->curr->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline bool try_to_claim_workgroup(
+	struct z_erofs_vle_workgroup *grp,
+	z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	/* let's claim the following types of workgroups */
+retry:
+	if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) {
+		/* type 1, nil workgroup */
+		if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_NIL, *owned_head))
+			goto retry;
+
+		*owned_head = grp;
+	} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
+		/* type 2, link to the end of an existing chain */
+		if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_TAIL, *owned_head))
+			goto retry;
+
+		*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the owner :) */
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_lookup(struct super_block *sb,
+			pgoff_t idx, unsigned pageofs,
+			struct z_erofs_vle_workgroup **grp_ret,
+			enum z_erofs_vle_work_role *role,
+			z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	bool tag, primary;
+	struct erofs_workgroup *egrp;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	egrp = erofs_find_workgroup(sb, idx, &tag);
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_work(grp, pageofs);
+	primary = true;
+#else
+	BUG();
+#endif
+
+	BUG_ON(work->pageofs != pageofs);
+
+	/*
+	 * lock must be taken first to avoid grp->next == NIL between
+	 * claiming workgroup and adding pages:
+	 *                        grp->next != NIL
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                        mutex_lock(&work->lock)
+	 *                        add all pages to pagevec
+	 *
+	 * [correct locking case 1]:
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[b])     mutex_lock(grp->work[c])
+	 *   ...                          *role = SECONDARY
+	 *                                add all pages to pagevec
+	 *                                ...
+	 *                                mutex_unlock(grp->work[c])
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *
+	 * [correct locking case 2]:
+	 *   mutex_lock(grp->work[b])
+	 *   ...
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                                mutex_lock(grp->work[a])
+	 *                                *role = PRIMARY_OWNER
+	 *                                add all pages to pagevec
+	 *                                ...
+	 */
+	mutex_lock(&work->lock);
+
+	if (!primary)
+		*role = Z_EROFS_VLE_WORK_SECONDARY;
+	/* claim the workgroup if possible */
+	else if (try_to_claim_workgroup(grp, owned_head))
+		*role = Z_EROFS_VLE_WORK_PRIMARY_OWNER;
+	else
+		*role = Z_EROFS_VLE_WORK_PRIMARY;
+
+	return work;
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup **grp_ret,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  enum z_erofs_vle_work_role *role,
+			  z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_workgroup *grp = *grp_ret;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_workgrp_fmt(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_VLE_WORKGRP_FMT_LZ4 :
+			Z_EROFS_VLE_WORKGRP_FMT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	/* new workgrps have been claimed as type 1 */
+	WRITE_ONCE(grp->next, *owned_head);
+	/* primary & owner work role for new workgrps */
+	*role = Z_EROFS_VLE_WORK_PRIMARY_OWNER;
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently unimplemented */
+	BUG();
+#else
+	work = z_erofs_vle_grab_primary_work(grp);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, 0);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = *grp_ret = grp;
+
+	mutex_lock(&work->lock);
+	return work;
+}
+
+static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
+					 unsigned int llen)
+{
+	while (1) {
+		unsigned int orig_llen = grp->llen;
+
+		if (orig_llen >= llen || orig_llen ==
+			cmpxchg(&grp->llen, orig_llen, llen))
+			break;
+	}
+}
+
+#define builder_is_owner(b) ((b)->role >= Z_EROFS_VLE_WORK_PRIMARY_OWNER)
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *w,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	erofs_blk_t index = erofs_blknr(map->m_pa);
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+
+	BUG_ON(w->curr != NULL);
+
+	/* must be Z_EROFS_VLE_WORKGRP_TAIL or a chained workgroup */
+	BUG_ON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+	BUG_ON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	BUG_ON(erofs_blkoff(map->m_pa));
+
+repeat:
+	work = z_erofs_vle_work_lookup(sb, index,
+		pageofs, &grp, &w->role, owned_head);
+	if (work != NULL) {
+		__update_workgrp_llen(grp, map->m_llen);
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, &grp,
+		map, index, pageofs, &w->role, owned_head);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+got_it:
+	z_erofs_pagevec_ctor_init(&w->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+
+	if (w->role >= Z_EROFS_VLE_WORK_PRIMARY) {
+		/* enable possibly in-place decompression */
+		w->compressed_pages = grp->compressed_pages;
+		w->compressed_deficit = clusterpages;
+	} else {
+		w->compressed_pages = NULL;
+		w->compressed_deficit = 0;
+	}
+
+	w->grp = grp;
+	w->curr = work;
+	return 0;
+}
+
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->work;
+
+	call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp,
+	struct z_erofs_vle_work *work __maybe_unused)
+{
+	erofs_workgroup_put(&grp->obj);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	__z_erofs_vle_work_release(grp, work);
+}
+
+static inline bool
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder)
+{
+	struct z_erofs_vle_work *work = builder->curr;
+
+	if (work == NULL)
+		return false;
+
+	z_erofs_pagevec_ctor_exit(&builder->vector, false);
+	mutex_unlock(&work->lock);
+
+	/*
+	 * once all pending pages are added, drop the work reference
+	 * unless the current builder is the owner of the workgroup.
+	 */
+	if (!builder_is_owner(builder))
+		__z_erofs_vle_work_release(builder->grp, work);
+
+	builder->curr = NULL;
+	builder->grp = NULL;
+	return true;
+}
+
+struct z_erofs_vle_frontend {
+	struct inode *const inode;
+
+	struct z_erofs_vle_work_builder builder;
+	struct erofs_map_blocks_iter m_iter;
+
+	z_erofs_vle_owned_workgrp_t owned_head;
+
+	bool initial;
+};
+
+#define VLE_FRONTEND_INIT(__i) { \
+	.inode = __i, \
+	.m_iter = { \
+		{ .m_llen = 0, .m_plen = 0 }, \
+		.mpage = NULL \
+	}, \
+	.builder = VLE_WORK_BUILDER_INIT(), \
+	.owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \
+	.initial = true, }
+
+static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
+				struct page *page,
+				struct list_head *page_pool)
+{
+	struct super_block *const sb = fe->inode->i_sb;
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	struct erofs_map_blocks_iter *const m = &fe->m_iter;
+	struct erofs_map_blocks *const map = &m->map;
+	struct z_erofs_vle_work_builder *const builder = &fe->builder;
+	const loff_t offset = page_offset(page);
+
+	bool owned = builder_is_owner(builder);
+	struct z_erofs_vle_work *work = builder->curr;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= map->m_la &&
+            offset + cur < map->m_la + map->m_llen)
+		goto hitted;
+
+	/* go on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (!z_erofs_vle_work_iter_end(builder))
+		fe->initial = false;
+
+	map->m_la = offset + cur;
+	map->m_llen = 0;
+	err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(map->m_plen != 1 << sbi->clusterbits);
+	BUG_ON(erofs_blkoff(map->m_pa));
+
+	err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	owned &= builder_is_owner(builder);
+	work = builder->curr;
+hitted:
+	cur = end - min_t(unsigned, offset + end - map->m_la, end);
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(owned ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(builder, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(builder,
+			newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - map->m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	map->m_llen = offset + cur - map->m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, map->m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool background = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (background)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		bool cachedpage = false;
+
+		DBG_BUGON(PageUptodate(page));
+
+		if (unlikely(err))
+			SetPageError(page);
+		else if (cachedpage)
+			SetPageUptodate(page);
+
+		if (cachedpage)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_workgroup *grp,
+	struct list_head *page_pool)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	unsigned clusterpages = erofs_clusterpages(sbi);
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	unsigned sparsemem_pages = 0;
+#endif
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_work *work;
+	void *vout;
+	int err;
+
+	might_sleep();
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_primary_work(grp);
+#else
+	BUG();
+#endif
+	BUG_ON(!READ_ONCE(work->nr_pages));
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc_array(nr_pages,
+			sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	compressed_pages = grp->compressed_pages;
+
+	for(i = 0; i < clusterpages; ++i) {
+		unsigned pagenr;
+
+		BUG_ON(compressed_pages[i] == NULL);
+		page = compressed_pages[i];
+
+		if (page->mapping == NULL)
+			continue;
+
+		pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+
+		overlapped = true;
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs,
+		z_erofs_onlinepage_endio);
+	if (err != -ENOTSUPP)
+		goto out_percpu;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (sparsemem_pages >= nr_pages) {
+		BUG_ON(sparsemem_pages > nr_pages);
+		goto skip_allocpage;
+	}
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+out_percpu:
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	/* all work locks MUST be taken before this point */
+
+	WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL);
+
+	/* all work locks SHOULD be released right now */
+	mutex_unlock(&work->lock);
+
+	z_erofs_vle_work_release(work);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	z_erofs_vle_owned_workgrp_t owned = io->head;
+
+	while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) {
+		struct z_erofs_vle_workgroup *grp;
+
+		/* impossible that 'owned' equals Z_EROFS_VLE_WORKGRP_TAIL */
+		BUG_ON(owned == Z_EROFS_VLE_WORKGRP_TAIL);
+
+		/* impossible that 'owned' equals NULL */
+		BUG_ON(owned == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned;
+		owned = READ_ONCE(grp->next);
+
+		z_erofs_vle_unzip(sb, grp, page_pool);
+	}
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline struct z_erofs_vle_unzip_io *
+prepare_io_handler(struct super_block *sb,
+		   struct z_erofs_vle_unzip_io *io,
+		   bool background)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	if (!background) {
+		/* waitqueue available for foreground io */
+		BUG_ON(io == NULL);
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+		goto out;
+	}
+
+	if (io != NULL)
+		BUG();
+	else {
+		/* allocate extra io descriptor for background io */
+		iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+			GFP_KERNEL | __GFP_NOFAIL);
+		BUG_ON(iosb == NULL);
+
+		io = &iosb->io;
+	}
+
+	iosb->sb = sb;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+	io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED;
+	return io;
+}
+
+#define __FSIO_1 0
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_vle_owned_workgrp_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_vle_unzip_io *fg_io,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+	const gfp_t gfp = GFP_NOFS;
+	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
+	struct bio *bio;
+	tagptr1_t bi_private;
+	pgoff_t last_index;
+	bool force_submit = false;
+	unsigned nr_bios;
+
+	if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL))
+		return false;
+
+	/*
+	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
+         * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
+	 */
+	if (force_fg) {
+		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
+	} else {
+		ios[__FSIO_1] = prepare_io_handler(sb, NULL, true);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1);
+	}
+
+	nr_bios = 0;
+	force_submit = false;
+	bio = NULL;
+
+	/* by default, all need io submission */
+	ios[__FSIO_1]->head = owned_head;
+
+	do {
+		struct z_erofs_vle_workgroup *grp;
+		struct page **compressed_pages, *oldpage, *page;
+		pgoff_t first_index;
+		unsigned i = 0;
+		int err;
+
+		/* impossible that 'owned_head' equals the following */
+		BUG_ON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+		BUG_ON(owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned_head;
+		/* close the owned chain first */
+		owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
+			Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+		first_index = grp->obj.index;
+		compressed_pages = grp->compressed_pages;
+
+		force_submit |= (first_index != last_index + 1);
+repeat:
+		/* fulfill all compressed pages */
+		oldpage = page = READ_ONCE(compressed_pages[i]);
+
+		if (page != NULL)
+			BUG_ON(PageUptodate(page));
+		else {
+			page = erofs_allocpage(pagepool, gfp);
+			page->mapping = NULL;
+
+			if (oldpage != cmpxchg(compressed_pages + i,
+				oldpage, page)) {
+				list_add(&page->lru, pagepool);
+				goto repeat;
+			}
+		}
+
+		if (bio != NULL && force_submit) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+
+		if (bio == NULL) {
+			bio = prepare_bio(sb, first_index + i,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(bi_private);
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+		if (++i < clusterpages)
+			goto repeat;
+	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	BUG_ON(!nr_bios);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct super_block *sb = f->inode->i_sb;
+	struct z_erofs_vle_unzip_io io[1 + __FSIO_1];
+
+	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
+		return;
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[__FSIO_1].u.wait,
+		!atomic_read(&io[__FSIO_1].pending_bios));
+
+	/* let's do synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct z_erofs_vle_frontend f
+		= VLE_FRONTEND_INIT(page->mapping->host);
+	int err;
+	LIST_HEAD(pagepool);
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	z_erofs_submit_and_unzip(&f, &pagepool, true);
+out:
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct inode *const inode = mapping->host;
+
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traverse in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	z_erofs_submit_and_unzip(&f, &pagepool, sync);
+
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 8e23e44..9630022 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,193 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ * L: Protected by the work->lock.
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;
+
+	atomic_t refcount;
+#endif
+	struct mutex lock;
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
+
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	struct z_erofs_vle_work work;
+
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
+
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+	unsigned int llen, flags;
+};
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
+
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
+
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK);
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is not implemented yet
+#else
+
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+
+#endif
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_vle_owned_workgrp_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ab37072..dd1ce5f 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
 
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+	}
+	return count;
+}
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		first_index = grp->index + 1;
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
 }
 
 #endif
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v1 11/11] erofs: introduce cached decompression
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (9 preceding siblings ...)
  2018-07-17 14:18   ` [RFC PATCH v1 10/11] erofs: introduce VLE decompression support Gao Xiang
@ 2018-07-17 14:18   ` Gao Xiang
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-17 14:18 UTC (permalink / raw)


This patch adds cached decompression as a complement to
in-place decompression in order to boost random read performance.
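
As a rough illustration only (not part of the patch itself): before
submitting I/O for a compressed cluster, each compressed block is first
probed in the page cache of a per-superblock "managed cache" inode, and
blocks found there can skip I/O entirely. The sketch below is simplified
from grab_managed_cache_pages() in this patch and omits error handling
and the EROFS_UNALLOCATED_CACHED_PAGE reservation path:

/*
 * Simplified sketch of the cached-page probe; names are taken from the
 * patch below, reservation and error handling are intentionally omitted.
 */
static bool probe_cached_pages(struct address_space *mapping,
			       erofs_blk_t start,
			       struct page **compressed_pages,
			       unsigned int clusterblks)
{
	bool noio = true;
	unsigned int i;

	for (i = 0; i < clusterblks; ++i) {
		struct page *found;

		/* already fulfilled, e.g. reused as an in-place page */
		if (READ_ONCE(compressed_pages[i]) != NULL)
			continue;

		found = find_get_page(mapping, start + i);
		if (found == NULL) {
			/* this block still needs real I/O */
			noio = false;
			continue;
		}

		/* publish the cached page unless someone was faster */
		if (cmpxchg(compressed_pages + i, NULL, found) != NULL)
			put_page(found);
	}
	/* true means the whole cluster can be decompressed without I/O */
	return noio;
}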

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  38 ++++++++
 fs/erofs/internal.h  |  25 +++++
 fs/erofs/super.c     |  75 ++++++++++++++-
 fs/erofs/unzip_vle.c | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/utils.c     |  17 +++-
 5 files changed, 408 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 00e811c..d08c019 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -99,3 +99,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT
 	  than 2. Otherwise, the image cannot be mounted
 	  correctly on this kernel.
 
+choice
+	prompt "EROFS VLE Data Decompression mode"
+	depends on EROFS_FS_ZIP
+	default EROFS_FS_ZIP_CACHE_BIPOLAR
+	help
+	  EROFS supports three options for VLE decompression.
+	  "In-place Decompression Only" consumes the minimum memory
+	  with the lowest random read performance.
+
+	  "Bipolar Cached Decompression" consumes the maximum memory
+	  with the highest random read performance.
+
+	  If unsure, select "Bipolar Cached Decompression".
+
+config EROFS_FS_ZIP_NO_CACHE
+	bool "In-place Decompression Only"
+	help
+	  Read compressed data into page cache and do in-place
+	  decompression directly.
+
+config EROFS_FS_ZIP_CACHE_UNIPOLAR
+	bool "Unipolar Cached Decompression"
+	help
+	  For each request, it caches the last compressed page
+	  for further reading.
+	  It still decompresses the remaining compressed pages in place.
+
+config EROFS_FS_ZIP_CACHE_BIPOLAR
+	bool "Bipolar Cached Decompression"
+	help
+	  For each request, it caches the compressed pages at both ends
+	  for further reading.
+	  It still decompresses the remaining compressed pages in place.
+
+	  Recommended for performance priority.
+
+endchoice
+
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 8dd674f..b7e01b7 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,18 @@ struct erofs_fault_info {
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR
+#define EROFS_FS_ZIP_CACHE_LVL	(2)
+#elif defined(CONFIG_EROFS_FS_ZIP_CACHE_UNIPOLAR)
+#define EROFS_FS_ZIP_CACHE_LVL	(1)
+#else
+#define EROFS_FS_ZIP_CACHE_LVL	(0)
+#endif
+
+#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0))
+#define EROFS_FS_HAS_MANAGED_CACHE
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -88,6 +100,11 @@ struct erofs_sb_info {
 		spinlock_t lock;
 #endif
 	} workstn;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct inode *managed_cache;
+#endif
+
 #endif
 
 	u32 build_time_nsec;
@@ -251,6 +268,14 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+#define EROFS_UNALLOCATED_CACHED_PAGE	((void *)0x5F0EF00D)
+
+extern int try_to_free_cached_page(struct address_space *, struct page *);
+extern int try_to_free_all_cached_pages(struct erofs_sb_info *,
+	struct erofs_workgroup *);
+#endif
+
 #endif
 
 /* we strictly follow PAGE_SIZE and no buffer head */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 546a308..9414030 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -245,6 +245,63 @@ static int parse_options(struct super_block *sb, char *options)
 	return 0;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static const struct address_space_operations managed_cache_aops;
+
+static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	int ret = 1;	/* 0 - busy */
+	struct address_space *const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping->a_ops != &managed_cache_aops);
+
+	if (PagePrivate(page))
+		ret = try_to_free_cached_page(mapping, page);
+
+	return ret;
+}
+
+static void managed_cache_invalidatepage(struct page *page,
+	unsigned int offset, unsigned int length)
+{
+	const unsigned int stop = length + offset;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Check for overflow */
+	BUG_ON(stop > PAGE_SIZE || stop < length);
+
+	if (offset == 0 && stop == PAGE_SIZE)
+		while (!managed_cache_releasepage(page, GFP_NOFS))
+			cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+	.releasepage = managed_cache_releasepage,
+	.invalidatepage = managed_cache_invalidatepage,
+};
+
+struct inode *erofs_init_managed_cache(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (unlikely(inode == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &managed_cache_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+	                     GFP_NOFS | __GFP_HIGHMEM |
+	                     __GFP_MOVABLE |  __GFP_NOFAIL);
+	return inode;
+}
+
+#endif
+
 static int erofs_read_super(struct super_block *sb,
 	const char *dev_name, void *data, int silent)
 {
@@ -299,11 +356,19 @@ static int erofs_read_super(struct super_block *sb,
 #endif
 #endif
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	sbi->managed_cache = erofs_init_managed_cache(sb);
+	if (IS_ERR(sbi->managed_cache)) {
+		err = PTR_ERR(sbi->managed_cache);
+		goto err_sbi;
+	}
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		goto err_sbi;
+		goto iget_err;
 	}
 
 	if (!S_ISDIR(inode->i_mode)) {
@@ -346,6 +411,10 @@ static int erofs_read_super(struct super_block *sb,
 err_iput:
 	if (sb->s_root == NULL)
 		iput(inode);
+iget_err:
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
 err_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -368,6 +437,10 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
+
 	mutex_lock(&sbi->umount_mutex);
 
 #ifdef CONFIG_EROFS_FS_ZIP
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 4966a9d..689ed3e 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -78,6 +78,107 @@ struct z_erofs_vle_work_builder {
 #define VLE_WORK_BUILDER_INIT()	\
 	{ .curr = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_OWNER }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static bool grab_managed_cache_pages(struct address_space *mapping,
+				     erofs_blk_t start,
+				     struct page **compressed_pages,
+				     int clusterblks,
+				     bool reserve_allocation)
+{
+	bool noio = true;
+	unsigned int i;
+
+	/* TODO: optimize by introducing find_get_pages_range */
+	for (i = 0; i < clusterblks; ++i) {
+		struct page *page, *found;
+
+		if (READ_ONCE(compressed_pages[i]) != NULL)
+			continue;
+
+		page = found = find_get_page(mapping, start + i);
+		if (found == NULL) {
+			noio = false;
+			if (!reserve_allocation)
+				continue;
+			page = EROFS_UNALLOCATED_CACHED_PAGE;
+		}
+
+		if (NULL == cmpxchg(compressed_pages + i, NULL, page))
+			continue;
+
+		if (found != NULL)
+			put_page(found);
+	}
+	return noio;
+}
+
+int try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+				 struct erofs_workgroup *egrp)
+{
+	struct z_erofs_vle_workgroup *const grp =
+		container_of(egrp, struct z_erofs_vle_workgroup, obj);
+	struct address_space *const mapping = sbi->managed_cache->i_mapping;
+	const int clusterpages = erofs_clusterpages(sbi);
+	int i;
+
+	/*
+	 * the refcount of the workgroup is now frozen at 1,
+	 * therefore no need to worry about in-flight decompression users.
+	 */
+	for (i = 0; i < clusterpages; ++i) {
+		struct page *page = grp->compressed_pages[i];
+
+		if (page == NULL || page->mapping != mapping)
+			continue;
+
+		/* prevent the page from being reclaimed or migrated */
+		if (!trylock_page(page))
+			return -EBUSY;
+
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+	return 0;
+}
+
+int try_to_free_cached_page(struct address_space *mapping, struct page *page)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_vle_workgroup *grp;
+	int ret = 0;	/* 0 - busy */
+
+	/* prevent the workgroup from being freed */
+	rcu_read_lock();
+	grp = (void *)page_private(page);
+
+	if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
+		unsigned i;
+
+		for (i = 0; i < clusterpages; ++i) {
+			if (grp->compressed_pages[i] == page) {
+				WRITE_ONCE(grp->compressed_pages[i], NULL);
+				ret = 1;
+				break;
+			}
+		}
+		erofs_workgroup_unfreeze(&grp->obj, 1);
+	}
+	rcu_read_unlock();
+
+	if (ret) {
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+	return ret;
+}
+#endif
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
 	struct z_erofs_vle_work_builder *b,
@@ -415,6 +516,9 @@ struct z_erofs_vle_frontend {
 	z_erofs_vle_owned_workgrp_t owned_head;
 
 	bool initial;
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	erofs_off_t cachedzone_la;
+#endif
 };
 
 #define VLE_FRONTEND_INIT(__i) { \
@@ -480,6 +584,28 @@ static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
 	if (unlikely(err))
 		goto err_out;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	else {
+		struct z_erofs_vle_workgroup *grp = fe->builder.grp;
+		struct address_space *mapping = sbi->managed_cache->i_mapping;
+
+		/* let's do out-of-order decompression for noio */
+		bool noio_outoforder = grab_managed_cache_pages(mapping,
+			erofs_blknr(map->m_pa),
+			grp->compressed_pages, erofs_blknr(map->m_plen),
+			fe->initial
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+			| (map->m_la <= fe->cachedzone_la)
+#endif
+		);
+
+		if (noio_outoforder && builder_is_owner(builder)) {
+			__erofs_workgroup_get(&grp->obj);
+			builder->role = Z_EROFS_VLE_WORK_PRIMARY;
+		}
+	}
+#endif
+
 	owned &= builder_is_owner(builder);
 	work = builder->curr;
 hitted:
@@ -577,6 +703,15 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 
 		DBG_BUGON(PageUptodate(page));
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping != NULL) {
+			struct inode *inode = page->mapping->host;
+
+			cachedpage = (inode ==
+				EROFS_SB(inode->i_sb)->managed_cache);
+		}
+#endif
+
 		if (unlikely(err))
 			SetPageError(page);
 		else if (cachedpage)
@@ -690,6 +825,13 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
 		if (page->mapping == NULL)
 			continue;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping->host == sbi->managed_cache) {
+			BUG_ON(PageLocked(page));
+			BUG_ON(!PageUptodate(page));
+			continue;
+		}
+#endif
 
 		pagenr = z_erofs_onlinepage_index(page);
 
@@ -771,6 +913,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 		if (page->mapping == NULL)
 			list_add(&page->lru, page_pool);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		else if (page->mapping->host == sbi->managed_cache)
+			continue;
+#endif
 		WRITE_ONCE(compressed_pages[i], NULL);
 	}
 
@@ -862,7 +1008,32 @@ static void z_erofs_vle_unzip_wq(struct work_struct *work)
 	return io;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* true - unlocked (noio), false - locked (need submit io) */
+static inline bool recover_managed_page(
+	struct z_erofs_vle_workgroup *grp,
+	struct page *page)
+{
+	wait_on_page_locked(page);
+	if (PagePrivate(page) && PageUptodate(page))
+		return true;
+
+	lock_page(page);
+	if (unlikely(!PagePrivate(page))) {
+		set_page_private(page, (unsigned long)grp);
+		SetPagePrivate(page);
+	}
+	if (unlikely(PageUptodate(page))) {
+		unlock_page(page);
+		return true;
+	}
+	return false;
+}
+
+#define __FSIO_1 1
+#else
 #define __FSIO_1 0
+#endif
 
 static bool z_erofs_vle_submit_all(struct super_block *sb,
 				   z_erofs_vle_owned_workgrp_t owned_head,
@@ -873,6 +1044,11 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 	const unsigned clusterpages = erofs_clusterpages(sbi);
 	const gfp_t gfp = GFP_NOFS;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const managed_cache_mapping =
+		sbi->managed_cache->i_mapping;
+	struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL;
+#endif
 	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
 	struct bio *bio;
 	tagptr1_t bi_private;
@@ -887,6 +1063,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
          * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
 	 */
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	ios[0] = prepare_io_handler(sb, fg_io + 0, false);
+#endif
+
 	if (force_fg) {
 		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
 		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
@@ -907,6 +1087,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		struct page **compressed_pages, *oldpage, *page;
 		pgoff_t first_index;
 		unsigned i = 0;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		unsigned noio = 0;
+		bool cachemanaged;
+#endif
 		int err;
 
 		/* no possible 'owned_head' equals the following */
@@ -926,9 +1110,28 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		/* fulfill all compressed pages */
 		oldpage = page = READ_ONCE(compressed_pages[i]);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		cachemanaged = false;
+
+		if (page == EROFS_UNALLOCATED_CACHED_PAGE) {
+			cachemanaged = true;
+			goto do_allocpage;
+		} else if (page != NULL) {
+			if (page->mapping != managed_cache_mapping)
+				BUG_ON(PageUptodate(page));
+			else if (recover_managed_page(grp, page)) {
+				/* page is uptodate, skip io submission */
+				force_submit = true;
+				++noio;
+				goto skippage;
+			}
+		} else {
+do_allocpage:
+#else
 		if (page != NULL)
 			BUG_ON(PageUptodate(page));
 		else {
+#endif
 			page = erofs_allocpage(pagepool, gfp);
 			page->mapping = NULL;
 
@@ -936,6 +1139,12 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 				oldpage, page)) {
 				list_add(&page->lru, pagepool);
 				goto repeat;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+			} else if (cachemanaged && !add_to_page_cache_lru(page,
+				managed_cache_mapping, first_index + i, gfp)) {
+				set_page_private(page, (unsigned long)grp);
+				SetPagePrivate(page);
+#endif
 			}
 		}
 
@@ -959,14 +1168,51 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 
 		force_submit = false;
 		last_index = first_index + i;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skippage:
+#endif
 		if (++i < clusterpages)
 			goto repeat;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (noio < clusterpages)
+			lstgrp_io = grp;
+		else {
+			z_erofs_vle_owned_workgrp_t iogrp_next =
+				owned_head == Z_EROFS_VLE_WORKGRP_TAIL ?
+				Z_EROFS_VLE_WORKGRP_TAIL_CLOSED :
+				owned_head;
+
+			if (lstgrp_io == NULL)
+				ios[1]->head = iogrp_next;
+			else
+				WRITE_ONCE(lstgrp_io->next, iogrp_next);
+
+			if (lstgrp_noio == NULL)
+				ios[0]->head = grp;
+			else
+				WRITE_ONCE(lstgrp_noio->next, grp);
+
+			lstgrp_noio = grp;
+		}
+#endif
 	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
 
 	if (bio != NULL)
 		__submit_bio(bio, REQ_OP_READ, 0);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 	BUG_ON(!nr_bios);
+#else
+	if (lstgrp_noio != NULL)
+		WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	if (!force_fg && !nr_bios) {
+		kvfree(container_of(ios[1],
+			struct z_erofs_vle_unzip_io_sb, io));
+		return true;
+	}
+#endif
 
 	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
 	return true;
@@ -982,6 +1228,9 @@ static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
 	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
 		return;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	z_erofs_vle_unzip_all(sb, &io[0], pagepool);
+#endif
 	if (!force_fg)
 		return;
 
@@ -1001,6 +1250,9 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
 	int err;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = page->index << PAGE_SHIFT;
+#endif
 	err = z_erofs_do_read_page(&f, page, &pagepool);
 	(void)z_erofs_vle_work_iter_end(&f.builder);
 
@@ -1031,6 +1283,9 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 	struct page *head = NULL;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT;
+#endif
 	for (; nr_pages; --nr_pages) {
 		struct page *page = lru_to_page(pages);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dd1ce5f..b669ca3 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -143,13 +143,28 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 		if (cleanup)
 			BUG_ON(cnt != 1);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 		else if (cnt > 1)
+#else
+		if (!erofs_workgroup_try_to_freeze(grp, 1))
+#endif
 			continue;
 
 		if (radix_tree_delete(&sbi->workstn.tree,
-			grp->index) != grp)
+			grp->index) != grp) {
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skip:
+			erofs_workgroup_unfreeze(grp, 1);
+#endif
 			continue;
+		}
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (try_to_free_all_cached_pages(sbi, grp))
+			goto skip;
+
+		erofs_workgroup_unfreeze(grp, 1);
+#endif
 		/* (rarely) grabbed again when freeing */
 		erofs_workgroup_put(grp);
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem
  2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
                   ` (9 preceding siblings ...)
  2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-20  2:52 ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
                     ` (10 more replies)
  10 siblings, 11 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


TODO List:
 - further minor cleanup
 - bugfix
 - will be stable this week

change log v2:
 - fill the commit messages for all patches
 - fix a reference count bug in some cases
 - code cleanup

change log v1:
 - introduce cached decompression in order to boost random read
 - several bugfixes
 *******************************************************************
 * In this version, we have an amazing seq & rand read performance *
 *******************************************************************
change log v0.7:
 - several bugfixes (buffer overflow, shrinker, ownership, etc.)
 - all features available
 - it works now, but more work is needed on random read compared
   with the old decompression version.

change log v0.6:
 - preliminary works (could boot into launcher)
 - still have minor bugs to fix

change log v0.5:
 - add reclaim path
 - almost works, still debugging

change log v0.4:
 - bugfix (runnable now for small files)
 - separate into one more patch
[RESEND]
 - fix according to:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-July/049774.html
 - fix compiling warning:
   Link: https://lists.01.org/pipermail/kbuild-all/2018-June/049647.html
 - rebase code

change log v0.3:
 - separate to several small patches, maybe more in the future patchset

change log v0.2:
 - use the recent introduced tagptr_t type to manage tagged pointers.
 - bugfix


Gao Xiang (11):
  <linux/tagptr.h>: Introduce tagged pointer
  erofs: introduce pagevec for unzip subsystem
  erofs: add erofs_map_blocks_iter
  erofs: add erofs_allocpage
  erofs: globalize prepare_bio and __submit_bio
  erofs: add a generic z_erofs VLE decompressor
  erofs: introduce superblock registration
  erofs: introduce erofs shrinker
  erofs: introduce workstation for decompression
  erofs: introduce VLE decompression support
  erofs: introduce cached decompression

 fs/erofs/Kconfig         |   62 ++
 fs/erofs/Makefile        |    3 +-
 fs/erofs/data.c          |   41 +-
 fs/erofs/inode.c         |    6 +-
 fs/erofs/internal.h      |  200 ++++++
 fs/erofs/staging.h       |   50 ++
 fs/erofs/super.c         |  132 +++-
 fs/erofs/unzip_pagevec.h |  172 +++++
 fs/erofs/unzip_vle.c     | 1572 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/unzip_vle.h     |  219 +++++++
 fs/erofs/unzip_vle_lz4.c |  209 ++++++
 fs/erofs/utils.c         |  270 ++++++++
 fs/file.c                |   24 +-
 include/linux/file.h     |   15 +-
 include/linux/tagptr.h   |  110 ++++
 15 files changed, 3034 insertions(+), 51 deletions(-)
 create mode 100644 fs/erofs/unzip_pagevec.h
 create mode 100644 fs/erofs/unzip_vle.c
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c
 create mode 100644 fs/erofs/utils.c
 create mode 100644 include/linux/tagptr.h

-- 
1.9.1

^ permalink raw reply	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 01/11] <linux/tagptr.h>: Introduce tagged pointer
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 02/11] erofs: introduce pagevec for unzip subsystem Gao Xiang
                     ` (9 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


Currently, the kernel has scattered tagged pointer usages hand-coded
in plain code, without a unified and portable set of functions to
highlight the tagged pointer itself and wrap such hand-coded logic,
in order to clean up the meaningless magic masks spread all over.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. It currently supports the last n bits
of the pointer for tags, which can be selected by users.
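
As a rough usage sketch (not part of this patch; it only exercises the
helpers introduced below), folding and unfolding a pointer with a 2-bit
tag looks like this:

#include <linux/tagptr.h>

/* fold a pointer and a 2-bit tag into a single tagptr2_t word */
static tagptr2_t pack(void *ptr, unsigned int tags)
{
	return tagptr_fold(tagptr2_t, ptr, tags);
}

/* decode both parts again */
static void *unpack(tagptr2_t t, unsigned int *tags)
{
	*tags = tagptr_unfold_tags(t);	/* the low 2 bits */
	return tagptr_unfold_ptr(t);	/* pointer with tag bits masked off */
}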

In addition, it will also be used by the upcoming EROFS filesystem,
which heavily uses the tagged pointer approach for high performance
and to reduce extra memory allocations.

Link: https://en.wikipedia.org/wiki/Tagged_pointer

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/file.c              |  24 ++++++-----
 include/linux/file.h   |  15 ++++---
 include/linux/tagptr.h | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/tagptr.h

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..c54cb50 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -727,42 +727,44 @@ struct file *fget_raw(unsigned int fd)
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static fdtagptr_t __fget_light(unsigned int fd, fmode_t mask)
 {
+	const fdtagptr_t nil = tagptr_init(fdtagptr_t, NULL);
 	struct files_struct *files = current->files;
 	struct file *file;
 
 	if (atomic_read(&files->count) == 1) {
 		file = __fcheck_files(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, 0);
 	} else {
 		file = __fget(fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return nil;
+		return tagptr_fold(fdtagptr_t, file, FDPUT_FPUT);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+
+fdtagptr_t __fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
 EXPORT_SYMBOL(__fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+fdtagptr_t __fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+fdtagptr_t __fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	fdtagptr_t v = __fdget(fd);
+	struct file *file = tagptr_unfold_ptr(v);
 
 	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
 		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
+			tagptr_set_tags(&v, FDPUT_POS_UNLOCK);
 			mutex_lock(&file->f_pos_lock);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720d..e2bb489 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/posix_types.h>
+#include <linux/tagptr.h>
 
 struct file;
 
@@ -34,6 +35,9 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+/* tagged pointer for fd */
+typedef tagptr2_t	fdtagptr_t;
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
@@ -42,14 +46,15 @@ static inline void fdput(struct fd fd)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
+extern fdtagptr_t __fdget(unsigned int fd);
+extern fdtagptr_t __fdget_raw(unsigned int fd);
+extern fdtagptr_t __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
+static inline struct fd __to_fd(fdtagptr_t v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){ tagptr_unfold_ptr(v),
+		tagptr_unfold_tags(v) };
 }
 
 static inline struct fd fdget(unsigned int fd)
diff --git a/include/linux/tagptr.h b/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 02/11] erofs: introduce pagevec for unzip subsystem
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 03/11] erofs: add erofs_map_blocks_iter Gao Xiang
                     ` (8 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


For each compressed cluster, a straightforward approach is to
allocate a fixed or variable-sized (for VLE) array recording the
corresponding file pages for its decompression if we decide to
decompress these pages asynchronously (e.g. the read-ahead case).
However, that could cost much extra on-heap memory compared with
traditional uncompressed filesystems.

This patch introduces a pagevec solution that reuses some of the
allocated file pages in a time-sharing manner to store parts of the
array itself, minimizing the extra memory overhead; only a constant,
small-sized array is then needed to bootstrap the whole array.
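
As a rough sketch only (not part of this patch), collecting file pages
with the new pagevec ctor could look like this; NR_INLINE and the page
source are assumptions for illustration:

#define NR_INLINE	3	/* bootstrap vectors kept on stack */

static void foo_collect(struct page **file_pages, unsigned int nr)
{
	erofs_vtptr_t inline_vecs[NR_INLINE];
	struct z_erofs_pagevec_ctor ctor;
	unsigned int i;
	bool occupied;

	z_erofs_pagevec_ctor_init(&ctor, NR_INLINE, inline_vecs, 0);
	for (i = 0; i < nr; ++i)
		/* EXCLUSIVE pages can also serve as further vector storage */
		z_erofs_pagevec_ctor_enqueue(&ctor, file_pages[i],
					     Z_EROFS_PAGE_TYPE_EXCLUSIVE,
					     &occupied);
	z_erofs_pagevec_ctor_exit(&ctor, false);
}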

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/unzip_pagevec.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 fs/erofs/unzip_pagevec.h

diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..4633b15
--- /dev/null
+++ b/fs/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that ctor->next never equals 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that ctor->next never equals 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 03/11] erofs: add erofs_map_blocks_iter
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 02/11] erofs: introduce pagevec for unzip subsystem Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 04/11] erofs: add erofs_allocpage Gao Xiang
                     ` (7 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


This patch adds 'erofs_map_blocks_iter', which is designed
for iterable logical-to-physical (L2P) mapping.
Compared with 'erofs_map_blocks', it avoids the redundant
'release and re-grab' cycle when consecutive requests read
the same meta page.
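
A rough sketch of the intended calling pattern (not part of this patch;
error handling omitted) which keeps reusing the meta page across lookups:

static void foo_walk_extents(struct inode *inode)
{
	struct erofs_map_blocks map = { .m_la = 0 };
	struct page *mpage = NULL;

	while (map.m_la < inode->i_size) {
		map.m_llen = 0;		/* request a fresh lookup at m_la */
		if (erofs_map_blocks_iter(inode, &map, &mpage,
					  EROFS_GET_BLOCKS_RAW))
			break;
		/* consume [map.m_la, map.m_la + map.m_llen) ... */
		map.m_la += map.m_llen;
	}
	if (mpage != NULL)
		put_page(mpage);	/* drop the cached meta page */
}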

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  10 +++
 fs/erofs/Makefile    |   1 +
 fs/erofs/data.c      |  39 ++++++++-
 fs/erofs/internal.h  |   4 +
 fs/erofs/unzip_vle.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 286 insertions(+), 4 deletions(-)
 create mode 100644 fs/erofs/unzip_vle.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9c8696e..ffbd5eb 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,3 +75,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 9d7f90a..0b3db0a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index de99217..faaec37 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -189,14 +189,45 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
 	return 0;
 }
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_map_blocks_iter(struct inode *,
+	struct erofs_map_blocks *, struct page **, int);
+#endif
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* by default, reading raw data never uses erofs_map_blocks_iter */
+	if (unlikely(!is_inode_layout_compression(inode))) {
+		if (*mpage_ret != NULL)
+			put_page(*mpage_ret);
+		*mpage_ret = NULL;
+
+		return erofs_map_blocks(inode, map, flags);
+	}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags);
+#else
+	/* data compression is not available */
+	return -ENOTSUPP;
+#endif
+}
+
 int erofs_map_blocks(struct inode *inode,
 	struct erofs_map_blocks *map, int flags)
 {
-	if (!is_inode_layout_compression(inode))
-		return erofs_map_blocks_flatmode(inode, map, flags);
+	if (unlikely(is_inode_layout_compression(inode))) {
+		struct page *mpage = NULL;
+		int err;
 
-	/* data compression unimplemented yet */
-	return -ENOTSUPP;
+		err = erofs_map_blocks_iter(inode, map, &mpage, flags);
+		if (mpage != NULL)
+			put_page(mpage);
+		return err;
+	}
+	return erofs_map_blocks_flatmode(inode, map, flags);
 }
 
 static inline struct bio *erofs_read_raw_page(
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 3f3745f..c8e5703 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -73,6 +73,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
new file mode 100644
index 0000000..96fd1114
--- /dev/null
+++ b/fs/erofs/unzip_vle.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	EROFS_VLE_DI_CLUSTER_TYPE_BIT, EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	EROFS_VLE_CLUSTER_TYPE_HEAD,
+	EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct erofs_decompressed_index_vle *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, di->di_advise, vle_cluster_type(di),
+		di->di_clusterofs, di->di_u.blkaddr);
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return di->di_clusterofs;
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct erofs_decompressed_index_vle);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a number of logical data into
+ * a physical cluster with a fixed size.
+ * VLE compression mode uses "struct erofs_decompressed_index_vle".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct erofs_decompressed_index_vle *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = (unsigned long long)lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct erofs_decompressed_index_vle *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size - 1;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* [walking mode] 'map' has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto unmap_out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto exact_hitted;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = (lcn-- * clustersize) | logical_cluster_ofs;
+	case EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+exact_hitted:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+unmap_out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+out:
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 04/11] erofs: add erofs_allocpage
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (2 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 03/11] erofs: add erofs_map_blocks_iter Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 05/11] erofs: globalize prepare_bio and __submit_bio Gao Xiang
                     ` (6 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


This patch introduces a temporary _on-stack_ page pool to
reuse freed pages directly as much as possible for better
performance and to release all pages at once. It also
slightly reduces the possibility of memory allocation
failure.
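
A rough sketch (not part of this patch) of how a caller is expected to
use the pool together with the lru_to_page() helper added in staging.h:

static void foo_use_pool(void)
{
	LIST_HEAD(pagepool);		/* the temporary on-stack pool */
	struct page *page;

	page = erofs_allocpage(&pagepool, GFP_KERNEL);
	/* ... use 'page'; to recycle it later, put it back instead: */
	list_add(&page->lru, &pagepool);

	/* finally release all pooled pages at once */
	while (!list_empty(&pagepool)) {
		page = lru_to_page(&pagepool);
		list_del(&page->lru);
		__free_page(page);
	}
}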

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Makefile   |  2 +-
 fs/erofs/internal.h |  3 +++
 fs/erofs/staging.h  |  4 ++++
 fs/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 0b3db0a..d717775 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -3,7 +3,7 @@ EROFS_VERSION = "1.0"
 EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c8e5703..b19fd78 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -382,5 +382,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..dce5177
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 05/11] erofs: globalize prepare_bio and __submit_bio
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (3 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 04/11] erofs: add erofs_allocpage Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 06/11] erofs: add a generic z_erofs VLE decompressor Gao Xiang
                     ` (5 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


The unzip subsystem also uses these functions,
so let's move them into internal.h.
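
A rough sketch (not part of this patch) of how a caller such as the
unzip subsystem is expected to use the now-shared helpers; 'foo_endio'
is a placeholder completion callback:

static void foo_read_block(struct super_block *sb, erofs_blk_t blkaddr,
			   struct page *page, bio_end_io_t endio)
{
	struct bio *bio = prepare_bio(sb, blkaddr, 1, endio);

	/* a single page always fits into a freshly allocated bio */
	if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
		BUG();

	__submit_bio(bio, REQ_OP_READ, 0);
}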

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/data.c     | 41 +++++++++--------------------------------
 fs/erofs/internal.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index faaec37..825424b 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -47,33 +47,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -96,7 +69,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -268,6 +241,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -285,6 +260,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -292,8 +270,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -302,7 +279,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -326,7 +303,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b19fd78..6ed2ea3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -292,6 +292,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 06/11] erofs: add a generic z_erofs VLE decompressor
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (4 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 05/11] erofs: globalize prepare_bio and __submit_bio Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 07/11] erofs: introduce superblock registration Gao Xiang
                     ` (4 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


Currently, this patch only implements the LZ4 decompressor
due to its development priority.

In the future, erofs will support more compression
algorithms and formats other than LZ4, thus a generic
decompressor interface will be needed.
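
A rough sketch (not part of this patch) of calling the non-overlapped
vmap decompression path for a single cluster:

static int foo_decompress_cluster(struct page **compressed_pages,
				  unsigned int clusterpages,
				  void *out, unsigned int outlen)
{
	/* 'false': the output buffer does not overlap the compressed pages */
	return z_erofs_vle_unzip_vmap(compressed_pages, clusterpages,
				      out, outlen, 0, false);
}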

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig         |  14 ++++
 fs/erofs/Makefile        |   2 +-
 fs/erofs/internal.h      |   5 ++
 fs/erofs/unzip_vle.h     |  35 ++++++++
 fs/erofs/unzip_vle_lz4.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 fs/erofs/unzip_vle.h
 create mode 100644 fs/erofs/unzip_vle_lz4.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index ffbd5eb..00e811c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -85,3 +85,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates VLE compressed pages hard limit of a
+	  compressed cluster.
+
+	  For example, if the files of an image are compressed
+	  in 8k units, the hard limit should not be less
+	  than 2. Otherwise, the image cannot be mounted
+	  correctly on this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index d717775..fa9d179 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,5 +5,5 @@ EXTRA_CFLAGS += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o unzip_lz4.o
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6ed2ea3..2d1df84 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -162,6 +162,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
new file mode 100644
index 0000000..8e23e44
--- /dev/null
+++ b/fs/erofs/unzip_vle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/fs/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned outlen, unsigned short pageofs,
+	void (*endio)(struct page *));
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..fda8e6d
--- /dev/null
+++ b/fs/erofs/unzip_vle_lz4.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	j = 0;
+	for(i = 0; i < nr_pages; j = i++) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL) {
+				if (!mirrored[j])
+					kunmap_atomic(src);
+				src = NULL;
+			}
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(mirrored[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			mirrored[j] = true;
+			break;
+		}
+
+		if (i) {
+			if (src == NULL)
+				src = mirrored[i-1] ?
+					percpu_data + (i-1) * PAGE_SIZE :
+					kmap_atomic(compressed_pages[i-1]);
+
+			memcpy(dst, src + righthalf, pageofs);
+
+			if (!mirrored[i-1])
+				kunmap_atomic(src);
+
+			if (unlikely(i >= clusterpages)) {
+				kunmap_atomic(dst);
+				break;
+			}
+		}
+
+		if (!righthalf)
+			src = NULL;
+		else {
+			src = mirrored[i] ? percpu_data + i * PAGE_SIZE :
+				kmap_atomic(compressed_pages[i]);
+
+			memcpy(dst + pageofs, src, righthalf);
+		}
+
+		kunmap_atomic(dst);
+	}
+
+	if (src != NULL && !mirrored[j])
+		kunmap_atomic(src);
+
+	preempt_enable();
+	return 0;
+}
+
+extern int erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned outlen,
+				  unsigned short pageofs,
+				  void (*endio)(struct page *))
+{
+	void *vin, *vout;
+	unsigned nr_pages, i, j;
+	int ret;
+
+	if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE)
+		return -ENOTSUPP;
+
+	nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE);
+
+	if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else
+		vin = erofs_vmap(compressed_pages, clusterpages);
+
+	preempt_disable();
+	vout = erofs_pcpubuf[smp_processor_id()].data;
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, outlen);
+
+	if (ret >= 0) {
+		outlen = ret;
+		ret = 0;
+	}
+
+	for(i = 0; i < nr_pages; ++i) {
+		j = min((unsigned)PAGE_SIZE - pageofs, outlen);
+
+		if (pages[i] != NULL) {
+			if (ret < 0)
+				SetPageError(pages[i]);
+			else if (clusterpages == 1 && pages[i] == compressed_pages[0])
+				memcpy(vin + pageofs, vout + pageofs, j);
+			else {
+				void *dst = kmap_atomic(pages[i]);
+
+				memcpy(dst + pageofs, vout + pageofs, j);
+				kunmap_atomic(dst);
+			}
+			endio(pages[i]);
+		}
+		vout += PAGE_SIZE;
+		outlen -= j;
+		pageofs = 0;
+	}
+	preempt_enable();
+
+	if (clusterpages == 1)
+		kunmap_atomic(vin);
+	else
+		erofs_vunmap(vin, clusterpages);
+
+	return ret;
+}
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 07/11] erofs: introduce superblock registration
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (5 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 06/11] erofs: add a generic z_erofs VLE decompressor Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 08/11] erofs: introduce erofs shrinker Gao Xiang
                     ` (3 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


In order to introduce a shrinking solution for erofs,
let's first keep track of all mounted erofs instances.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  6 ++++++
 fs/erofs/super.c    |  4 ++++
 fs/erofs/utils.c    | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2d1df84..06cf508 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -65,6 +65,9 @@ struct erofs_fault_info {
 typedef u64 erofs_nid_t;
 
 struct erofs_sb_info {
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -431,5 +434,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 /* utils.c */
 extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
+extern void erofs_register_super(struct super_block *sb);
+extern void erofs_unregister_super(struct super_block *sb);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ab0e4cd..6d51ec5 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -317,6 +317,8 @@ static int erofs_read_super(struct super_block *sb,
 	snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name);
 	sbi->dev_name[PATH_MAX - 1] = '\0';
 
+	erofs_register_super(sb);
+
 	/*
 	 * We already have a positive dentry, which was instantiated
 	 * by d_make_root. Just need to d_rehash it.
@@ -354,6 +356,8 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	erofs_unregister_super(sb);
+
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dce5177..78731c5 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,3 +29,20 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+static DEFINE_MUTEX(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_register_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_unregister_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_del(&EROFS_SB(sb)->list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 08/11] erofs: introduce erofs shrinker
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (6 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 07/11] erofs: introduce superblock registration Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 09/11] erofs: introduce workstation for decompression Gao Xiang
                     ` (2 subsequent siblings)
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


This patch adds a dedicated shrinker aiming to free unneeded
memory consumed by a number of erofs in-memory data structures.

Like F2FS and UBIFS, it also adds:
  - sbi->umount_mutex to avoid races between the shrinker and put_super
  - sbi->shrinker_run_no to avoid revisiting recently scanned objects
    (see the per-superblock hook sketch below)
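
A rough sketch (not part of this patch) of the shape of a per-superblock
scan handler that plugs in at the '/* add scan handlers here */' point;
it runs with sbi->umount_mutex held and erofs_sb_list_lock dropped. A
later patch in this series fills this hook with erofs_shrink_workstation():

static unsigned long foo_shrink_one_sb(struct erofs_sb_info *sbi,
				       unsigned long nr_to_scan)
{
	unsigned long freed = 0;

	/* walk sbi-owned reclaimable objects, free up to nr_to_scan
	 * of them, and report how many were actually released */
	return freed;
}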

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h |  7 +++++
 fs/erofs/super.c    | 15 ++++++++++
 fs/erofs/utils.c    | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 06cf508..444492b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -67,6 +67,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -94,6 +95,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -437,5 +439,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 6d51ec5..e0ceb5c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -356,7 +356,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -398,6 +400,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -417,6 +425,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -425,6 +437,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -433,6 +447,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
 }
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 78731c5..685e885 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,20 +29,93 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
-	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 09/11] erofs: introduce workstation for decompression
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (7 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 08/11] erofs: introduce erofs shrinker Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 10/11] erofs: introduce VLE decompression support Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 11/11] erofs: introduce cached decompression Gao Xiang
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


This patch introduces another concept used by the unzip
subsystem called the 'workstation'. It can be seen as a
sparse array that stores pointers to data structures
related to the corresponding physical blocks.

All lookups are protected by the RCU read lock. Besides,
a reference count and a spinlock are also introduced to
manage its lifetime and to serialize all update operations.

The 'workstation' is currently implemented on top of the
in-kernel radix tree for backward compatibility.
As the Linux kernel evolves, it could be migrated to the
XArray implementation in the future.
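
A rough sketch (not part of this patch) of the expected lookup-or-register
pattern; 'struct foo_workgroup' embedding erofs_workgroup is a made-up
example:

struct foo_workgroup {
	struct erofs_workgroup obj;
	/* ... private per-physical-block state ... */
};

static struct foo_workgroup *foo_find_or_register(struct super_block *sb,
						  pgoff_t index)
{
	struct erofs_workgroup *grp;
	struct foo_workgroup *fgrp;
	bool tag;

	/* RCU-protected lookup; a reference is taken on success */
	grp = erofs_find_workgroup(sb, index, &tag);
	if (grp != NULL)
		return container_of(grp, struct foo_workgroup, obj);

	fgrp = kmalloc(sizeof(*fgrp), GFP_NOFS);
	if (fgrp == NULL)
		return NULL;

	fgrp->obj.index = index;
	atomic_set(&fgrp->obj.refcount, 1);
	if (erofs_register_workgroup(sb, &fgrp->obj, false)) {
		/* lost a race or insertion failed; the caller may retry */
		kfree(fgrp);
		return NULL;
	}
	return fgrp;
}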

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/internal.h | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/super.c    |  12 ++++++
 fs/erofs/utils.c    |  81 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 444492b..5be7dea 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -80,6 +80,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* the dedicated workstation for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} workstn;
 #endif
 
 	u32 build_time_nsec;
@@ -150,6 +158,101 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define erofs_workstn_lock(sbi)         spin_lock(&(sbi)->workstn.lock)
+#define erofs_workstn_unlock(sbi)       spin_unlock(&(sbi)->workstn.lock)
+#else
+#define erofs_workstn_lock(sbi)         xa_lock(&(sbi)->workstn.tree)
+#define erofs_workstn_unlock(sbi)       xa_unlock(&(sbi)->workstn.tree)
+#endif
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+#define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
+
+static inline bool erofs_workgroup_try_to_freeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	if (v != atomic_cmpxchg(&grp->refcount,
+		v, EROFS_LOCKED_MAGIC))
+		return false;
+	preempt_disable();
+#else
+	preempt_disable();
+	if (atomic_read(&grp->refcount) != v) {
+		preempt_enable();
+		return false;
+	}
+#endif
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	atomic_set(&grp->refcount, v);
+#endif
+	preempt_enable();
+}
+
+static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
+{
+	const int locked = (int)EROFS_LOCKED_MAGIC;
+	int o;
+
+repeat:
+	o = atomic_read(&grp->refcount);
+
+	/* spin if it is temporarily locked at the reclaim path */
+	if (unlikely(o == locked)) {
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+		do
+			cpu_relax();
+		while (atomic_read(&grp->refcount) == locked);
+#endif
+		goto repeat;
+	}
+
+	if (unlikely(o <= 0))
+		return -1;
+
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+
+	*ocnt = o;
+	return 0;
+}
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+
+extern int erofs_workgroup_put(struct erofs_workgroup *grp);
+
+extern struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag);
+
+extern int erofs_register_workgroup(struct super_block *sb,
+	struct erofs_workgroup *grp, bool tag);
+
+extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+	unsigned long nr_shrink, bool cleanup);
+
+static inline void erofs_workstation_cleanup_all(struct super_block *sb)
+{
+	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
+}
+
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index e0ceb5c..9a465bd 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -287,6 +287,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->workstn.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -357,6 +364,11 @@ static void erofs_put_super(struct super_block *sb)
 	__putname(sbi->dev_name);
 
 	mutex_lock(&sbi->umount_mutex);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	erofs_workstation_cleanup_all(sb);
+#endif
+
 	erofs_unregister_super(sb);
 	mutex_unlock(&sbi->umount_mutex);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 685e885..ab37072 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -29,6 +29,83 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+
+/* radix_tree and the future XArray both don't use tagptr_t yet */
+struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+	int oldcount;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn.tree, index);
+	if (grp != NULL) {
+		*tag = radix_tree_exceptional_entry(grp);
+		grp = (void *)((unsigned long)grp &
+			~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		if (erofs_workgroup_get(grp, &oldcount)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		/* decrease refcount added by erofs_workgroup_put */
+		if (unlikely(oldcount == 1))
+			atomic_long_dec(&erofs_global_shrink_cnt);
+		BUG_ON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp->refcount should not be less than 1 */
+	BUG_ON(!atomic_read(&grp->refcount));
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	erofs_workstn_lock(sbi);
+
+	if (tag)
+		grp = (void *)((unsigned long)grp |
+			1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+	err = radix_tree_insert(&sbi->workstn.tree,
+		grp->index, grp);
+
+	if (!err) {
+		__erofs_workgroup_get(grp);
+	}
+
+	erofs_workstn_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+				       unsigned long nr_shrink,
+				       bool cleanup)
+{
+	return 0;
+}
+
+#endif
 
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -37,9 +114,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
 void erofs_register_super(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -112,6 +186,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		list_move_tail(&sbi->list, &erofs_sb_list);
 		mutex_unlock(&sbi->umount_mutex);
 
+		freed += erofs_shrink_workstation(sbi, nr, false);
 		if (freed >= nr)
 			break;
 	}
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 10/11] erofs: introduce VLE decompression support
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (8 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 09/11] erofs: introduce workstation for decompression Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  2018-07-20 16:55     ` [RFC PATCH v3 " Gao Xiang
  2018-07-20  2:52   ` [RFC PATCH v2 11/11] erofs: introduce cached decompression Gao Xiang
  10 siblings, 1 reply; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


This patch introduces the basic in-place VLE decompression
implementation for the erofs file system.

Compared with fixed-sized input compression, it implements
what we call 'variable-length extent compression', which
specifies the same output size for each compression block
in order to make full use of the IO bandwidth (meaning almost
all data read from the block device can be used directly for
decompression), to improve real random read performance
(rather than faking it via data caching, which costs more
memory), and to keep relatively low compression ratios (it
saves more storage space than fixed-sized input compression
configured with the same input block size), as illustrated
below:

        |---  variable-length extent ---|------ VLE ------|---  VLE ---|
         /> clusterofs                  /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
         \                             /                 /            /
          \                      /              /            /
           \               /            /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size
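
To make the geometry above concrete, here is a tiny standalone
userspace sketch (not kernel code; the values and names are local to
the example): each compressed cluster always occupies 1 << clusterbits
bytes of physical space, i.e. erofs_clusterpages() pages, while the
amount of decompressed data it carries varies per extent. 'pageofs'
is the same quantity the work builder later derives from map->m_la.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	const unsigned clusterbits = 12;	/* fixed 4KB compressed clusters */
	const unsigned clusterpages = (1U << clusterbits) / PAGE_SIZE;

	/* an example extent: its decompressed data begins at logical
	 * address m_la; pageofs is the offset inside the first file page */
	unsigned long long m_la = 0x5c00;
	unsigned pageofs = m_la & (PAGE_SIZE - 1);

	printf("compressed cluster: %u page(s); decompressed data starts "
	       "at page %llu, offset %u\n",
	       clusterpages, m_la >> PAGE_SHIFT, pageofs);
	return 0;
}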

The main point of 'in-place' refers to the decompression mode:
instead of allocating independent compressed pages and data
structures, it reuses the already-allocated file cache pages as
much as possible to store its compressed data and the
corresponding pagevec in a time-sharing manner by default, which
is useful for low-memory scenarios.
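
A minimal userspace analogue of that time-sharing idea (purely
illustrative; the in-kernel counterpart is
try_to_reuse_as_compressed_page() below, which performs the same slot
donation with cmpxchg() on the builder's compressed_pages[]): a file
page is first offered as storage for compressed data, and only when
every compressed slot is already taken does the caller fall back to
queueing it in the per-work pagevec.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct page;				/* opaque in this sketch */
typedef _Atomic(struct page *) page_slot_t;

/*
 * Offer 'page' as backing storage for compressed data; return true if a
 * still-empty compressed slot accepted it, false if all slots are taken
 * (the caller would then enqueue the page into the pagevec instead).
 */
static bool reuse_as_compressed_page(page_slot_t *slots, unsigned *deficit,
				     struct page *page)
{
	while (*deficit) {
		struct page *expected = NULL;

		--*deficit;
		if (atomic_compare_exchange_strong(slots++, &expected, page))
			return true;
	}
	return false;
}

int main(void)
{
	page_slot_t slots[2] = { NULL, NULL };
	unsigned deficit = 2;
	struct page *fake = (struct page *)0x1;	/* placeholder pointer */

	/* the first empty slot accepts the donated page */
	return !reuse_as_compressed_page(slots, &deficit, fake);
}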

Finally, unlike other filesystems with (de)compression support
that use a relatively large compression block size and therefore
read and decompress >= 128KB at once, which only makes random read
look good (in fact such filesystems turn small random reads into
large sequential reads and cache all the decompressed data in
memory, which is unacceptable for embedded devices with limited
memory and is not real random read), we select a universally small
4KB compressed cluster, the smallest page size for most
architectures, so that every compressed cluster can be read and
decompressed independently, which keeps the amount of data read
per random access small for all use cases.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/inode.c     |    6 +-
 fs/erofs/internal.h  |    6 +
 fs/erofs/staging.h   |   46 +++
 fs/erofs/super.c     |   26 ++
 fs/erofs/unzip_vle.c | 1120 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h |  184 +++++++++
 fs/erofs/utils.c     |   61 ++-
 7 files changed, 1446 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5be7dea..fd444ec 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -273,6 +273,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -357,6 +360,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..47c9708d 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,49 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+
+	return kvmalloc(n * size, flags);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9a465bd..7e5333c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -426,6 +433,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -441,6 +454,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -449,6 +468,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -459,6 +482,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 96fd1114..c113740 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,1125 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_vle_work_role {
+	Z_EROFS_VLE_WORK_SECONDARY,
+	Z_EROFS_VLE_WORK_PRIMARY,
+	/*
+	 * The current work has at least been linked with the following
+	 * processed chained works, which means if the processing page
+	 * is the tail partial page of the work, the current work can
+	 * safely use the whole page, as illustrated below:
+	 * +--------------+-------------------------------------------+
+	 * |  tail page   |      head page (of the previous work)     |
+	 * +--------------+-------------------------------------------+
+	 *   /\  which belongs to the current work
+	 * [  (*) this page can be used for the current work itself.  ]
+	 */
+	Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
+	Z_EROFS_VLE_WORK_MAX
+};
+
+struct z_erofs_vle_work_builder {
+	enum z_erofs_vle_work_role role;
+	/*
+	 * 'hosted = false' means that the current workgroup doesn't belong to
+	 * the owned chained workgroups. In other words, it is none of our
+	 * business to submit this workgroup.
+	 */
+	bool hosted;
+
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+#define VLE_WORK_BUILDER_INIT()	\
+	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_builder *b,
+	struct page *page)
+{
+	while (b->compressed_deficit) {
+		--b->compressed_deficit;
+		if (NULL == cmpxchg(b->compressed_pages++, NULL, page))
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_builder *builder,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY &&
+		type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(builder, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&builder->vector,
+		page, type, &occupied);
+	builder->work->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline bool try_to_claim_workgroup(
+	struct z_erofs_vle_workgroup *grp,
+	z_erofs_vle_owned_workgrp_t *owned_head,
+	bool *hosted)
+{
+	DBG_BUGON(*hosted == true);
+
+	/* let's claim these following types of workgroup */
+retry:
+	if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) {
+		/* type 1, nil workgroup */
+		if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_NIL, *owned_head))
+			goto retry;
+
+		*owned_head = grp;
+		*hosted = true;
+	} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
+		/*
+		 * type 2, link to the end of an existing open chain,
+		 * be careful that its submission itself is governed
+		 * by the original owned chain.
+		 */
+		if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_TAIL, *owned_head))
+			goto retry;
+
+		*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the followee :) */
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_lookup(struct super_block *sb,
+			pgoff_t idx, unsigned pageofs,
+			struct z_erofs_vle_workgroup **grp_ret,
+			enum z_erofs_vle_work_role *role,
+			z_erofs_vle_owned_workgrp_t *owned_head,
+			bool *hosted)
+{
+	bool tag, primary;
+	struct erofs_workgroup *egrp;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	egrp = erofs_find_workgroup(sb, idx, &tag);
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_work(grp, pageofs);
+	primary = true;
+#else
+	BUG();
+#endif
+
+	DBG_BUGON(work->pageofs != pageofs);
+
+	/*
+	 * lock must be taken first to avoid grp->next == NIL between
+	 * claiming workgroup and adding pages:
+	 *                        grp->next != NIL
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                        mutex_lock(&work->lock)
+	 *                        add all pages to pagevec
+	 *
+	 * [correct locking case 1]:
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[b])     mutex_lock(grp->work[c])
+	 *   ...                          *role = SECONDARY
+	 *                                add all pages to pagevec
+	 *                                ...
+	 *                                mutex_unlock(grp->work[c])
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *
+	 * [correct locking case 2]:
+	 *   mutex_lock(grp->work[b])
+	 *   ...
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                                mutex_lock(grp->work[a])
+	 *                                *role = PRIMARY_OWNER
+	 *                                add all pages to pagevec
+	 *                                ...
+	 */
+	mutex_lock(&work->lock);
+
+	*hosted = false;
+	if (!primary)
+		*role = Z_EROFS_VLE_WORK_SECONDARY;
+	/* claim the workgroup if possible */
+	else if (try_to_claim_workgroup(grp, owned_head, hosted))
+		*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	else
+		*role = Z_EROFS_VLE_WORK_PRIMARY;
+
+	return work;
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup **grp_ret,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  enum z_erofs_vle_work_role *role,
+			  z_erofs_vle_owned_workgrp_t *owned_head,
+			  bool *hosted)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_workgroup *grp = *grp_ret;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_workgrp_fmt(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_VLE_WORKGRP_FMT_LZ4 :
+			Z_EROFS_VLE_WORKGRP_FMT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	/* new workgrps have been claimed as type 1 */
+	WRITE_ONCE(grp->next, *owned_head);
+	/* primary and followed work for all new workgrps */
+	*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	/* it should be submitted by ourselves */
+	*hosted = true;
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently unimplemented */
+	BUG();
+#else
+	work = z_erofs_vle_grab_primary_work(grp);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, 0);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = *grp_ret = grp;
+
+	mutex_lock(&work->lock);
+	return work;
+}
+
+static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
+					 unsigned int llen)
+{
+	while (1) {
+		unsigned int orig_llen = grp->llen;
+
+		if (orig_llen >= llen || orig_llen ==
+			cmpxchg(&grp->llen, orig_llen, llen))
+			break;
+	}
+}
+
+#define builder_is_followed(builder) \
+	((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	struct z_erofs_vle_workgroup *grp;
+	erofs_blk_t index = erofs_blknr(map->m_pa);
+	struct z_erofs_vle_work *work;
+	unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	unsigned pageofs = map->m_la & ~PAGE_MASK;
+
+	DBG_BUGON(builder->work != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	DBG_BUGON(erofs_blkoff(map->m_pa));
+
+repeat:
+	work = z_erofs_vle_work_lookup(sb, index,
+		pageofs, &grp, &builder->role, owned_head, &builder->hosted);
+	if (work != NULL) {
+		__update_workgrp_llen(grp, map->m_llen);
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, &grp, map, index, pageofs,
+		&builder->role, owned_head, &builder->hosted);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+got_it:
+	z_erofs_pagevec_ctor_init(&builder->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) {
+		/* enable possibly in-place decompression */
+		builder->compressed_pages = grp->compressed_pages;
+		builder->compressed_deficit = clusterpages;
+	} else {
+		builder->compressed_pages = NULL;
+		builder->compressed_deficit = 0;
+	}
+
+	builder->grp = grp;
+	builder->work = work;
+	return 0;
+}
+
+/*
+ * keep in mind that a workgroup will only be freed after an RCU
+ * grace period, so rcu_read_lock() is sufficient to prevent a
+ * looked-up workgroup from being freed under us.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->work;
+
+	call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp,
+	struct z_erofs_vle_work *work __maybe_unused)
+{
+	erofs_workgroup_put(&grp->obj);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	__z_erofs_vle_work_release(grp, work);
+}
+
+static inline bool
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder)
+{
+	struct z_erofs_vle_work *work = builder->work;
+
+	if (work == NULL)
+		return false;
+
+	z_erofs_pagevec_ctor_exit(&builder->vector, false);
+	mutex_unlock(&work->lock);
+
+	/*
+	 * if all pending pages are added, don't hold work reference
+	 * any longer if the current work isn't hosted by ourselves.
+	 */
+	if (!builder->hosted)
+		__z_erofs_vle_work_release(builder->grp, work);
+
+	builder->work = NULL;
+	builder->grp = NULL;
+	return true;
+}
+
+struct z_erofs_vle_frontend {
+	struct inode *const inode;
+
+	struct z_erofs_vle_work_builder builder;
+	struct erofs_map_blocks_iter m_iter;
+
+	z_erofs_vle_owned_workgrp_t owned_head;
+
+	bool initial;
+};
+
+#define VLE_FRONTEND_INIT(__i) { \
+	.inode = __i, \
+	.m_iter = { \
+		{ .m_llen = 0, .m_plen = 0 }, \
+		.mpage = NULL \
+	}, \
+	.builder = VLE_WORK_BUILDER_INIT(), \
+	.owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \
+	.initial = true, }
+
+static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
+				struct page *page,
+				struct list_head *page_pool)
+{
+	struct super_block *const sb = fe->inode->i_sb;
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	struct erofs_map_blocks_iter *const m = &fe->m_iter;
+	struct erofs_map_blocks *const map = &m->map;
+	struct z_erofs_vle_work_builder *const builder = &fe->builder;
+	const loff_t offset = page_offset(page);
+
+	bool tight = builder_is_followed(builder);
+	struct z_erofs_vle_work *work = builder->work;
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= map->m_la &&
+            offset + cur < map->m_la + map->m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (!z_erofs_vle_work_iter_end(builder))
+		fe->initial = false;
+
+	map->m_la = offset + cur;
+	map->m_llen = 0;
+	err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(map->m_plen != 1 << sbi->clusterbits);
+	BUG_ON(erofs_blkoff(map->m_pa));
+
+	err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	tight &= builder_is_followed(builder);
+	work = builder->work;
+hitted:
+	cur = end - min_t(unsigned, offset + end - map->m_la, end);
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(builder, page, page_type);
+	/* should allocate an additional page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *newpage;
+
+		newpage = erofs_allocpage(page_pool, GFP_KERNEL);
+		newpage->mapping = NULL;
+
+		err = z_erofs_vle_work_add_page(builder,
+			newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - map->m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	map->m_llen = offset + cur - map->m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, map->m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool background = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (background)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+	unsigned i;
+	struct bio_vec *bvec;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		bool cachedpage = false;
+
+		DBG_BUGON(PageUptodate(page));
+
+		if (unlikely(err))
+			SetPageError(page);
+		else if (cachedpage)
+			SetPageUptodate(page);
+
+		if (cachedpage)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_workgroup *grp,
+	struct list_head *page_pool)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	unsigned clusterpages = erofs_clusterpages(sbi);
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	unsigned sparsemem_pages = 0;
+#endif
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_work *work;
+	void *vout;
+	int err;
+
+	might_sleep();
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_primary_work(grp);
+#else
+	BUG();
+#endif
+	BUG_ON(!READ_ONCE(work->nr_pages));
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc_array(nr_pages,
+			sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+		BUG_ON(!page);
+
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	compressed_pages = grp->compressed_pages;
+
+	for (i = 0; i < clusterpages; ++i) {
+		unsigned pagenr;
+
+		BUG_ON(compressed_pages[i] == NULL);
+		page = compressed_pages[i];
+
+		if (page->mapping == NULL)
+			continue;
+
+		pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+
+		overlapped = true;
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) {
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs,
+		z_erofs_onlinepage_endio);
+	if (err != -ENOTSUPP)
+		goto out_percpu;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (sparsemem_pages >= nr_pages) {
+		BUG_ON(sparsemem_pages > nr_pages);
+		goto skip_allocpage;
+	}
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = erofs_allocpage(page_pool, GFP_KERNEL);
+		pages[i]->mapping = NULL;
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL) {
+			list_add(&page->lru, page_pool);
+			continue;
+		}
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+out_percpu:
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual pages */
+		if (page->mapping == NULL)
+			list_add(&page->lru, page_pool);
+
+		WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	/* all work locks MUST be taken before the following line */
+
+	WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL);
+
+	/* all work locks SHOULD be released right now */
+	mutex_unlock(&work->lock);
+
+	z_erofs_vle_work_release(work);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	z_erofs_vle_owned_workgrp_t owned = io->head;
+
+	while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) {
+		struct z_erofs_vle_workgroup *grp;
+
+		/* 'owned' should never equal Z_EROFS_VLE_WORKGRP_TAIL here */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL);
+
+		/* 'owned' should never be Z_EROFS_VLE_WORKGRP_NIL (NULL) */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned;
+		owned = READ_ONCE(grp->next);
+
+		z_erofs_vle_unzip(sb, grp, page_pool);
+	}
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline struct z_erofs_vle_unzip_io *
+prepare_io_handler(struct super_block *sb,
+		   struct z_erofs_vle_unzip_io *io,
+		   bool background)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	if (!background) {
+		/* waitqueue available for foreground io */
+		BUG_ON(io == NULL);
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+		goto out;
+	}
+
+	if (io != NULL)
+		BUG();
+	else {
+		/* allocate extra io descriptor for background io */
+		iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+			GFP_KERNEL | __GFP_NOFAIL);
+		BUG_ON(iosb == NULL);
+
+		io = &iosb->io;
+	}
+
+	iosb->sb = sb;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+	io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED;
+	return io;
+}
+
+#define __FSIO_1 0
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_vle_owned_workgrp_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_vle_unzip_io *fg_io,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+	const gfp_t gfp = GFP_NOFS;
+	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
+	struct bio *bio;
+	tagptr1_t bi_private;
+	pgoff_t last_index;
+	bool force_submit = false;
+	unsigned nr_bios;
+
+	if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL))
+		return false;
+
+	/*
+	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
+         * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
+	 */
+	if (force_fg) {
+		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
+	} else {
+		ios[__FSIO_1] = prepare_io_handler(sb, NULL, true);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1);
+	}
+
+	nr_bios = 0;
+	force_submit = false;
+	bio = NULL;
+
+	/* by default, all need io submission */
+	ios[__FSIO_1]->head = owned_head;
+
+	do {
+		struct z_erofs_vle_workgroup *grp;
+		struct page **compressed_pages, *oldpage, *page;
+		pgoff_t first_index;
+		unsigned i = 0;
+		int err;
+
+		/* 'owned_head' should never equal either of the following */
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned_head;
+
+		/* close the main owned chain at first */
+		owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
+			Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+		first_index = grp->obj.index;
+		compressed_pages = grp->compressed_pages;
+
+		force_submit |= (first_index != last_index + 1);
+repeat:
+		/* fulfill all compressed pages */
+		oldpage = page = READ_ONCE(compressed_pages[i]);
+
+		if (page != NULL)
+			BUG_ON(PageUptodate(page));
+		else {
+			page = erofs_allocpage(pagepool, gfp);
+			page->mapping = NULL;
+
+			if (oldpage != cmpxchg(compressed_pages + i,
+				oldpage, page)) {
+				list_add(&page->lru, pagepool);
+				goto repeat;
+			}
+		}
+
+		if (bio != NULL && force_submit) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+
+		if (bio == NULL) {
+			bio = prepare_bio(sb, first_index + i,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(bi_private);
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+		if (++i < clusterpages)
+			goto repeat;
+	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	BUG_ON(!nr_bios);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct super_block *sb = f->inode->i_sb;
+	struct z_erofs_vle_unzip_io io[1 + __FSIO_1];
+
+	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
+		return;
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[__FSIO_1].u.wait,
+		!atomic_read(&io[__FSIO_1].pending_bios));
+
+	/* let's synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct z_erofs_vle_frontend f
+		= VLE_FRONTEND_INIT(page->mapping->host);
+	int err;
+	LIST_HEAD(pagepool);
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	z_erofs_submit_and_unzip(&f, &pagepool, true);
+out:
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct inode *const inode = mapping->host;
+
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	z_erofs_submit_and_unzip(&f, &pagepool, sync);
+
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 8e23e44..9630022 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,193 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;
+
+	atomic_t refcount;
+#endif
+	struct mutex lock;
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
+
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	struct z_erofs_vle_work work;
+
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
+
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+	unsigned int llen, flags;
+};
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io yet (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
+
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
+
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK);
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is unimplemented yet
+#else
+
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+
+#endif
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_vle_owned_workgrp_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ab37072..dd1ce5f 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
 
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+	}
+	return count;
+}
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		first_index = grp->index + 1;
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
 }
 
 #endif
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v2 11/11] erofs: introduce cached decompression
  2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
                     ` (9 preceding siblings ...)
  2018-07-20  2:52   ` [RFC PATCH v2 10/11] erofs: introduce VLE decompression support Gao Xiang
@ 2018-07-20  2:52   ` Gao Xiang
  10 siblings, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20  2:52 UTC (permalink / raw)


This patch adds an optional mode which users can enable to
cache the compressed clusters at both incomplete ends of a
request, as a complement to in-place decompression, in order
to boost random read further. It costs more memory than
in-place decompression alone.
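
Roughly, the reservation policy introduced here looks like the
following simplified sketch (the helper name is hypothetical and the
constants only mirror the Kconfig cache levels; the real decision is
made in z_erofs_do_read_page() through grab_managed_cache_pages()):

#include <stdbool.h>
#include <stdio.h>

/* cache levels corresponding to the Kconfig choice in this patch */
enum {
	EROFS_CACHE_NONE	= 0,	/* EROFS_FS_ZIP_NO_CACHE */
	EROFS_CACHE_UNIPOLAR	= 1,	/* EROFS_FS_ZIP_CACHE_UNIPOLAR */
	EROFS_CACHE_BIPOLAR	= 2,	/* EROFS_FS_ZIP_CACHE_BIPOLAR */
};

/*
 * Should new managed-cache pages be reserved for this compressed
 * cluster?  'initial' means it is the first cluster touched by the
 * request; 'cachedzone_la' is the logical address of the page that
 * triggered the read (only consulted at the bipolar level).
 */
static bool want_reserve_cache(int cache_lvl, bool initial,
			       unsigned long long m_la,
			       unsigned long long cachedzone_la)
{
	if (cache_lvl == EROFS_CACHE_NONE)
		return false;		/* no managed cache at all */
	if (initial)			/* one (incomplete) end of the request */
		return true;
	/* the other end is only reserved in bipolar mode */
	return cache_lvl >= EROFS_CACHE_BIPOLAR && m_la <= cachedzone_la;
}

int main(void)
{
	/* the first cluster of a request is reserved at any level >= 1 */
	printf("%d\n", want_reserve_cache(EROFS_CACHE_UNIPOLAR, true, 0, 0));
	/* a non-initial cluster is only reserved in bipolar mode */
	printf("%d\n", want_reserve_cache(EROFS_CACHE_BIPOLAR, false, 4096, 8192));
	return 0;
}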

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 fs/erofs/Kconfig     |  38 ++++++++
 fs/erofs/internal.h  |  25 +++++
 fs/erofs/super.c     |  75 ++++++++++++++-
 fs/erofs/unzip_vle.c | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/utils.c     |  17 +++-
 5 files changed, 410 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 00e811c..d08c019 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -99,3 +99,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT
 	  than 2. Otherwise, the image cannot be mounted
 	  correctly on this kernel.
 
+choice
+	prompt "EROFS VLE Data Decompression mode"
+	depends on EROFS_FS_ZIP
+	default EROFS_FS_ZIP_CACHE_BIPOLAR
+	help
+	  EROFS supports three options for VLE decompression.
+	  "In-place Decompression Only" consumes the minimum memory
+	  with the lowest random read performance.
+
+	  "Bipolar Cached Decompression" consumes the maximum memory
+	  with the highest random read performance.
+
+	  If unsure, select "Bipolar Cached Decompression"
+
+config EROFS_FS_ZIP_NO_CACHE
+	bool "In-place Decompression Only"
+	help
+	  Read compressed data into page cache and do in-place
+	  decompression directly.
+
+config EROFS_FS_ZIP_CACHE_UNIPOLAR
+	bool "Unipolar Cached Decompression"
+	help
+	  For each request, it caches the last compressed page
+	  for further reading.
+	  It still decompresses the remaining compressed pages in place.
+
+config EROFS_FS_ZIP_CACHE_BIPOLAR
+	bool "Bipolar Cached Decompression"
+	help
+	  For each request, it caches the compressed pages at both ends
+	  for further reading.
+	  It still decompresses the remaining compressed pages in place.
+
+	  Recommended for performance priority.
+
+endchoice
+
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index fd444ec..5667f56 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,18 @@ struct erofs_fault_info {
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR
+#define EROFS_FS_ZIP_CACHE_LVL	(2)
+#elif defined(EROFS_FS_ZIP_CACHE_UNIPOLAR)
+#define EROFS_FS_ZIP_CACHE_LVL	(1)
+#else
+#define EROFS_FS_ZIP_CACHE_LVL	(0)
+#endif
+
+#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0))
+#define EROFS_FS_HAS_MANAGED_CACHE
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -88,6 +100,11 @@ struct erofs_sb_info {
 		spinlock_t lock;
 #endif
 	} workstn;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct inode *managed_cache;
+#endif
+
 #endif
 
 	u32 build_time_nsec;
@@ -251,6 +268,14 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+#define EROFS_UNALLOCATED_CACHED_PAGE	((void *)0x5F0EF00D)
+
+extern int try_to_free_cached_page(struct address_space *, struct page *);
+extern int try_to_free_all_cached_pages(struct erofs_sb_info *,
+	struct erofs_workgroup *);
+#endif
+
 #endif
 
 /* we strictly follow PAGE_SIZE and no buffer head */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 7e5333c..5a940c7 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -247,6 +247,63 @@ static int parse_options(struct super_block *sb, char *options)
 	return 0;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static const struct address_space_operations managed_cache_aops;
+
+static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	int ret = 1;	/* 0 - busy */
+	struct address_space *const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping->a_ops != &managed_cache_aops);
+
+	if (PagePrivate(page))
+		ret = try_to_free_cached_page(mapping, page);
+
+	return ret;
+}
+
+static void managed_cache_invalidatepage(struct page *page,
+	unsigned int offset, unsigned int length)
+{
+	const unsigned int stop = length + offset;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Check for overflow */
+	BUG_ON(stop > PAGE_SIZE || stop < length);
+
+	if (offset == 0 && stop == PAGE_SIZE)
+		while (!managed_cache_releasepage(page, GFP_NOFS))
+			cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+	.releasepage = managed_cache_releasepage,
+	.invalidatepage = managed_cache_invalidatepage,
+};
+
+struct inode *erofs_init_managed_cache(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (unlikely(inode == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &managed_cache_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+	                     GFP_NOFS | __GFP_HIGHMEM |
+	                     __GFP_MOVABLE |  __GFP_NOFAIL);
+	return inode;
+}
+
+#endif
+
 static int erofs_read_super(struct super_block *sb,
 	const char *dev_name, void *data, int silent)
 {
@@ -301,11 +358,19 @@ static int erofs_read_super(struct super_block *sb,
 #endif
 #endif
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	sbi->managed_cache = erofs_init_managed_cache(sb);
+	if (IS_ERR(sbi->managed_cache)) {
+		err = PTR_ERR(sbi->managed_cache);
+		goto err_sbi;
+	}
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		goto err_sbi;
+		goto iget_err;
 	}
 
 	if (!S_ISDIR(inode->i_mode)) {
@@ -348,6 +413,10 @@ static int erofs_read_super(struct super_block *sb,
 err_iput:
 	if (sb->s_root == NULL)
 		iput(inode);
+iget_err:
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
 err_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -370,6 +439,10 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
+
 	mutex_lock(&sbi->umount_mutex);
 
 #ifdef CONFIG_EROFS_FS_ZIP
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index c113740..63e27bd 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -95,6 +95,111 @@ struct z_erofs_vle_work_builder {
 #define VLE_WORK_BUILDER_INIT()	\
 	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static bool grab_managed_cache_pages(struct address_space *mapping,
+				     erofs_blk_t start,
+				     struct page **compressed_pages,
+				     int clusterblks,
+				     bool reserve_allocation)
+{
+	bool noio = true;
+	unsigned int i;
+
+	/* TODO: optimize by introducing find_get_pages_range */
+	for (i = 0; i < clusterblks; ++i) {
+		struct page *page, *found;
+
+		if (READ_ONCE(compressed_pages[i]) != NULL)
+			continue;
+
+		page = found = find_get_page(mapping, start + i);
+		if (found == NULL) {
+			noio = false;
+			if (!reserve_allocation)
+				continue;
+			page = EROFS_UNALLOCATED_CACHED_PAGE;
+		}
+
+		if (NULL == cmpxchg(compressed_pages + i, NULL, page))
+			continue;
+
+		if (found != NULL)
+			put_page(found);
+	}
+	return noio;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+				 struct erofs_workgroup *egrp)
+{
+	struct z_erofs_vle_workgroup *const grp =
+		container_of(egrp, struct z_erofs_vle_workgroup, obj);
+	struct address_space *const mapping = sbi->managed_cache->i_mapping;
+	const int clusterpages = erofs_clusterpages(sbi);
+	int i;
+
+	/*
+	 * the workgroup's refcount is now frozen at 1,
+	 * therefore no need to worry about available decompression users.
+	 */
+	for (i = 0; i < clusterpages; ++i) {
+		struct page *page = grp->compressed_pages[i];
+
+		if (page == NULL || page->mapping != mapping)
+			continue;
+
+		/* block other users from reclaiming or migrating the page */
+		if (!trylock_page(page))
+			return -EBUSY;
+
+		/* barrier is implied in the following 'unlock_page' */
+		WRITE_ONCE(grp->compressed_pages[i], NULL);
+
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+	return 0;
+}
+
+int try_to_free_cached_page(struct address_space *mapping, struct page *page)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_vle_workgroup *grp;
+	int ret = 0;	/* 0 - busy */
+
+	/* prevent the workgroup from being freed */
+	rcu_read_lock();
+	grp = (void *)page_private(page);
+
+	if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
+		unsigned i;
+
+		for (i = 0; i < clusterpages; ++i) {
+			if (grp->compressed_pages[i] == page) {
+				WRITE_ONCE(grp->compressed_pages[i], NULL);
+				ret = 1;
+				break;
+			}
+		}
+		erofs_workgroup_unfreeze(&grp->obj, 1);
+	}
+	rcu_read_unlock();
+
+	if (ret) {
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+	return ret;
+}
+#endif
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
 	struct z_erofs_vle_work_builder *b,
@@ -451,6 +556,9 @@ struct z_erofs_vle_frontend {
 	z_erofs_vle_owned_workgrp_t owned_head;
 
 	bool initial;
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	erofs_off_t cachedzone_la;
+#endif
 };
 
 #define VLE_FRONTEND_INIT(__i) { \
@@ -516,6 +624,26 @@ static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
 	if (unlikely(err))
 		goto err_out;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	else {
+		struct z_erofs_vle_workgroup *grp = fe->builder.grp;
+		struct address_space *mapping = sbi->managed_cache->i_mapping;
+
+		/* let's do out-of-order decompression for noio */
+		bool noio_outoforder = grab_managed_cache_pages(mapping,
+			erofs_blknr(map->m_pa),
+			grp->compressed_pages, erofs_blknr(map->m_plen),
+			fe->initial
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+			| (map->m_la <= fe->cachedzone_la)
+#endif
+		);
+
+		if (noio_outoforder && builder_is_followed(builder))
+			builder->role = Z_EROFS_VLE_WORK_PRIMARY;
+	}
+#endif
+
 	tight &= builder_is_followed(builder);
 	work = builder->work;
 hitted:
@@ -613,6 +741,15 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 
 		DBG_BUGON(PageUptodate(page));
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping != NULL) {
+			struct inode *inode = page->mapping->host;
+
+			cachedpage = (inode ==
+				EROFS_SB(inode->i_sb)->managed_cache);
+		}
+#endif
+
 		if (unlikely(err))
 			SetPageError(page);
 		else if (cachedpage)
@@ -726,6 +863,13 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
 		if (page->mapping == NULL)
 			continue;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping->host == sbi->managed_cache) {
+			BUG_ON(PageLocked(page));
+			BUG_ON(!PageUptodate(page));
+			continue;
+		}
+#endif
 
 		pagenr = z_erofs_onlinepage_index(page);
 
@@ -807,6 +951,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 		if (page->mapping == NULL)
 			list_add(&page->lru, page_pool);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		else if (page->mapping->host == sbi->managed_cache)
+			continue;
+#endif
 		WRITE_ONCE(compressed_pages[i], NULL);
 	}
 
@@ -898,7 +1046,32 @@ static void z_erofs_vle_unzip_wq(struct work_struct *work)
 	return io;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* true - unlocked (noio), false - locked (need submit io) */
+static inline bool recover_managed_page(
+	struct z_erofs_vle_workgroup *grp,
+	struct page *page)
+{
+	wait_on_page_locked(page);
+	if (PagePrivate(page) && PageUptodate(page))
+		return true;
+
+	lock_page(page);
+	if (unlikely(!PagePrivate(page))) {
+		set_page_private(page, (unsigned long)grp);
+		SetPagePrivate(page);
+	}
+	if (unlikely(PageUptodate(page))) {
+		unlock_page(page);
+		return true;
+	}
+	return false;
+}
+
+#define __FSIO_1 1
+#else
 #define __FSIO_1 0
+#endif
 
 static bool z_erofs_vle_submit_all(struct super_block *sb,
 				   z_erofs_vle_owned_workgrp_t owned_head,
@@ -909,6 +1082,11 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 	const unsigned clusterpages = erofs_clusterpages(sbi);
 	const gfp_t gfp = GFP_NOFS;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const managed_cache_mapping =
+		sbi->managed_cache->i_mapping;
+	struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL;
+#endif
 	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
 	struct bio *bio;
 	tagptr1_t bi_private;
@@ -923,6 +1101,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
          * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
 	 */
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	ios[0] = prepare_io_handler(sb, fg_io + 0, false);
+#endif
+
 	if (force_fg) {
 		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
 		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
@@ -943,6 +1125,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		struct page **compressed_pages, *oldpage, *page;
 		pgoff_t first_index;
 		unsigned i = 0;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		unsigned noio = 0;
+		bool cachemanaged;
+#endif
 		int err;
 
 		/* no possible 'owned_head' equals the following */
@@ -963,9 +1149,28 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		/* fulfill all compressed pages */
 		oldpage = page = READ_ONCE(compressed_pages[i]);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		cachemanaged = false;
+
+		if (page == EROFS_UNALLOCATED_CACHED_PAGE) {
+			cachemanaged = true;
+			goto do_allocpage;
+		} else if (page != NULL) {
+			if (page->mapping != managed_cache_mapping)
+				BUG_ON(PageUptodate(page));
+			else if (recover_managed_page(grp, page)) {
+				/* page is uptodate, skip io submission */
+				force_submit = true;
+				++noio;
+				goto skippage;
+			}
+		} else {
+do_allocpage:
+#else
 		if (page != NULL)
 			BUG_ON(PageUptodate(page));
 		else {
+#endif
 			page = erofs_allocpage(pagepool, gfp);
 			page->mapping = NULL;
 
@@ -973,6 +1178,12 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 				oldpage, page)) {
 				list_add(&page->lru, pagepool);
 				goto repeat;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+			} else if (cachemanaged && !add_to_page_cache_lru(page,
+				managed_cache_mapping, first_index + i, gfp)) {
+				set_page_private(page, (unsigned long)grp);
+				SetPagePrivate(page);
+#endif
 			}
 		}
 
@@ -996,14 +1207,51 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 
 		force_submit = false;
 		last_index = first_index + i;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skippage:
+#endif
 		if (++i < clusterpages)
 			goto repeat;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (noio < clusterpages)
+			lstgrp_io = grp;
+		else {
+			z_erofs_vle_owned_workgrp_t iogrp_next =
+				owned_head == Z_EROFS_VLE_WORKGRP_TAIL ?
+				Z_EROFS_VLE_WORKGRP_TAIL_CLOSED :
+				owned_head;
+
+			if (lstgrp_io == NULL)
+				ios[1]->head = iogrp_next;
+			else
+				WRITE_ONCE(lstgrp_io->next, iogrp_next);
+
+			if (lstgrp_noio == NULL)
+				ios[0]->head = grp;
+			else
+				WRITE_ONCE(lstgrp_noio->next, grp);
+
+			lstgrp_noio = grp;
+		}
+#endif
 	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
 
 	if (bio != NULL)
 		__submit_bio(bio, REQ_OP_READ, 0);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 	BUG_ON(!nr_bios);
+#else
+	if (lstgrp_noio != NULL)
+		WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	if (!force_fg && !nr_bios) {
+		kvfree(container_of(ios[1],
+			struct z_erofs_vle_unzip_io_sb, io));
+		return true;
+	}
+#endif
 
 	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
 	return true;
@@ -1019,6 +1267,9 @@ static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
 	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
 		return;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	z_erofs_vle_unzip_all(sb, &io[0], pagepool);
+#endif
 	if (!force_fg)
 		return;
 
@@ -1038,6 +1289,9 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
 	int err;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = page->index << PAGE_SHIFT;
+#endif
 	err = z_erofs_do_read_page(&f, page, &pagepool);
 	(void)z_erofs_vle_work_iter_end(&f.builder);
 
@@ -1068,6 +1322,9 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 	struct page *head = NULL;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT;
+#endif
 	for (; nr_pages; --nr_pages) {
 		struct page *page = lru_to_page(pages);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dd1ce5f..b669ca3 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -143,13 +143,28 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 		if (cleanup)
 			BUG_ON(cnt != 1);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 		else if (cnt > 1)
+#else
+		if (!erofs_workgroup_try_to_freeze(grp, 1))
+#endif
 			continue;
 
 		if (radix_tree_delete(&sbi->workstn.tree,
-			grp->index) != grp)
+			grp->index) != grp) {
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skip:
+			erofs_workgroup_unfreeze(grp, 1);
+#endif
 			continue;
+		}
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (try_to_free_all_cached_pages(sbi, grp))
+			goto skip;
+
+		erofs_workgroup_unfreeze(grp, 1);
+#endif
 		/* (rarely) grabbed again when freeing */
 		erofs_workgroup_put(grp);
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v3 10/11] erofs: introduce VLE decompression support
  2018-07-20  2:52   ` [RFC PATCH v2 10/11] erofs: introduce VLE decompression support Gao Xiang
@ 2018-07-20 16:55     ` Gao Xiang
  2018-07-20 16:55       ` [RFC PATCH v3 11/11] erofs: introduce cached decompression Gao Xiang
  2018-07-20 17:29       ` [RFC PATCH v3 RESEND 10/11] erofs: introduce VLE decompression support Gao Xiang
  0 siblings, 2 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20 16:55 UTC (permalink / raw)


This patch introduces the basic in-place VLE decompression
implementation for the erofs file system.

Compared with fixed-sized input compression, it implements
what we call 'variable-length extent compression', which
compresses a variable amount of original data into blocks of
the same output size. This makes full use of the IO bandwidth
(almost all data read from the block device can be fed directly
into decompression), improves real random read performance
(rather than faking it via data caching, which costs more
memory), and keeps relatively low compression ratios (it saves
more storage space than fixed-sized input compression configured
with the same input block size), as illustrated below:

        |---  variable-length extent ---|------ VLE ------|---  VLE ---|
         /> clusterofs                  /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
         \                             /                 /            /
          \                      /              /            /
           \               /            /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size

The term 'in-place' refers to the decompression mode: instead
of allocating separate compressed pages and data structures, it
reuses the already-allocated file cache pages as far as possible
to store both the compressed data and the corresponding pagevec
in a time-sharing manner by default, which is useful for
low-memory scenarios.

Finally, unlike other filesystems with (de)compression support
that use a relatively large compression block size, read and
decompress >= 128KB at once, and thereby get better-looking
random read numbers (in fact they turn small random reads into
large sequential reads and cache all decompressed data in memory,
which is not real random read and is unacceptable especially for
embedded devices with limited memory), we select a universally
small 4KB compressed cluster, the smallest page size for most
architectures. All compressed clusters can be read and
decompressed independently, which guarantees random read
performance for all use cases.
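
To make the layout above more concrete, here is a minimal,
purely illustrative sketch (hypothetical helper and parameter
names, not part of this patch): the decompressed length of one
variable-length extent runs from its own clusterofs inside its
head block up to the next extent's clusterofs, yet each such
extent always compresses into exactly one fixed-size cluster:

/*
 * illustrative sketch only: 4KB logical blocks as in this series;
 * extent i starts at head_blk * 4096 + clusterofs and ends where
 * extent i + 1 starts.
 */
static inline unsigned int sketch_extent_decompressed_len(
	unsigned int head_blk, unsigned int clusterofs,
	unsigned int next_head_blk, unsigned int next_clusterofs)
{
	return (next_head_blk * 4096 + next_clusterofs) -
	       (head_blk * 4096 + clusterofs);
}

Assuming every extent decompresses to at least one 4KB block, a
single page overlaps at most two extents, so a random 4KB read
never needs more than two compressed clusters.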

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
change log v3:
 - introduce the concept of a staging page and avoid the widely
   used page->mapping = NULL, so that more potential races (if any)
   can be detected if a page is truncated or freed by mistake.

 fs/erofs/inode.c     |    6 +-
 fs/erofs/internal.h  |    6 +
 fs/erofs/staging.h   |   46 ++
 fs/erofs/super.c     |   26 ++
 fs/erofs/unzip_vle.c | 1128 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h |  204 +++++++++
 fs/erofs/utils.c     |   61 ++-
 7 files changed, 1474 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5be7dea..fd444ec 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -273,6 +273,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -357,6 +360,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..47c9708d 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,49 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+
+	return kvmalloc(n * size, flags);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9a465bd..7e5333c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -426,6 +433,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -441,6 +454,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -449,6 +468,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -459,6 +482,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 96fd1114..011ef50 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,1133 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_vle_work_role {
+	Z_EROFS_VLE_WORK_SECONDARY,
+	Z_EROFS_VLE_WORK_PRIMARY,
+	/*
+	 * The current work has at least been linked with the following
+	 * processed chained works, which means if the processing page
+	 * is the tail partial page of the work, the current work can
+	 * safely use the whole page, as illustrated below:
+	 * +--------------+-------------------------------------------+
+	 * |  tail page   |      head page (of the previous work)     |
+	 * +--------------+-------------------------------------------+
+	 *   /\  which belongs to the current work
+	 * [  (*) this page can be used for the current work itself.  ]
+	 */
+	Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
+	Z_EROFS_VLE_WORK_MAX
+};
+
+struct z_erofs_vle_work_builder {
+	enum z_erofs_vle_work_role role;
+	/*
+	 * 'hosted = false' means that the current workgroup doesn't belong to
+	 * the owned chained workgroups. In other words, it is none of our
+	 * business to submit this workgroup.
+	 */
+	bool hosted;
+
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+#define VLE_WORK_BUILDER_INIT()	\
+	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_builder *b,
+	struct page *page)
+{
+	while (b->compressed_deficit) {
+		--b->compressed_deficit;
+		if (NULL == cmpxchg(b->compressed_pages++, NULL, page))
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must hold work->lock */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_builder *builder,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY &&
+		type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(builder, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&builder->vector,
+		page, type, &occupied);
+	builder->work->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline bool try_to_claim_workgroup(
+	struct z_erofs_vle_workgroup *grp,
+	z_erofs_vle_owned_workgrp_t *owned_head,
+	bool *hosted)
+{
+	DBG_BUGON(*hosted == true);
+
+	/* let's claim the following types of workgroup */
+retry:
+	if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) {
+		/* type 1, nil workgroup */
+		if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_NIL, *owned_head))
+			goto retry;
+
+		*owned_head = grp;
+		*hosted = true;
+	} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
+		/*
+		 * type 2, link to the end of an existing open chain,
+		 * be careful that its submission itself is governed
+		 * by the original owned chain.
+		 */
+		if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_TAIL, *owned_head))
+			goto retry;
+
+		*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the followee :) */
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_lookup(struct super_block *sb,
+			pgoff_t idx, unsigned pageofs,
+			struct z_erofs_vle_workgroup **grp_ret,
+			enum z_erofs_vle_work_role *role,
+			z_erofs_vle_owned_workgrp_t *owned_head,
+			bool *hosted)
+{
+	bool tag, primary;
+	struct erofs_workgroup *egrp;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	egrp = erofs_find_workgroup(sb, idx, &tag);
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_work(grp, pageofs);
+	primary = true;
+#else
+	BUG();
+#endif
+
+	DBG_BUGON(work->pageofs != pageofs);
+
+	/*
+	 * lock must be taken first to avoid grp->next == NIL between
+	 * claiming workgroup and adding pages:
+	 *                        grp->next != NIL
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                        mutex_lock(&work->lock)
+	 *                        add all pages to pagevec
+	 *
+	 * [correct locking case 1]:
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[b])     mutex_lock(grp->work[c])
+	 *   ...                          *role = SECONDARY
+	 *                                add all pages to pagevec
+	 *                                ...
+	 *                                mutex_unlock(grp->work[c])
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *
+	 * [correct locking case 2]:
+	 *   mutex_lock(grp->work[b])
+	 *   ...
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                                mutex_lock(grp->work[a])
+	 *                                *role = PRIMARY_OWNER
+	 *                                add all pages to pagevec
+	 *                                ...
+	 */
+	mutex_lock(&work->lock);
+
+	*hosted = false;
+	if (!primary)
+		*role = Z_EROFS_VLE_WORK_SECONDARY;
+	/* claim the workgroup if possible */
+	else if (try_to_claim_workgroup(grp, owned_head, hosted))
+		*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	else
+		*role = Z_EROFS_VLE_WORK_PRIMARY;
+
+	return work;
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup **grp_ret,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  enum z_erofs_vle_work_role *role,
+			  z_erofs_vle_owned_workgrp_t *owned_head,
+			  bool *hosted)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_workgroup *grp = *grp_ret;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_workgrp_fmt(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_VLE_WORKGRP_FMT_LZ4 :
+			Z_EROFS_VLE_WORKGRP_FMT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	/* new workgrps have been claimed as type 1 */
+	WRITE_ONCE(grp->next, *owned_head);
+	/* primary and followed work for all new workgrps */
+	*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	/* it should be submitted by ourselves */
+	*hosted = true;
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently unimplemented */
+	BUG();
+#else
+	work = z_erofs_vle_grab_primary_work(grp);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, 0);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = *grp_ret = grp;
+
+	mutex_lock(&work->lock);
+	return work;
+}
+
+static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
+					 unsigned int llen)
+{
+	while (1) {
+		unsigned int orig_llen = grp->llen;
+
+		if (orig_llen >= llen || orig_llen ==
+			cmpxchg(&grp->llen, orig_llen, llen))
+			break;
+	}
+}
+
+#define builder_is_followed(builder) \
+	((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	const unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	const erofs_blk_t index = erofs_blknr(map->m_pa);
+	const unsigned pageofs = map->m_la & ~PAGE_MASK;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	DBG_BUGON(builder->work != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	DBG_BUGON(erofs_blkoff(map->m_pa));
+
+repeat:
+	work = z_erofs_vle_work_lookup(sb, index,
+		pageofs, &grp, &builder->role, owned_head, &builder->hosted);
+	if (work != NULL) {
+		__update_workgrp_llen(grp, map->m_llen);
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, &grp, map, index, pageofs,
+		&builder->role, owned_head, &builder->hosted);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+got_it:
+	z_erofs_pagevec_ctor_init(&builder->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) {
+		/* enable possibly in-place decompression */
+		builder->compressed_pages = grp->compressed_pages;
+		builder->compressed_deficit = clusterpages;
+	} else {
+		builder->compressed_pages = NULL;
+		builder->compressed_deficit = 0;
+	}
+
+	builder->grp = grp;
+	builder->work = work;
+	return 0;
+}
+
+/*
+ * keep in mind that referenced workgroups are freed only
+ * after an RCU grace period, so rcu_read_lock() can
+ * prevent a workgroup from being freed.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->work;
+
+	call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp,
+	struct z_erofs_vle_work *work __maybe_unused)
+{
+	erofs_workgroup_put(&grp->obj);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	__z_erofs_vle_work_release(grp, work);
+}
+
+static inline bool
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder)
+{
+	struct z_erofs_vle_work *work = builder->work;
+
+	if (work == NULL)
+		return false;
+
+	z_erofs_pagevec_ctor_exit(&builder->vector, false);
+	mutex_unlock(&work->lock);
+
+	/*
+	 * if all pending pages are added, don't hold work reference
+	 * any longer if the current work isn't hosted by ourselves.
+	 */
+	if (!builder->hosted)
+		__z_erofs_vle_work_release(builder->grp, work);
+
+	builder->work = NULL;
+	builder->grp = NULL;
+	return true;
+}
+
+static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
+					       gfp_t gfp)
+{
+	struct page *page = erofs_allocpage(pagepool, gfp);
+
+	if (unlikely(page == NULL))
+		return NULL;
+
+	page->mapping = Z_EROFS_MAPPING_STAGING;
+	return page;
+}
+
+struct z_erofs_vle_frontend {
+	struct inode *const inode;
+
+	struct z_erofs_vle_work_builder builder;
+	struct erofs_map_blocks_iter m_iter;
+
+	z_erofs_vle_owned_workgrp_t owned_head;
+
+	bool initial;
+};
+
+#define VLE_FRONTEND_INIT(__i) { \
+	.inode = __i, \
+	.m_iter = { \
+		{ .m_llen = 0, .m_plen = 0 }, \
+		.mpage = NULL \
+	}, \
+	.builder = VLE_WORK_BUILDER_INIT(), \
+	.owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \
+	.initial = true, }
+
+static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
+				struct page *page,
+				struct list_head *page_pool)
+{
+	struct super_block *const sb = fe->inode->i_sb;
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	struct erofs_map_blocks_iter *const m = &fe->m_iter;
+	struct erofs_map_blocks *const map = &m->map;
+	struct z_erofs_vle_work_builder *const builder = &fe->builder;
+	const loff_t offset = page_offset(page);
+
+	bool tight = builder_is_followed(builder);
+	struct z_erofs_vle_work *work = builder->work;
+
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= map->m_la &&
+            offset + cur < map->m_la + map->m_llen)
+		goto hitted;
+
+	/* move on to the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (!z_erofs_vle_work_iter_end(builder))
+		fe->initial = false;
+
+	map->m_la = offset + cur;
+	map->m_llen = 0;
+	err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(map->m_plen != 1 << sbi->clusterbits);
+	BUG_ON(erofs_blkoff(map->m_pa));
+
+	err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	tight &= builder_is_followed(builder);
+	work = builder->work;
+hitted:
+	cur = end - min_t(unsigned, offset + end - map->m_la, end);
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(builder, page, page_type);
+	/* should allocate an additional staging page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *const newpage =
+			__stagingpage_alloc(page_pool, GFP_NOFS);
+
+		err = z_erofs_vle_work_add_page(builder,
+			newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - map->m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	map->m_llen = offset + cur - map->m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, map->m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool background = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (background)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+	unsigned i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		BUG_ON(page->mapping == NULL);
+
+		if (unlikely(err))
+			SetPageError(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_workgroup *grp,
+	struct list_head *page_pool)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	unsigned sparsemem_pages = 0;
+#endif
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_work *work;
+	void *vout;
+	int err;
+
+	might_sleep();
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_primary_work(grp);
+#else
+	BUG();
+#endif
+	BUG_ON(!READ_ONCE(work->nr_pages));
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc_array(nr_pages,
+			sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+
+		/* all pages in pagevec ought to be valid */
+		DBG_BUGON(page == NULL);
+		DBG_BUGON(page->mapping == NULL);
+
+		if (z_erofs_gather_if_stagingpage(page_pool, page))
+			continue;
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	compressed_pages = grp->compressed_pages;
+
+	for (i = 0; i < clusterpages; ++i) {
+		unsigned pagenr;
+
+		page = compressed_pages[i];
+
+		/* all compressed pages ought to be valid */
+		DBG_BUGON(page == NULL);
+		DBG_BUGON(page->mapping == NULL);
+
+		if (z_erofs_is_stagingpage(page))
+			continue;
+
+		/* only non-head page could be reused as a compressed page */
+		pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+
+		overlapped = true;
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) {
+		/* FIXME! this should be fixed in the future */
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs,
+		z_erofs_onlinepage_endio);
+	if (err != -ENOTSUPP)
+		goto out_percpu;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (sparsemem_pages >= nr_pages) {
+		BUG_ON(sparsemem_pages > nr_pages);
+		goto skip_allocpage;
+	}
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = __stagingpage_alloc(page_pool, GFP_NOFS);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+		DBG_BUGON(page->mapping == NULL);
+
+		/* recycle all individual staging pages */
+		if (z_erofs_gather_if_stagingpage(page_pool, page))
+			continue;
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+out_percpu:
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual staging pages */
+		(void)z_erofs_gather_if_stagingpage(page_pool, page);
+
+		WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	/* all work locks MUST be taken before the following line */
+
+	WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL);
+
+	/* all work locks SHOULD be released right now */
+	mutex_unlock(&work->lock);
+
+	z_erofs_vle_work_release(work);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	z_erofs_vle_owned_workgrp_t owned = io->head;
+
+	while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) {
+		struct z_erofs_vle_workgroup *grp;
+
+		/* it is impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL);
+
+		/* it is impossible that 'owned' equals NULL */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned;
+		owned = READ_ONCE(grp->next);
+
+		z_erofs_vle_unzip(sb, grp, page_pool);
+	};
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline struct z_erofs_vle_unzip_io *
+prepare_io_handler(struct super_block *sb,
+		   struct z_erofs_vle_unzip_io *io,
+		   bool background)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	if (!background) {
+		/* waitqueue available for foreground io */
+		BUG_ON(io == NULL);
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+		goto out;
+	}
+
+	if (io != NULL)
+		BUG();
+	else {
+		/* allocate extra io descriptor for background io */
+		iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+			GFP_KERNEL | __GFP_NOFAIL);
+		BUG_ON(iosb == NULL);
+
+		io = &iosb->io;
+	}
+
+	iosb->sb = sb;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+	io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED;
+	return io;
+}
+
+#define __FSIO_1 0
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_vle_owned_workgrp_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_vle_unzip_io *fg_io,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+	const gfp_t gfp = GFP_NOFS;
+	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
+	struct bio *bio;
+	tagptr1_t bi_private;
+	pgoff_t last_index;
+	bool force_submit = false;
+	unsigned nr_bios;
+
+	if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL))
+		return false;
+
+	/*
+	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
+         * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
+	 */
+	if (force_fg) {
+		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
+	} else {
+		ios[__FSIO_1] = prepare_io_handler(sb, NULL, true);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1);
+	}
+
+	nr_bios = 0;
+	force_submit = false;
+	bio = NULL;
+
+	/* by default, all need io submission */
+	ios[__FSIO_1]->head = owned_head;
+
+	do {
+		struct z_erofs_vle_workgroup *grp;
+		struct page **compressed_pages, *oldpage, *page;
+		pgoff_t first_index;
+		unsigned i = 0;
+		int err;
+
+		/* it is impossible that 'owned_head' equals the following */
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned_head;
+
+		/* close the main owned chain at first */
+		owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
+			Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+		first_index = grp->obj.index;
+		compressed_pages = grp->compressed_pages;
+
+		force_submit |= (first_index != last_index + 1);
+repeat:
+		/* fulfill all compressed pages */
+		oldpage = page = READ_ONCE(compressed_pages[i]);
+
+		if (page != NULL)
+			BUG_ON(PageUptodate(page));
+		else {
+			page = __stagingpage_alloc(pagepool, gfp);
+
+			if (oldpage != cmpxchg(compressed_pages + i,
+				oldpage, page)) {
+				list_add(&page->lru, pagepool);
+				goto repeat;
+			}
+		}
+
+		if (bio != NULL && force_submit) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+
+		if (bio == NULL) {
+			bio = prepare_bio(sb, first_index + i,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(bi_private);
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+		if (++i < clusterpages)
+			goto repeat;
+	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	BUG_ON(!nr_bios);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct super_block *sb = f->inode->i_sb;
+	struct z_erofs_vle_unzip_io io[1 + __FSIO_1];
+
+	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
+		return;
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[__FSIO_1].u.wait,
+		!atomic_read(&io[__FSIO_1].pending_bios));
+
+	/* let's do synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct inode *const inode = page->mapping->host;
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	int err;
+	LIST_HEAD(pagepool);
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	z_erofs_submit_and_unzip(&f, &pagepool, true);
+out:
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct inode *const inode = mapping->host;
+
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	z_erofs_submit_and_unzip(&f, &pagepool, sync);
+
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 8e23e44..b276af1 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,213 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/*
+ *  - 0x5FA110C8D ('fsallocated', Z_EROFS_MAPPING_STAGING) -
+ * used for temporarily allocated pages (via erofs_allocpage),
+ * in order to separate those from NULL mapping (e.g. truncated pages)
+ */
+#define Z_EROFS_MAPPING_STAGING		((void *)0x5FA110C8D)
+
+#define z_erofs_is_stagingpage(page)	\
+	((page)->mapping == Z_EROFS_MAPPING_STAGING)
+
+static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool,
+						 struct page *page)
+{
+	if (z_erofs_is_stagingpage(page)) {
+		list_add(&page->lru, page_pool);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;
+
+	atomic_t refcount;
+#endif
+	struct mutex lock;
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
+
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	struct z_erofs_vle_work work;
+
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
+
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+	unsigned int llen, flags;
+};
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
+
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
+
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK);
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is unimplemented yet
+#else
+
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+
+#endif
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_vle_owned_workgrp_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ab37072..dd1ce5f 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
 
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+	}
+	return count;
+}
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		first_index = grp->index + 1;
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
 }
 
 #endif
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 102+ messages in thread

* [RFC PATCH v3 11/11] erofs: introduce cached decompression
  2018-07-20 16:55     ` [RFC PATCH v3 " Gao Xiang
@ 2018-07-20 16:55       ` Gao Xiang
  2018-07-20 17:29       ` [RFC PATCH v3 RESEND 10/11] erofs: introduce VLE decompression support Gao Xiang
  1 sibling, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20 16:55 UTC (permalink / raw)


This patch adds an optional mode, selectable by users,
which caches both incomplete ends of compressed clusters
as a complement to in-place decompression. It boosts
random read performance, but costs more memory than
in-place decompression alone.
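
As a very rough sketch of the bipolar policy described above
(hypothetical helper name and parameters; in the patch itself
this decision is folded into the reserve_allocation argument
passed to grab_managed_cache_pages()), a cluster's compressed
pages are reserved in the managed cache when the cluster is the
first one the request walks over or when it starts at or before
the request's first logical offset, i.e. the two "poles" of the
request:

#include <linux/types.h>

/*
 * illustrative sketch only: decide whether to reserve a cluster's
 * compressed pages in the managed page cache under the bipolar
 * ("both ends") policy.
 */
static inline bool sketch_should_cache_cluster(bool first_cluster,
					       unsigned long long cluster_la,
					       unsigned long long request_la)
{
	return first_cluster || cluster_la <= request_la;
}

(In the unipolar variant, only the first condition is used.)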

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
change log v3:
 - introduce the concept of a staging page and avoid the widely
   used page->mapping = NULL, so that more potential races (if any)
   can be detected if a page is truncated or freed by mistake.
 - minor logic optimization

 fs/erofs/Kconfig     |  38 +++++++
 fs/erofs/internal.h  |  25 +++++
 fs/erofs/super.c     |  75 +++++++++++++-
 fs/erofs/unzip_vle.c | 275 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/utils.c     |  17 +++-
 5 files changed, 428 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 00e811c..d08c019 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -99,3 +99,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT
 	  than 2. Otherwise, the image cannot be mounted
 	  correctly on this kernel.
 
+choice
+	prompt "EROFS VLE Data Decompression mode"
+	depends on EROFS_FS_ZIP
+	default EROFS_FS_ZIP_CACHE_BIPOLAR
+	help
+	  EROFS supports three options for VLE decompression.
+	  "In-place Decompression Only" consumes the minimum memory
+	  with the lowest random read performance.
+
+	  "Bipolar Cached Decompression" consumes the maximum memory
+	  with the highest random read performance.
+
+	  If unsure, select "Bipolar Cached Decompression".
+
+config EROFS_FS_ZIP_NO_CACHE
+	bool "In-place Decompression Only"
+	help
+	  Read compressed data into page cache and do in-place
+	  decompression directly.
+
+config EROFS_FS_ZIP_CACHE_UNIPOLAR
+	bool "Unipolar Cached Decompression"
+	help
+	  For each request, it caches the last compressed page
+	  for further reading.
+	  It still decompresses the rest of the compressed pages in place.
+
+config EROFS_FS_ZIP_CACHE_BIPOLAR
+	bool "Bipolar Cached Decompression"
+	help
+	  For each request, it caches the compressed pages at both ends
+	  for further reading.
+	  It still decompresses the rest of the compressed pages in place.
+
+	  Recommended for performance priority.
+
+endchoice
+
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index fd444ec..5667f56 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,18 @@ struct erofs_fault_info {
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR
+#define EROFS_FS_ZIP_CACHE_LVL	(2)
+#elif defined(EROFS_FS_ZIP_CACHE_UNIPOLAR)
+#define EROFS_FS_ZIP_CACHE_LVL	(1)
+#else
+#define EROFS_FS_ZIP_CACHE_LVL	(0)
+#endif
+
+#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0))
+#define EROFS_FS_HAS_MANAGED_CACHE
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -88,6 +100,11 @@ struct erofs_sb_info {
 		spinlock_t lock;
 #endif
 	} workstn;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct inode *managed_cache;
+#endif
+
 #endif
 
 	u32 build_time_nsec;
@@ -251,6 +268,14 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+#define EROFS_UNALLOCATED_CACHED_PAGE	((void *)0x5F0EF00D)
+
+extern int try_to_free_cached_page(struct address_space *, struct page *);
+extern int try_to_free_all_cached_pages(struct erofs_sb_info *,
+	struct erofs_workgroup *);
+#endif
+
 #endif
 
 /* we strictly follow PAGE_SIZE and no buffer head */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 7e5333c..5a940c7 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -247,6 +247,63 @@ static int parse_options(struct super_block *sb, char *options)
 	return 0;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static const struct address_space_operations managed_cache_aops;
+
+static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	int ret = 1;	/* 0 - busy */
+	struct address_space *const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping->a_ops != &managed_cache_aops);
+
+	if (PagePrivate(page))
+		ret = try_to_free_cached_page(mapping, page);
+
+	return ret;
+}
+
+static void managed_cache_invalidatepage(struct page *page,
+	unsigned int offset, unsigned int length)
+{
+	const unsigned int stop = length + offset;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Check for overflow */
+	BUG_ON(stop > PAGE_SIZE || stop < length);
+
+	if (offset == 0 && stop == PAGE_SIZE)
+		while (!managed_cache_releasepage(page, GFP_NOFS))
+			cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+	.releasepage = managed_cache_releasepage,
+	.invalidatepage = managed_cache_invalidatepage,
+};
+
+struct inode *erofs_init_managed_cache(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (unlikely(inode == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &managed_cache_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+	                     GFP_NOFS | __GFP_HIGHMEM |
+	                     __GFP_MOVABLE |  __GFP_NOFAIL);
+	return inode;
+}
+
+#endif
+
 static int erofs_read_super(struct super_block *sb,
 	const char *dev_name, void *data, int silent)
 {
@@ -301,11 +358,19 @@ static int erofs_read_super(struct super_block *sb,
 #endif
 #endif
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	sbi->managed_cache = erofs_init_managed_cache(sb);
+	if (IS_ERR(sbi->managed_cache)) {
+		err = PTR_ERR(sbi->managed_cache);
+		goto err_sbi;
+	}
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		goto err_sbi;
+		goto iget_err;
 	}
 
 	if (!S_ISDIR(inode->i_mode)) {
@@ -348,6 +413,10 @@ static int erofs_read_super(struct super_block *sb,
 err_iput:
 	if (sb->s_root == NULL)
 		iput(inode);
+iget_err:
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
 err_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -370,6 +439,10 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
+
 	mutex_lock(&sbi->umount_mutex);
 
 #ifdef CONFIG_EROFS_FS_ZIP
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 011ef50..2b42ee6 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -95,6 +95,111 @@ struct z_erofs_vle_work_builder {
 #define VLE_WORK_BUILDER_INIT()	\
 	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static bool grab_managed_cache_pages(struct address_space *mapping,
+				     erofs_blk_t start,
+				     struct page **compressed_pages,
+				     int clusterblks,
+				     bool reserve_allocation)
+{
+	bool noio = true;
+	unsigned int i;
+
+	/* TODO: optimize by introducing find_get_pages_range */
+	for (i = 0; i < clusterblks; ++i) {
+		struct page *page, *found;
+
+		if (READ_ONCE(compressed_pages[i]) != NULL)
+			continue;
+
+		page = found = find_get_page(mapping, start + i);
+		if (found == NULL) {
+			noio = false;
+			if (!reserve_allocation)
+				continue;
+			page = EROFS_UNALLOCATED_CACHED_PAGE;
+		}
+
+		if (NULL == cmpxchg(compressed_pages + i, NULL, page))
+			continue;
+
+		if (found != NULL)
+			put_page(found);
+	}
+	return noio;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+				 struct erofs_workgroup *egrp)
+{
+	struct z_erofs_vle_workgroup *const grp =
+		container_of(egrp, struct z_erofs_vle_workgroup, obj);
+	struct address_space *const mapping = sbi->managed_cache->i_mapping;
+	const int clusterpages = erofs_clusterpages(sbi);
+	int i;
+
+	/*
+	 * the refcount of the workgroup is now frozen at 1,
+	 * therefore no need to worry about available decompression users.
+	 */
+	for (i = 0; i < clusterpages; ++i) {
+		struct page *page = grp->compressed_pages[i];
+
+		if (page == NULL || page->mapping != mapping)
+			continue;
+
+		/* block other users from reclaiming or migrating the page */
+		if (!trylock_page(page))
+			return -EBUSY;
+
+		/* barrier is implied in the following 'unlock_page' */
+		WRITE_ONCE(grp->compressed_pages[i], NULL);
+
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+	return 0;
+}
+
+int try_to_free_cached_page(struct address_space *mapping, struct page *page)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_vle_workgroup *grp;
+	int ret = 0;	/* 0 - busy */
+
+	/* prevent the workgroup from being freed */
+	rcu_read_lock();
+	grp = (void *)page_private(page);
+
+	if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
+		unsigned i;
+
+		for (i = 0; i < clusterpages; ++i) {
+			if (grp->compressed_pages[i] == page) {
+				WRITE_ONCE(grp->compressed_pages[i], NULL);
+				ret = 1;
+				break;
+			}
+		}
+		erofs_workgroup_unfreeze(&grp->obj, 1);
+	}
+	rcu_read_unlock();
+
+	if (ret) {
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+	return ret;
+}
+#endif
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
 	struct z_erofs_vle_work_builder *b,
@@ -463,6 +568,9 @@ struct z_erofs_vle_frontend {
 	z_erofs_vle_owned_workgrp_t owned_head;
 
 	bool initial;
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	erofs_off_t cachedzone_la;
+#endif
 };
 
 #define VLE_FRONTEND_INIT(__i) { \
@@ -489,6 +597,12 @@ static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
 	bool tight = builder_is_followed(builder);
 	struct z_erofs_vle_work *work = builder->work;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const mngda = sbi->managed_cache->i_mapping;
+	struct z_erofs_vle_workgroup *grp;
+	bool noio_outoforder;
+#endif
+
 	enum z_erofs_page_type page_type;
 	unsigned cur, end, spiltted, index;
 	int err;
@@ -529,6 +643,21 @@ static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
 	if (unlikely(err))
 		goto err_out;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	grp = fe->builder.grp;
+
+	/* let's do out-of-order decompression for noio */
+	noio_outoforder = grab_managed_cache_pages(
+		mngda, erofs_blknr(map->m_pa),
+		grp->compressed_pages, erofs_blknr(map->m_plen),
+		/* compressed page caching policy */
+		fe->initial | (EROFS_FS_ZIP_CACHE_LVL >= 2 ?
+			map->m_la < fe->cachedzone_la : 0));
+
+	if (noio_outoforder && builder_is_followed(builder))
+		builder->role = Z_EROFS_VLE_WORK_PRIMARY;
+#endif
+
 	tight &= builder_is_followed(builder);
 	work = builder->work;
 hitted:
@@ -616,15 +745,39 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 #endif
 	unsigned i;
 	struct bio_vec *bvec;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *mngda = NULL;
+#endif
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
+		bool cachemngd = false;
 
 		DBG_BUGON(PageUptodate(page));
 		BUG_ON(page->mapping == NULL);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (unlikely(mngda == NULL && !z_erofs_is_stagingpage(page))) {
+			struct inode *const inode = page->mapping->host;
+			struct super_block *const sb = inode->i_sb;
+
+			mngda = EROFS_SB(sb)->managed_cache->i_mapping;
+		}
+
+		/*
+		 * If mngda has not been fetched yet, it is still NULL;
+		 * however, page->mapping is never NULL if everything works properly.
+		 */
+		cachemngd = (page->mapping == mngda);
+#endif
+
 		if (unlikely(err))
 			SetPageError(page);
+		else if (cachemngd)
+			SetPageUptodate(page);
+
+		if (cachemngd)
+			unlock_page(page);
 	}
 
 	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
@@ -639,6 +792,9 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	struct list_head *page_pool)
 {
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const mngda = sbi->managed_cache->i_mapping;
+#endif
 	const unsigned clusterpages = erofs_clusterpages(sbi);
 
 	struct z_erofs_pagevec_ctor ctor;
@@ -736,6 +892,13 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
 		if (z_erofs_is_stagingpage(page))
 			continue;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		else if (page->mapping == mngda) {
+			BUG_ON(PageLocked(page));
+			BUG_ON(!PageUptodate(page));
+			continue;
+		}
+#endif
 
 		/* only non-head page could be reused as a compressed page */
 		pagenr = z_erofs_onlinepage_index(page);
@@ -813,6 +976,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	for (i = 0; i < clusterpages; ++i) {
 		page = compressed_pages[i];
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping == mngda)
+			continue;
+#endif
 		/* recycle all individual staging pages */
 		(void)z_erofs_gather_if_stagingpage(page_pool, page);
 
@@ -907,7 +1074,32 @@ static void z_erofs_vle_unzip_wq(struct work_struct *work)
 	return io;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* true - unlocked (noio), false - locked (need submit io) */
+static inline bool recover_managed_page(
+	struct z_erofs_vle_workgroup *grp,
+	struct page *page)
+{
+	wait_on_page_locked(page);
+	if (PagePrivate(page) && PageUptodate(page))
+		return true;
+
+	lock_page(page);
+	if (unlikely(!PagePrivate(page))) {
+		set_page_private(page, (unsigned long)grp);
+		SetPagePrivate(page);
+	}
+	if (unlikely(PageUptodate(page))) {
+		unlock_page(page);
+		return true;
+	}
+	return false;
+}
+
+#define __FSIO_1 1
+#else
 #define __FSIO_1 0
+#endif
 
 static bool z_erofs_vle_submit_all(struct super_block *sb,
 				   z_erofs_vle_owned_workgrp_t owned_head,
@@ -918,6 +1110,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 	const unsigned clusterpages = erofs_clusterpages(sbi);
 	const gfp_t gfp = GFP_NOFS;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const mngda = sbi->managed_cache->i_mapping;
+	struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL;
+#endif
 	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
 	struct bio *bio;
 	tagptr1_t bi_private;
@@ -932,6 +1128,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
 	 * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
 	 */
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	ios[0] = prepare_io_handler(sb, fg_io + 0, false);
+#endif
+
 	if (force_fg) {
 		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
 		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
@@ -952,6 +1152,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		struct page **compressed_pages, *oldpage, *page;
 		pgoff_t first_index;
 		unsigned i = 0;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		unsigned noio = 0;
+		bool cachemngd;
+#endif
 		int err;
 
 		/* no possible 'owned_head' equals the following */
@@ -972,15 +1176,40 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		/* fulfill all compressed pages */
 		oldpage = page = READ_ONCE(compressed_pages[i]);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		cachemngd = false;
+
+		if (page == EROFS_UNALLOCATED_CACHED_PAGE) {
+			cachemngd = true;
+			goto do_allocpage;
+		} else if (page != NULL) {
+			if (page->mapping != mngda)
+				BUG_ON(PageUptodate(page));
+			else if (recover_managed_page(grp, page)) {
+				/* page is uptodate, skip io submission */
+				force_submit = true;
+				++noio;
+				goto skippage;
+			}
+		} else {
+do_allocpage:
+#else
 		if (page != NULL)
 			BUG_ON(PageUptodate(page));
 		else {
+#endif
 			page = __stagingpage_alloc(pagepool, gfp);
 
 			if (oldpage != cmpxchg(compressed_pages + i,
 				oldpage, page)) {
 				list_add(&page->lru, pagepool);
 				goto repeat;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+			} else if (cachemngd && !add_to_page_cache_lru(page,
+				mngda, first_index + i, gfp)) {
+				set_page_private(page, (unsigned long)grp);
+				SetPagePrivate(page);
+#endif
 			}
 		}
 
@@ -1004,14 +1233,51 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 
 		force_submit = false;
 		last_index = first_index + i;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skippage:
+#endif
 		if (++i < clusterpages)
 			goto repeat;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (noio < clusterpages)
+			lstgrp_io = grp;
+		else {
+			z_erofs_vle_owned_workgrp_t iogrp_next =
+				owned_head == Z_EROFS_VLE_WORKGRP_TAIL ?
+				Z_EROFS_VLE_WORKGRP_TAIL_CLOSED :
+				owned_head;
+
+			if (lstgrp_io == NULL)
+				ios[1]->head = iogrp_next;
+			else
+				WRITE_ONCE(lstgrp_io->next, iogrp_next);
+
+			if (lstgrp_noio == NULL)
+				ios[0]->head = grp;
+			else
+				WRITE_ONCE(lstgrp_noio->next, grp);
+
+			lstgrp_noio = grp;
+		}
+#endif
 	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
 
 	if (bio != NULL)
 		__submit_bio(bio, REQ_OP_READ, 0);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 	BUG_ON(!nr_bios);
+#else
+	if (lstgrp_noio != NULL)
+		WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	if (!force_fg && !nr_bios) {
+		kvfree(container_of(ios[1],
+			struct z_erofs_vle_unzip_io_sb, io));
+		return true;
+	}
+#endif
 
 	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
 	return true;
@@ -1027,6 +1293,9 @@ static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
 	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
 		return;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	z_erofs_vle_unzip_all(sb, &io[0], pagepool);
+#endif
 	if (!force_fg)
 		return;
 
@@ -1046,6 +1315,9 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
 	int err;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = page->index << PAGE_SHIFT;
+#endif
 	err = z_erofs_do_read_page(&f, page, &pagepool);
 	(void)z_erofs_vle_work_iter_end(&f.builder);
 
@@ -1076,6 +1348,9 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 	struct page *head = NULL;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT;
+#endif
 	for (; nr_pages; --nr_pages) {
 		struct page *page = lru_to_page(pages);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dd1ce5f..b669ca3 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -143,13 +143,28 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 		if (cleanup)
 			BUG_ON(cnt != 1);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 		else if (cnt > 1)
+#else
+		if (!erofs_workgroup_try_to_freeze(grp, 1))
+#endif
 			continue;
 
 		if (radix_tree_delete(&sbi->workstn.tree,
-			grp->index) != grp)
+			grp->index) != grp) {
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skip:
+			erofs_workgroup_unfreeze(grp, 1);
+#endif
 			continue;
+		}
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (try_to_free_all_cached_pages(sbi, grp))
+			goto skip;
+
+		erofs_workgroup_unfreeze(grp, 1);
+#endif
 		/* (rarely) grabbed again when freeing */
 		erofs_workgroup_put(grp);
 
-- 
1.9.1


* [RFC PATCH v3 RESEND 10/11] erofs: introduce VLE decompression support
  2018-07-20 16:55     ` [RFC PATCH v3 " Gao Xiang
  2018-07-20 16:55       ` [RFC PATCH v3 11/11] erofs: introduce cached decompression Gao Xiang
@ 2018-07-20 17:29       ` Gao Xiang
  1 sibling, 0 replies; 102+ messages in thread
From: Gao Xiang @ 2018-07-20 17:29 UTC (permalink / raw)


This patch introduces the basic in-place VLE decompression
implementation for the erofs file system.

Compared with fixed-sized input compression, it implements
what we call 'variable-length extent compression', which fixes
the output size of each compression block instead. This makes
full use of IO bandwidth (almost all data read from the block
device can be fed directly into decompression), improves real
random read performance (rather than merely improving it via
data caching, which costs more memory), and still achieves
relatively good compression ratios (it saves more storage space
than fixed-sized input compression configured with the same
input block size), as illustrated below:

        |---  variable-length extent ---|------ VLE ------|---  VLE ---|
         /> clusterofs                  /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
         \                             /                 /            /
          \                      /              /            /
           \               /            /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size

The main point of 'in-place' is the decompression mode:
instead of allocating independent pages and data structures
for the compressed data, it reuses the already-allocated file
cache pages as much as possible to store the compressed data
and the corresponding pagevec in a time-sharing manner by
default, which is useful for low-memory scenarios.

Finally, unlike other filesystems with (de)compression support,
which use a relatively large compression block size and read and
decompress >= 128KB at once to get better-looking random read
numbers (in fact, they collect small random reads into large
sequential reads and cache all decompressed data in memory; that
is unacceptable for embedded devices with limited memory and is
not true random read), we select a universal small 4KB compressed
cluster size, which is the smallest page size on most
architectures. All compressed clusters can be read and
decompressed independently, which guarantees random read
performance for all use cases.
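
As an aside (not part of this patch), here is a minimal user-space
sketch of the fixed-output idea above: every compressed cluster
occupies exactly 4KB on disk while its logical (decompressed) length
varies, so any random read can be served by reading a single 4KB
cluster. All structure names and numbers below are made up purely
for illustration:

  #include <stdio.h>

  #define CLUSTER_SIZE 4096u	/* fixed compressed (output) size */

  struct vle_cluster {
  	unsigned long long la;	/* logical (decompressed) start */
  	unsigned int llen;	/* logical (decompressed) length */
  	unsigned long long pa;	/* physical start: index * CLUSTER_SIZE */
  };

  int main(void)
  {
  	/* three clusters with different decompressed lengths */
  	const struct vle_cluster c[] = {
  		{ .la = 0,     .llen = 9000,  .pa = 0 * CLUSTER_SIZE },
  		{ .la = 9000,  .llen = 6500,  .pa = 1 * CLUSTER_SIZE },
  		{ .la = 15500, .llen = 11000, .pa = 2 * CLUSTER_SIZE },
  	};
  	const unsigned long long la = 12345;	/* random logical offset */
  	unsigned int i;

  	for (i = 0; i < sizeof(c) / sizeof(c[0]); ++i) {
  		if (la >= c[i].la && la < c[i].la + c[i].llen) {
  			/* a single 4KB physical read always suffices */
  			printf("la %llu -> cluster %u (pa %llu, read %u bytes)\n",
  			       la, i, c[i].pa, CLUSTER_SIZE);
  			break;
  		}
  	}
  	return 0;
  }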

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 - introduce the concept of a staging page and avoid the widely
   used page->mapping = NULL, so that more potential races (if any
   exist) can be detected when a page is truncated or freed by
   mistake; a minimal illustrative sketch of this idea follows
   these notes.

 - [RESEND] silence a spurious uninitialized last_index warning;
   last_index does not need to be initialized at the beginning of
   z_erofs_vle_submit_all.
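
   A stand-alone sketch of the staging-page idea (again, not part of
   the patch; the sentinel value and helper names here are invented,
   while the real ones are Z_EROFS_MAPPING_STAGING and
   z_erofs_is_stagingpage in unzip_vle.h):

     #include <stdio.h>
     #include <stddef.h>

     struct page { void *mapping; };

     /*
      * A dedicated sentinel 'mapping' marks temporarily allocated
      * (staging) pages, so they can no longer be confused with
      * truncated pages whose mapping is NULL, and misuse can be
      * caught by assertions instead of being silently ignored.
      */
     #define STAGING_MAPPING	((void *)0x5F0)

     static int is_stagingpage(const struct page *page)
     {
     	return page->mapping == STAGING_MAPPING;
     }

     int main(void)
     {
     	struct page staging = { .mapping = STAGING_MAPPING };
     	struct page truncated = { .mapping = NULL };

     	printf("staging? %d, truncated? %d\n",
     	       is_stagingpage(&staging), is_stagingpage(&truncated));
     	return 0;
     }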

 fs/erofs/inode.c     |    6 +-
 fs/erofs/internal.h  |    6 +
 fs/erofs/staging.h   |   46 ++
 fs/erofs/super.c     |   26 ++
 fs/erofs/unzip_vle.c | 1129 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/unzip_vle.h |  204 +++++++++
 fs/erofs/utils.c     |   61 ++-
 7 files changed, 1475 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 573d3d3..699ce4f 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -207,8 +207,12 @@ int fill_inode(struct inode *inode, int isdir)
 			goto out_unlock;
 		}
 
-		/* for compression or unknown data mapping mode */
+		/* for compression mapping mode */
+#ifdef CONFIG_EROFS_FS_ZIP
+		inode->i_mapping->a_ops = &z_erofs_vle_normal_access_aops;
+#else
 		err = -ENOTSUPP;
+#endif
 	}
 
 out_unlock:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5be7dea..fd444ec 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -273,6 +273,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -357,6 +360,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_unaligned_compressed_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normal_access_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h
index a9bfd8c..47c9708d 100644
--- a/fs/erofs/staging.h
+++ b/fs/erofs/staging.h
@@ -85,3 +85,49 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+
+	return kvmalloc(n * size, flags);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9a465bd..7e5333c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -118,6 +118,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -426,6 +433,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	.fs_flags       = FS_REQUIRES_DEV,
 };
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -441,6 +454,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -449,6 +468,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -459,6 +482,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("Successfully finalize erofs");
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index 96fd1114..d3da679 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -10,7 +10,1134 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * we don't need too many threads; limiting the number of
+	 * threads could improve scheduling performance.
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_vle_work_role {
+	Z_EROFS_VLE_WORK_SECONDARY,
+	Z_EROFS_VLE_WORK_PRIMARY,
+	/*
+	 * The current work has at least been linked with the following
+	 * processed chained works, which means if the processing page
+	 * is the tail partial page of the work, the current work can
+	 * safely use the whole page, as illustrated below:
+	 * +--------------+-------------------------------------------+
+	 * |  tail page   |      head page (of the previous work)     |
+	 * +--------------+-------------------------------------------+
+	 *   /\  which belongs to the current work
+	 * [  (*) this page can be used for the current work itself.  ]
+	 */
+	Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
+	Z_EROFS_VLE_WORK_MAX
+};
+
+struct z_erofs_vle_work_builder {
+	enum z_erofs_vle_work_role role;
+	/*
+	 * 'hosted = false' means that the current workgroup doesn't belong to
+	 * the owned chained workgroups. In other words, it is none of our
+	 * business to submit this workgroup.
+	 */
+	bool hosted;
+
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+#define VLE_WORK_BUILDER_INIT()	\
+	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_builder *b,
+	struct page *page)
+{
+	while (b->compressed_deficit) {
+		--b->compressed_deficit;
+		if (NULL == cmpxchg(b->compressed_pages++, NULL, page))
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_builder *builder,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY &&
+		type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(builder, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&builder->vector,
+		page, type, &occupied);
+	builder->work->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline bool try_to_claim_workgroup(
+	struct z_erofs_vle_workgroup *grp,
+	z_erofs_vle_owned_workgrp_t *owned_head,
+	bool *hosted)
+{
+	DBG_BUGON(*hosted == true);
+
+	/* let's claim the following types of workgroup */
+retry:
+	if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) {
+		/* type 1, nil workgroup */
+		if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_NIL, *owned_head))
+			goto retry;
+
+		*owned_head = grp;
+		*hosted = true;
+	} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
+		/*
+		 * type 2, link to the end of an existing open chain,
+		 * be careful that its submission itself is governed
+		 * by the original owned chain.
+		 */
+		if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_TAIL, *owned_head))
+			goto retry;
+
+		*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the followee :) */
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_lookup(struct super_block *sb,
+			pgoff_t idx, unsigned pageofs,
+			struct z_erofs_vle_workgroup **grp_ret,
+			enum z_erofs_vle_work_role *role,
+			z_erofs_vle_owned_workgrp_t *owned_head,
+			bool *hosted)
+{
+	bool tag, primary;
+	struct erofs_workgroup *egrp;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	egrp = erofs_find_workgroup(sb, idx, &tag);
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_work(grp, pageofs);
+	primary = true;
+#else
+	BUG();
+#endif
+
+	DBG_BUGON(work->pageofs != pageofs);
+
+	/*
+	 * lock must be taken first to avoid grp->next == NIL between
+	 * claiming workgroup and adding pages:
+	 *                        grp->next != NIL
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                        mutex_lock(&work->lock)
+	 *                        add all pages to pagevec
+	 *
+	 * [correct locking case 1]:
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[b])     mutex_lock(grp->work[c])
+	 *   ...                          *role = SECONDARY
+	 *                                add all pages to pagevec
+	 *                                ...
+	 *                                mutex_unlock(grp->work[c])
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *
+	 * [correct locking case 2]:
+	 *   mutex_lock(grp->work[b])
+	 *   ...
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                                mutex_lock(grp->work[a])
+	 *                                *role = PRIMARY_OWNER
+	 *                                add all pages to pagevec
+	 *                                ...
+	 */
+	mutex_lock(&work->lock);
+
+	*hosted = false;
+	if (!primary)
+		*role = Z_EROFS_VLE_WORK_SECONDARY;
+	/* claim the workgroup if possible */
+	else if (try_to_claim_workgroup(grp, owned_head, hosted))
+		*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	else
+		*role = Z_EROFS_VLE_WORK_PRIMARY;
+
+	return work;
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup **grp_ret,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  enum z_erofs_vle_work_role *role,
+			  z_erofs_vle_owned_workgrp_t *owned_head,
+			  bool *hosted)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_workgroup *grp = *grp_ret;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_workgrp_fmt(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_VLE_WORKGRP_FMT_LZ4 :
+			Z_EROFS_VLE_WORKGRP_FMT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	/* new workgrps have been claimed as type 1 */
+	WRITE_ONCE(grp->next, *owned_head);
+	/* primary and followed work for all new workgrps */
+	*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	/* it should be submitted by ourselves */
+	*hosted = true;
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently unimplemented */
+	BUG();
+#else
+	work = z_erofs_vle_grab_primary_work(grp);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, 0);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = *grp_ret = grp;
+
+	mutex_lock(&work->lock);
+	return work;
+}
+
+static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
+					 unsigned int llen)
+{
+	while (1) {
+		unsigned int orig_llen = grp->llen;
+
+		if (orig_llen >= llen || orig_llen ==
+			cmpxchg(&grp->llen, orig_llen, llen))
+			break;
+	}
+}
+
+#define builder_is_followed(builder) \
+	((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	const unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	const erofs_blk_t index = erofs_blknr(map->m_pa);
+	const unsigned pageofs = map->m_la & ~PAGE_MASK;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	DBG_BUGON(builder->work != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	DBG_BUGON(erofs_blkoff(map->m_pa));
+
+repeat:
+	work = z_erofs_vle_work_lookup(sb, index,
+		pageofs, &grp, &builder->role, owned_head, &builder->hosted);
+	if (work != NULL) {
+		__update_workgrp_llen(grp, map->m_llen);
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, &grp, map, index, pageofs,
+		&builder->role, owned_head, &builder->hosted);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+got_it:
+	z_erofs_pagevec_ctor_init(&builder->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) {
+		/* enable possibly in-place decompression */
+		builder->compressed_pages = grp->compressed_pages;
+		builder->compressed_deficit = clusterpages;
+	} else {
+		builder->compressed_pages = NULL;
+		builder->compressed_deficit = 0;
+	}
+
+	builder->grp = grp;
+	builder->work = work;
+	return 0;
+}
+
+/*
+ * keep in mind that referenced workgroups are only freed after
+ * an RCU grace period, so rcu_read_lock() can prevent a
+ * workgroup from being freed.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->work;
+
+	call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp,
+	struct z_erofs_vle_work *work __maybe_unused)
+{
+	erofs_workgroup_put(&grp->obj);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	__z_erofs_vle_work_release(grp, work);
+}
+
+static inline bool
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder)
+{
+	struct z_erofs_vle_work *work = builder->work;
+
+	if (work == NULL)
+		return false;
+
+	z_erofs_pagevec_ctor_exit(&builder->vector, false);
+	mutex_unlock(&work->lock);
+
+	/*
+	 * if all pending pages are added, don't hold work reference
+	 * any longer if the current work isn't hosted by ourselves.
+	 */
+	if (!builder->hosted)
+		__z_erofs_vle_work_release(builder->grp, work);
+
+	builder->work = NULL;
+	builder->grp = NULL;
+	return true;
+}
+
+static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
+					       gfp_t gfp)
+{
+	struct page *page = erofs_allocpage(pagepool, gfp);
+
+	if (unlikely(page == NULL))
+		return NULL;
+
+	page->mapping = Z_EROFS_MAPPING_STAGING;
+	return page;
+}
+
+struct z_erofs_vle_frontend {
+	struct inode *const inode;
+
+	struct z_erofs_vle_work_builder builder;
+	struct erofs_map_blocks_iter m_iter;
+
+	z_erofs_vle_owned_workgrp_t owned_head;
+
+	bool initial;
+};
+
+#define VLE_FRONTEND_INIT(__i) { \
+	.inode = __i, \
+	.m_iter = { \
+		{ .m_llen = 0, .m_plen = 0 }, \
+		.mpage = NULL \
+	}, \
+	.builder = VLE_WORK_BUILDER_INIT(), \
+	.owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \
+	.initial = true, }
+
+static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
+				struct page *page,
+				struct list_head *page_pool)
+{
+	struct super_block *const sb = fe->inode->i_sb;
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	struct erofs_map_blocks_iter *const m = &fe->m_iter;
+	struct erofs_map_blocks *const map = &m->map;
+	struct z_erofs_vle_work_builder *const builder = &fe->builder;
+	const loff_t offset = page_offset(page);
+
+	bool tight = builder_is_followed(builder);
+	struct z_erofs_vle_work *work = builder->work;
+
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= map->m_la &&
+            offset + cur < map->m_la + map->m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (!z_erofs_vle_work_iter_end(builder))
+		fe->initial = false;
+
+	map->m_la = offset + cur;
+	map->m_llen = 0;
+	err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(map->m_plen != 1 << sbi->clusterbits);
+	BUG_ON(erofs_blkoff(map->m_pa));
+
+	err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	tight &= builder_is_followed(builder);
+	work = builder->work;
+hitted:
+	cur = end - min_t(unsigned, offset + end - map->m_la, end);
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(builder, page, page_type);
+	/* should allocate an additional staging page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *const newpage =
+			__stagingpage_alloc(page_pool, GFP_NOFS);
+
+		err = z_erofs_vle_work_add_page(builder,
+			newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - map->m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	map->m_llen = offset + cur - map->m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, map->m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handing cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool background = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (background)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+	unsigned i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		BUG_ON(page->mapping == NULL);
+
+		if (unlikely(err))
+			SetPageError(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_workgroup *grp,
+	struct list_head *page_pool)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	unsigned sparsemem_pages = 0;
+#endif
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_work *work;
+	void *vout;
+	int err;
+
+	might_sleep();
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_primary_work(grp);
+#else
+	BUG();
+#endif
+	BUG_ON(!READ_ONCE(work->nr_pages));
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc_array(nr_pages,
+			sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+
+		/* all pages in pagevec ought to be valid */
+		DBG_BUGON(page == NULL);
+		DBG_BUGON(page->mapping == NULL);
+
+		if (z_erofs_gather_if_stagingpage(page_pool, page))
+			continue;
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	compressed_pages = grp->compressed_pages;
+
+	for (i = 0; i < clusterpages; ++i) {
+		unsigned pagenr;
+
+		page = compressed_pages[i];
+
+		/* all compressed pages ought to be valid */
+		DBG_BUGON(page == NULL);
+		DBG_BUGON(page->mapping == NULL);
+
+		if (z_erofs_is_stagingpage(page))
+			continue;
+
+		/* only non-head page could be reused as a compressed page */
+		pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+
+		overlapped = true;
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) {
+		/* FIXME! this should be fixed in the future */
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs,
+		z_erofs_onlinepage_endio);
+	if (err != -ENOTSUPP)
+		goto out_percpu;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (sparsemem_pages >= nr_pages) {
+		BUG_ON(sparsemem_pages > nr_pages);
+		goto skip_allocpage;
+	}
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = __stagingpage_alloc(page_pool, GFP_NOFS);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+		DBG_BUGON(page->mapping == NULL);
+
+		/* recycle all individual staging pages */
+		if (z_erofs_gather_if_stagingpage(page_pool, page))
+			continue;
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+out_percpu:
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual staging pages */
+		(void)z_erofs_gather_if_stagingpage(page_pool, page);
+
+		WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	/* all work locks MUST be taken before the following line */
+
+	WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL);
+
+	/* all work locks SHOULD be released right now */
+	mutex_unlock(&work->lock);
+
+	z_erofs_vle_work_release(work);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	z_erofs_vle_owned_workgrp_t owned = io->head;
+
+	while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) {
+		struct z_erofs_vle_workgroup *grp;
+
+		/* it is impossible that 'owned' equals Z_EROFS_VLE_WORKGRP_TAIL */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL);
+
+		/* it is impossible that 'owned' equals NULL */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned;
+		owned = READ_ONCE(grp->next);
+
+		z_erofs_vle_unzip(sb, grp, page_pool);
+	};
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline struct z_erofs_vle_unzip_io *
+prepare_io_handler(struct super_block *sb,
+		   struct z_erofs_vle_unzip_io *io,
+		   bool background)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	if (!background) {
+		/* waitqueue available for foreground io */
+		BUG_ON(io == NULL);
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+		goto out;
+	}
+
+	if (io != NULL)
+		BUG();
+	else {
+		/* allocate extra io descriptor for background io */
+		iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+			GFP_KERNEL | __GFP_NOFAIL);
+		BUG_ON(iosb == NULL);
+
+		io = &iosb->io;
+	}
+
+	iosb->sb = sb;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+	io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED;
+	return io;
+}
+
+#define __FSIO_1 0
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_vle_owned_workgrp_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_vle_unzip_io *fg_io,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+	const gfp_t gfp = GFP_NOFS;
+	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
+	struct bio *bio;
+	tagptr1_t bi_private;
+	/* since bio will be NULL, no need to initialize last_index */
+	pgoff_t uninitialized_var(last_index);
+	bool force_submit = false;
+	unsigned nr_bios;
+
+	if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL))
+		return false;
+
+	/*
+	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
+	 * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
+	 */
+	if (force_fg) {
+		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
+	} else {
+		ios[__FSIO_1] = prepare_io_handler(sb, NULL, true);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1);
+	}
+
+	nr_bios = 0;
+	force_submit = false;
+	bio = NULL;
+
+	/* by default, all need io submission */
+	ios[__FSIO_1]->head = owned_head;
+
+	do {
+		struct z_erofs_vle_workgroup *grp;
+		struct page **compressed_pages, *oldpage, *page;
+		pgoff_t first_index;
+		unsigned i = 0;
+		int err;
+
+		/* it is impossible that 'owned_head' equals the following */
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned_head;
+
+		/* close the main owned chain at first */
+		owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
+			Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+		first_index = grp->obj.index;
+		compressed_pages = grp->compressed_pages;
+
+		force_submit |= (first_index != last_index + 1);
+repeat:
+		/* fulfill all compressed pages */
+		oldpage = page = READ_ONCE(compressed_pages[i]);
+
+		if (page != NULL)
+			BUG_ON(PageUptodate(page));
+		else {
+			page = __stagingpage_alloc(pagepool, gfp);
+
+			if (oldpage != cmpxchg(compressed_pages + i,
+				oldpage, page)) {
+				list_add(&page->lru, pagepool);
+				goto repeat;
+			}
+		}
+
+		if (bio != NULL && force_submit) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+
+		if (bio == NULL) {
+			bio = prepare_bio(sb, first_index + i,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(bi_private);
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+		if (++i < clusterpages)
+			goto repeat;
+	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	BUG_ON(!nr_bios);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct super_block *sb = f->inode->i_sb;
+	struct z_erofs_vle_unzip_io io[1 + __FSIO_1];
+
+	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
+		return;
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[__FSIO_1].u.wait,
+		!atomic_read(&io[__FSIO_1].pending_bios));
+
+	/* let's synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct inode *const inode = page->mapping->host;
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	int err;
+	LIST_HEAD(pagepool);
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	z_erofs_submit_and_unzip(&f, &pagepool, true);
+out:
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct inode *const inode = mapping->host;
+
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	z_erofs_submit_and_unzip(&f, &pagepool, sync);
+
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+/* for VLE compressed files */
+const struct address_space_operations z_erofs_vle_normal_access_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h
index 8e23e44..b276af1 100644
--- a/fs/erofs/unzip_vle.h
+++ b/fs/erofs/unzip_vle.h
@@ -14,9 +14,213 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/*
+ *  - 0x5FA110C8D ('fsallocated', Z_EROFS_MAPPING_STAGING) -
+ * used for temporarily allocated pages (via erofs_allocpage),
+ * in order to separate those from NULL mapping (e.g. truncated pages)
+ */
+#define Z_EROFS_MAPPING_STAGING		((void *)0x5FA110C8D)
+
+#define z_erofs_is_stagingpage(page)	\
+	((page)->mapping == Z_EROFS_MAPPING_STAGING)
+
+static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool,
+						 struct page *page)
+{
+	if (z_erofs_is_stagingpage(page)) {
+		list_add(&page->lru, page_pool);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;
+
+	atomic_t refcount;
+#endif
+	struct mutex lock;
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
+
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	struct z_erofs_vle_work work;
+
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
+
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+	unsigned int llen, flags;
+};
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
+
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
+
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK);
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is not implemented yet
+#else
+
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+
+#endif
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_vle_owned_workgrp_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ab37072..dd1ce5f 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
 
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+	}
+	return count;
+}
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		first_index = grp->index + 1;
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
 }
 
 #endif
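
A note on the refcounting above: the following is a single-threaded user-space
sketch (names are illustrative; locking, RCU and the radix tree are omitted)
of the erofs_workgroup_put() state machine. Dropping to a refcount of 1 means
only the workstation still holds the workgroup, so it becomes a shrink
candidate; dropping to 0 frees it:

/* Illustrative only -- not part of the patch. */
#include <stdio.h>

static long global_shrink_cnt;	/* stands in for erofs_global_shrink_cnt */

struct workgroup {
	int refcount;
};

static void workgroup_free(struct workgroup *grp)
{
	printf("workgroup %p freed\n", (void *)grp);
}

static int workgroup_put(struct workgroup *grp)
{
	int count = --grp->refcount;

	if (count == 1)
		global_shrink_cnt++;	/* only the workstation reference left */
	else if (!count) {
		global_shrink_cnt--;	/* no longer a shrink candidate */
		workgroup_free(grp);	/* the kernel defers this via RCU */
	}
	return count;
}

int main(void)
{
	struct workgroup grp = { .refcount = 2 };	/* workstation + one user */

	workgroup_put(&grp);	/* 2 -> 1: becomes reclaimable */
	printf("shrink candidates: %ld\n", global_shrink_cnt);

	workgroup_put(&grp);	/* 1 -> 0: e.g. the shrinker's final put */
	printf("shrink candidates: %ld\n", global_shrink_cnt);
	return 0;
}

This mirrors how erofs_shrink_workstation() works above: it only deletes a
workgroup from the tree when the count has fallen back to 1, then drops that
last reference itself.
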
-- 
1.9.1


Thread overview: 102+ messages
2018-06-27 14:20 [WIP] [NOMERGE] [RFC PATCH v0] erofs: introduce the new unzip subsystem Gao Xiang
2018-06-29 23:45 ` Chao Yu
2018-06-30  0:25   ` Gao Xiang
2018-06-30  9:18 ` [WIP] [NOMERGE] [RFC PATCH v0.2 1/2] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-06-30  9:18   ` [WIP] [NOMERGE] [RFC PATCH v0.2 2/2] erofs: introduce the new VLE unzip subsystem Gao Xiang
2018-06-30 15:17 ` [WIP] [NOMERGE] [RFC PATCH v0.3 1/6] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 2/6] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 3/6] erofs: introduce erofs_map_blocks_iter Gao Xiang
2018-07-01  3:56     ` Chao Yu
2018-07-01  4:17       ` Gao Xiang
2018-07-01  4:26         ` Gao Xiang
2018-07-02  1:47           ` Chao Yu
2018-07-02  2:48             ` Gao Xiang
2018-07-02  3:36               ` Chao Yu
2018-07-02  3:47                 ` Gao Xiang
2018-07-02 10:48                   ` Chao Yu
2018-07-02 11:53                     ` Gao Xiang
2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 4/6] erofs: add erofs_allocpage Gao Xiang
2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 5/6] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-06-30 15:17   ` [WIP] [NOMERGE] [RFC PATCH v0.3 6/6] erofs: introduce VLE decompression subsystem Gao Xiang
2018-07-02 14:53 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-02 14:53     ` Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 2/7] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 3/7] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 4/7] erofs: add erofs_allocpage Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 5/7] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 6/7] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-02 14:53   ` [WIP] [NOMERGE] [RFC PATCH v0.4 7/7] erofs: introduce VLE decompression subsystem Gao Xiang
2018-07-03 16:12 ` [WIP] [NOMERGE] [RFC PATCH v0.4 0/7] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-03 16:13 ` [WIP] [NOMERGE] [RFC PATCH v0.4 RESEND " Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 1/7] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 2/7] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 3/7] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 4/7] erofs: add erofs_allocpage Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 5/7] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 6/7] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-03 16:13   ` [WIP] [NOMERGE] [RFC PATCH RESEND v0.4 7/7] erofs: introduce VLE decompression subsystem Gao Xiang
2018-07-05  8:41 ` [WIP] [NOMERGE] [RFC PATCH v0.5 00/10] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-05  8:41   ` [WIP] [NOMERGE] [RFC PATCH v0.5 04/10] erofs: add erofs_allocpage Gao Xiang
2018-07-05  8:44   ` [WIP] [NOMERGE] [RFC PATCH v0.5 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 07/10] erofs: introduce superblock registration Gao Xiang
2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 08/10] erofs: introduce erofs shrinker Gao Xiang
2018-07-05  9:09       ` [WIP] [NOMERGE] [RFC PATCH v0.5 RESEND " Gao Xiang
2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 09/10] erofs: introduce workstation for decompression Gao Xiang
2018-07-05  8:44     ` [WIP] [NOMERGE] [RFC PATCH v0.5 10/10] erofs: introduce VLE decompression support Gao Xiang
2018-07-05  9:37       ` [WIP] [NOMERGE] [RFC PATCH v0.5 RESEND " Gao Xiang
2018-07-06 16:50 ` [WIP] [NOMERGE] [RFC PATCH v0.6 00/10] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 04/10] erofs: add erofs_allocpage Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 07/10] erofs: introduce superblock registration Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 08/10] erofs: introduce erofs shrinker Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 09/10] erofs: introduce workstation for decompression Gao Xiang
2018-07-06 16:50   ` [WIP] [NOMERGE] [RFC PATCH v0.6 10/10] erofs: introduce VLE decompression support Gao Xiang
2018-07-09 19:17 ` [WIP] [NOMERGE] [RFC PATCH v0.7 00/10] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 01/10] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 02/10] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 03/10] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 04/10] erofs: add erofs_allocpage Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 05/10] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 06/10] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 07/10] erofs: introduce superblock registration Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 08/10] erofs: introduce erofs shrinker Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 09/10] erofs: introduce workstation for decompression Gao Xiang
2018-07-09 19:17   ` [WIP] [NOMERGE] [RFC PATCH v0.7 10/10] erofs: introduce VLE decompression support Gao Xiang
2018-07-13 13:17     ` [PATCH 1/2] temp commit 1 Gao Xiang
2018-07-13 13:17       ` [PATCH 2/2] temp commit 2 Gao Xiang
2018-07-17 14:18 ` [RFC PATCH v1 00/11] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 02/11] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 03/11] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 04/11] erofs: add erofs_allocpage Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 05/11] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 06/11] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 07/11] erofs: introduce superblock registration Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 08/11] erofs: introduce erofs shrinker Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 09/11] erofs: introduce workstation for decompression Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 10/11] erofs: introduce VLE decompression support Gao Xiang
2018-07-17 14:18   ` [RFC PATCH v1 11/11] erofs: introduce cached decompression Gao Xiang
2018-07-20  2:52 ` [RFC PATCH v2 00/11] erofs: introduce the new unzip subsystem Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 01/11] <linux/tagptr.h>: Introduce tagged pointer Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 02/11] erofs: introduce pagevec for unzip subsystem Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 03/11] erofs: add erofs_map_blocks_iter Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 04/11] erofs: add erofs_allocpage Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 05/11] erofs: globalize prepare_bio and __submit_bio Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 06/11] erofs: add a generic z_erofs VLE decompressor Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 07/11] erofs: introduce superblock registration Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 08/11] erofs: introduce erofs shrinker Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 09/11] erofs: introduce workstation for decompression Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 10/11] erofs: introduce VLE decompression support Gao Xiang
2018-07-20 16:55     ` [RFC PATCH v3 " Gao Xiang
2018-07-20 16:55       ` [RFC PATCH v3 11/11] erofs: introduce cached decompression Gao Xiang
2018-07-20 17:29       ` [RFC PATCH v3 RESEND 10/11] erofs: introduce VLE decompression support Gao Xiang
2018-07-20  2:52   ` [RFC PATCH v2 11/11] erofs: introduce cached decompression Gao Xiang
