Linux-f2fs-devel Archive on lore.kernel.org
 help / color / Atom feed
* [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
@ 2019-10-22 17:16 Jaegeuk Kim
  2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-22 17:16 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim

This patch supports 2MB-aligned pinned files, which can guarantee no GC at all
by allocating fully valid 2MB segments.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h     |  4 +++-
 fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
 fs/f2fs/recovery.c |  2 +-
 fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
 fs/f2fs/segment.h  |  2 ++
 fs/f2fs/super.c    |  1 +
 fs/f2fs/sysfs.c    |  2 ++
 7 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ca342f4c7db1..c681f51e351b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -890,6 +890,7 @@ enum {
 	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
 	CURSEG_COLD_NODE,	/* indirect node blocks */
 	NO_CHECK_TYPE,
+	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
 };
 
 struct flush_cmd {
@@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
 
 	/* threshold for gc trials on pinned files */
 	u64 gc_pin_file_threshold;
+	struct rw_semaphore pin_sem;
 
 	/* maximum # of trials to find a victim segment for SSR and GC */
 	unsigned int max_victim_search;
@@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
 void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 					unsigned int start, unsigned int end);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
 					struct cp_control *cpc);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 29bc0a542759..f6c038e8a6a7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 	if (off_end)
 		map.m_len++;
 
-	if (f2fs_is_pinned_file(inode))
-		map.m_seg_type = CURSEG_COLD_DATA;
+	if (!map.m_len)
+		return 0;
+
+	if (f2fs_is_pinned_file(inode)) {
+		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
+					sbi->log_blocks_per_seg;
+		block_t done = 0;
+
+		if (map.m_len % sbi->blocks_per_seg)
+			len += sbi->blocks_per_seg;
 
-	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
-						F2FS_GET_BLOCK_PRE_DIO :
-						F2FS_GET_BLOCK_PRE_AIO));
+		map.m_len = sbi->blocks_per_seg;
+next_alloc:
+		mutex_lock(&sbi->gc_mutex);
+		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+		if (err && err != -ENODATA && err != -EAGAIN)
+			goto out_err;
+
+		down_write(&sbi->pin_sem);
+		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
+		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
+		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+		up_write(&sbi->pin_sem);
+
+		done += map.m_len;
+		len -= map.m_len;
+		map.m_lblk += map.m_len;
+		if (!err && len)
+			goto next_alloc;
+
+		map.m_len = done;
+	} else {
+		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+	}
+out_err:
 	if (err) {
 		pgoff_t last_off;
 
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 783773e4560d..76477f71d4ee 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
 		f2fs_put_page(page, 1);
 	}
 	if (!err)
-		f2fs_allocate_new_segments(sbi);
+		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
 	return err;
 }
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 25c750cd0272..253d72c2663c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 	up_read(&SM_I(sbi)->curseg_lock);
 }
 
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
 {
 	struct curseg_info *curseg;
 	unsigned int old_segno;
@@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
 	down_write(&SIT_I(sbi)->sentry_lock);
 
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+		if (type != NO_CHECK_TYPE && i != type)
+			continue;
+
 		curseg = CURSEG_I(sbi, i);
 		old_segno = curseg->segno;
 		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
@@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	bool put_pin_sem = false;
+
+	if (type == CURSEG_COLD_DATA) {
+		/* GC during CURSEG_COLD_DATA_PINNED allocation */
+		if (down_read_trylock(&sbi->pin_sem)) {
+			put_pin_sem = true;
+		} else {
+			type = CURSEG_WARM_DATA;
+			curseg = CURSEG_I(sbi, type);
+		}
+	} else if (type == CURSEG_COLD_DATA_PINNED) {
+		type = CURSEG_COLD_DATA;
+	}
 
 	down_read(&SM_I(sbi)->curseg_lock);
 
@@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	mutex_unlock(&curseg->curseg_mutex);
 
 	up_read(&SM_I(sbi)->curseg_lock);
+
+	if (put_pin_sem)
+		up_read(&sbi->pin_sem);
 }
 
 static void update_device_state(struct f2fs_io_info *fio)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 325781a1ae4d..a95467b202ea 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -313,6 +313,8 @@ struct sit_entry_set {
  */
 static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
 {
+	if (type == CURSEG_COLD_DATA_PINNED)
+		type = CURSEG_COLD_DATA;
 	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
 }
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f320fd11db48..c02a47ce551b 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	spin_lock_init(&sbi->dev_lock);
 
 	init_rwsem(&sbi->sb_lock);
+	init_rwsem(&sbi->pin_sem);
 }
 
 static int init_percpu_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index b558b64a4c9c..f164959e4224 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
 	if (f2fs_sb_has_casefold(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "casefold");
+	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "pin_file");
 	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
 	return len;
 }
-- 
2.19.0.605.g01d371f741-goog



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-22 17:16 [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Jaegeuk Kim
@ 2019-10-22 17:16 ` Jaegeuk Kim
  2019-10-22 17:53   ` Ju Hyung Park
                     ` (2 more replies)
  2019-10-24  8:21 ` [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Chao Yu
  2019-11-07 19:14 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
  2 siblings, 3 replies; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-22 17:16 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim

From: Chao Yu <yuchao0@huawei.com>

This patch tries to support compression in f2fs.

- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.

- In cluster metadata layout, one special flag is used to indicate whether the
cluster is a compressed one or a normal one; for a compressed cluster, the
following metadata maps the cluster to [1, 4 << n - 1] physical blocks, where
f2fs stores data including the compress header and compressed data.

- In order to eliminate write amplification during overwrite, F2FS only
supports compression on write-once files; data can be compressed only when
all logical blocks in the file are valid and the cluster compress ratio is
lower than the specified threshold.

- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext

Compress metadata layout:
                             [Dnode Structure]
             +-----------------------------------------------+
             | cluster 1 | cluster 2 | ......... | cluster N |
             +-----------------------------------------------+
             .           .                       .           .
       .                       .                .                      .
  .         Compressed Cluster       .        .        Normal Cluster            .
+----------+---------+---------+---------+  +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+  +---------+---------+---------+---------+
           .                             .
         .                                           .
       .                                                           .
      +-------------+-------------+----------+----------------------------+
      | data length | data chksum | reserved |      compressed data       |
      +-------------+-------------+----------+----------------------------+

Changelog:

20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().

20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().

20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().

20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.

20190402
- don't preallocate blocks for compressed file.

- add lz4 compress algorithm
- process multiple post read works in one workqueue
  Now f2fs supports processing post read work in multiple workqueue,
  it shows low performance due to schedule overhead of multiple
  workqueue executing orderly.

- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR

One cluster contain 4 blocks

 before overwrite   after overwrite

- VVVV		->	CVNN
- CVNN		->	VVVV

- CVNN		->	CVNN
- CVNN		->	CVVV

- CVVV		->	CVNN
- CVVV		->	CVVV

[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Documentation/filesystems/f2fs.txt |   48 ++
 fs/f2fs/Kconfig                    |    4 +
 fs/f2fs/Makefile                   |    2 +-
 fs/f2fs/compress.c                 | 1066 ++++++++++++++++++++++++++++
 fs/f2fs/data.c                     |  468 ++++++++++--
 fs/f2fs/debug.c                    |    6 +
 fs/f2fs/f2fs.h                     |  205 +++++-
 fs/f2fs/file.c                     |  124 +++-
 fs/f2fs/inode.c                    |   29 +
 fs/f2fs/namei.c                    |   47 ++
 fs/f2fs/segment.c                  |    5 +-
 fs/f2fs/segment.h                  |   12 -
 fs/f2fs/super.c                    |  115 ++-
 fs/f2fs/sysfs.c                    |    7 +
 include/linux/f2fs_fs.h            |    8 +
 include/trace/events/f2fs.h        |   99 +++
 16 files changed, 2139 insertions(+), 106 deletions(-)
 create mode 100644 fs/f2fs/compress.c

diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 29020af0cff9..d1accf665c86 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -235,6 +235,13 @@ checkpoint=%s[:%u[%]]     Set to "disable" to turn off checkpointing. Set to "en
                        hide up to all remaining free space. The actual space that
                        would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
                        This space is reclaimed once checkpoint=enable.
+compress_algorithm=%s  Control compress algorithm, currently f2fs supports "lzo"
+                       and "lz4" algorithms.
+compress_log_size=%u   Support configuring compress cluster size, the size will
+                       be 4kb * (1 << %u), 16kb is minimum size, also it's
+                       default size.
+compress_extension=%s  Support adding specified extension, so that f2fs can
+                       enable compression on those corresponding files.
 
 ================================================================================
 DEBUGFS ENTRIES
@@ -837,3 +844,44 @@ zero or random data, which is useful to the below scenario where:
  4. address = fibmap(fd, offset)
  5. open(blkdev)
  6. write(blkdev, address)
+
+Compression implementation
+--------------------------
+
+- New term named cluster is defined as basic unit of compression, file can
+be divided into multiple clusters logically. One cluster includes 4 << n
+(n >= 0) logical pages, compression size is also cluster size, each of
+cluster can be compressed or not.
+
+- In cluster metadata layout, one special flag is used to indicate whether the
+cluster is a compressed one or a normal one; for a compressed cluster, the
+following metadata maps the cluster to [1, 4 << n - 1] physical blocks, where
+f2fs stores data including the compress header and compressed data.
+
+- In order to eliminate write amplification during overwrite, F2FS only
+supports compression on write-once files; data can be compressed only when
+all logical blocks in the file are valid and the cluster compress ratio is
+lower than the specified threshold.
+
+- To enable compression on regular inode, there are three ways:
+* chattr +c file
+* chattr +c dir; touch dir/file
+* mount w/ -o compress_extension=ext; touch file.ext
+
+Compress metadata layout:
+                             [Dnode Structure]
+             +-----------------------------------------------+
+             | cluster 1 | cluster 2 | ......... | cluster N |
+             +-----------------------------------------------+
+             .           .                       .           .
+       .                       .                .                      .
+  .         Compressed Cluster       .        .        Normal Cluster            .
++----------+---------+---------+---------+  +---------+---------+---------+---------+
+|compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
++----------+---------+---------+---------+  +---------+---------+---------+---------+
+           .                             .
+         .                                           .
+       .                                                           .
+      +-------------+-------------+----------+----------------------------+
+      | data length | data chksum | reserved |      compressed data       |
+      +-------------+-------------+----------+----------------------------+
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 652fd2e2b23d..c12854c3b1a1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -6,6 +6,10 @@ config F2FS_FS
 	select CRYPTO
 	select CRYPTO_CRC32
 	select F2FS_FS_XATTR if FS_ENCRYPTION
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
 	help
 	  F2FS is based on Log-structured File System (LFS), which supports
 	  versatile "flash-friendly" features. The design has been focused on
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2aaecc63834f..a5c771a6367d 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o
 
 f2fs-y		:= dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y		+= checkpoint.o gc.o data.o node.o segment.o recovery.o
-f2fs-y		+= shrinker.o extent_cache.o sysfs.o
+f2fs-y		+= shrinker.o extent_cache.o sysfs.o compress.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
new file mode 100644
index 000000000000..f276d82a67aa
--- /dev/null
+++ b/fs/f2fs/compress.c
@@ -0,0 +1,1066 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * f2fs compress support
+ *
+ * Copyright (c) 2019 Chao Yu <chao@kernel.org>
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/writeback.h>
+#include <linux/lzo.h>
+#include <linux/lz4.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include <trace/events/f2fs.h>
+
+struct f2fs_compress_ops {
+	int (*init_compress_ctx)(struct compress_ctx *cc);
+	void (*destroy_compress_ctx)(struct compress_ctx *cc);
+	int (*compress_pages)(struct compress_ctx *cc);
+	int (*decompress_pages)(struct decompress_io_ctx *dic);
+};
+
+static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index)
+{
+	return index % cc->cluster_size;
+}
+
+static unsigned int cluster_idx(struct compress_ctx *cc, pgoff_t index)
+{
+	return index / cc->cluster_size;
+}
+
+static unsigned int start_idx_of_cluster(struct compress_ctx *cc)
+{
+	return cc->cluster_idx * cc->cluster_size;
+}
+
+bool f2fs_is_compressed_page(struct page *page)
+{
+	if (!page_private(page))
+		return false;
+	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
+		return false;
+	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
+}
+
+static void f2fs_set_compressed_page(struct page *page,
+		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
+{
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long)data);
+
+	/* i_crypto_info and iv index */
+	page->index = index;
+	page->mapping = inode->i_mapping;
+	if (r)
+		refcount_inc(r);
+}
+
+static void f2fs_put_compressed_page(struct page *page)
+{
+	set_page_private(page, (unsigned long)NULL);
+	ClearPagePrivate(page);
+	page->mapping = NULL;
+	unlock_page(page);
+	put_page(page);
+}
+
+struct page *f2fs_compress_control_page(struct page *page)
+{
+	return ((struct compress_io_ctx *)page_private(page))->rpages[0];
+}
+
+int f2fs_init_compress_ctx(struct compress_ctx *cc)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+
+	if (cc->rpages)
+		return 0;
+	cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) * cc->cluster_size,
+								GFP_KERNEL);
+	if (!cc->rpages)
+		return -ENOMEM;
+	return 0;
+}
+
+void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
+{
+	kvfree(cc->rpages);
+}
+
+int f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
+{
+	unsigned int cluster_ofs;
+
+	if (!f2fs_cluster_can_merge_page(cc, page->index))
+		return -EAGAIN;
+
+	cluster_ofs = offset_in_cluster(cc, page->index);
+	cc->rpages[cluster_ofs] = page;
+	cc->nr_rpages++;
+	cc->cluster_idx = cluster_idx(cc, page->index);
+	return 0;
+}
+
+static int lzo_init_compress_ctx(struct compress_ctx *cc)
+{
+	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+				LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (!cc->private)
+		return -ENOMEM;
+
+	cc->clen = lzo1x_worst_compress(PAGE_SIZE * cc->cluster_size);
+	return 0;
+}
+
+static void lzo_destroy_compress_ctx(struct compress_ctx *cc)
+{
+	kvfree(cc->private);
+	cc->private = NULL;
+}
+
+static int lzo_compress_pages(struct compress_ctx *cc)
+{
+	int ret;
+
+	ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
+					&cc->clen, cc->private);
+	if (ret != LZO_E_OK) {
+		printk_ratelimited("%sF2FS-fs: lzo compress failed, ret:%d\n",
+								KERN_ERR, ret);
+		return -EIO;
+	}
+	return 0;
+}
+
+static int lzo_decompress_pages(struct decompress_io_ctx *dic)
+{
+	int ret;
+
+	ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
+						dic->rbuf, &dic->rlen);
+	if (ret != LZO_E_OK) {
+		printk_ratelimited("%sF2FS-fs: lzo decompress failed, ret:%d\n",
+								KERN_ERR, ret);
+		return -EIO;
+	}
+
+	if (dic->rlen != PAGE_SIZE * dic->cluster_size) {
+		printk_ratelimited("%sF2FS-fs: lzo invalid rlen:%zu, "
+					"expected:%lu\n", KERN_ERR, dic->rlen,
+					PAGE_SIZE * dic->cluster_size);
+		return -EIO;
+	}
+	return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_lzo_ops = {
+	.init_compress_ctx	= lzo_init_compress_ctx,
+	.destroy_compress_ctx	= lzo_destroy_compress_ctx,
+	.compress_pages		= lzo_compress_pages,
+	.decompress_pages	= lzo_decompress_pages,
+};
+
+static int lz4_init_compress_ctx(struct compress_ctx *cc)
+{
+	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+				LZ4_MEM_COMPRESS, GFP_KERNEL);
+	if (!cc->private)
+		return -ENOMEM;
+
+	cc->clen = LZ4_compressBound(PAGE_SIZE * cc->cluster_size);
+	return 0;
+}
+
+static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
+{
+	kvfree(cc->private);
+	cc->private = NULL;
+}
+
+static int lz4_compress_pages(struct compress_ctx *cc)
+{
+	int len;
+
+	len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+						cc->clen, cc->private);
+	if (!len) {
+		printk_ratelimited("%sF2FS-fs: lz4 compress failed\n",
+								KERN_ERR);
+		return -EIO;
+	}
+	cc->clen = len;
+	return 0;
+}
+
+static int lz4_decompress_pages(struct decompress_io_ctx *dic)
+{
+	int ret;
+
+	ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
+						dic->clen, dic->rlen);
+	if (ret < 0) {
+		printk_ratelimited("%sF2FS-fs: lz4 decompress failed, ret:%d\n",
+								KERN_ERR, ret);
+		return -EIO;
+	}
+
+	if (ret != PAGE_SIZE * dic->cluster_size) {
+		printk_ratelimited("%sF2FS-fs: lz4 invalid rlen:%zu, "
+					"expected:%lu\n", KERN_ERR, dic->rlen,
+					PAGE_SIZE * dic->cluster_size);
+		return -EIO;
+	}
+	return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_lz4_ops = {
+	.init_compress_ctx	= lz4_init_compress_ctx,
+	.destroy_compress_ctx	= lz4_destroy_compress_ctx,
+	.compress_pages		= lz4_compress_pages,
+	.decompress_pages	= lz4_decompress_pages,
+};
+
+static void f2fs_release_cluster_pages(struct compress_ctx *cc)
+{
+	int i;
+
+	for (i = 0; i < cc->nr_rpages; i++) {
+		inode_dec_dirty_pages(cc->inode);
+		unlock_page(cc->rpages[i]);
+	}
+}
+
+static struct page *f2fs_grab_page(void)
+{
+	struct page *page;
+
+	page = alloc_pages(GFP_KERNEL, 0);
+	if (!page)
+		return NULL;
+	lock_page(page);
+	return page;
+}
+
+static int f2fs_compress_pages(struct compress_ctx *cc)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+	struct f2fs_inode_info *fi = F2FS_I(cc->inode);
+	const struct f2fs_compress_ops *cops =
+				sbi->cops[fi->i_compress_algorithm];
+	unsigned int max_len, nr_cpages;
+	int i, ret;
+
+	trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
+				cc->cluster_size, fi->i_compress_algorithm);
+
+	ret = cops->init_compress_ctx(cc);
+	if (ret)
+		goto out;
+
+	max_len = COMPRESS_HEADER_SIZE + cc->clen;
+	cc->nr_cpages = roundup(max_len, PAGE_SIZE) / PAGE_SIZE;
+
+	cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+					cc->nr_cpages, GFP_KERNEL);
+	if (!cc->cpages) {
+		ret = -ENOMEM;
+		goto destroy_compress_ctx;
+	}
+
+	for (i = 0; i < cc->nr_cpages; i++) {
+		cc->cpages[i] = f2fs_grab_page();
+		if (!cc->cpages[i]) {
+			ret = -ENOMEM;
+			goto out_free_cpages;
+		}
+	}
+
+	cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL);
+	if (!cc->rbuf) {
+		ret = -ENOMEM;
+		goto out_free_cpages;
+	}
+
+	cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL);
+	if (!cc->cbuf) {
+		ret = -ENOMEM;
+		goto out_vunmap_rbuf;
+	}
+
+	ret = cops->compress_pages(cc);
+	if (ret)
+		goto out_vunmap_cbuf;
+
+	max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE;
+
+	if (cc->clen > max_len) {
+		ret = -EAGAIN;
+		goto out_vunmap_cbuf;
+	}
+
+	cc->cbuf->clen = cpu_to_le32(cc->clen);
+	cc->cbuf->chksum = 0;
+
+	vunmap(cc->cbuf);
+	vunmap(cc->rbuf);
+
+	nr_cpages = roundup(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE) /
+								PAGE_SIZE;
+
+	for (i = nr_cpages; i < cc->nr_cpages; i++) {
+		f2fs_put_compressed_page(cc->cpages[i]);
+		cc->cpages[i] = NULL;
+	}
+
+	cc->nr_cpages = nr_cpages;
+
+	trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
+							cc->clen, ret);
+	return 0;
+out_vunmap_cbuf:
+	vunmap(cc->cbuf);
+out_vunmap_rbuf:
+	vunmap(cc->rbuf);
+out_free_cpages:
+	for (i = 0; i < cc->nr_cpages; i++)
+		f2fs_put_compressed_page(cc->cpages[i]);
+	kvfree(cc->cpages);
+	cc->cpages = NULL;
+destroy_compress_ctx:
+	cops->destroy_compress_ctx(cc);
+out:
+	trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
+							cc->clen, ret);
+	return ret;
+}
+
+void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
+{
+	struct decompress_io_ctx *dic =
+			(struct decompress_io_ctx *)page_private(page);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+	struct f2fs_inode_info *fi = F2FS_I(dic->inode);
+	const struct f2fs_compress_ops *cops =
+			sbi->cops[fi->i_compress_algorithm];
+	int ret;
+
+	dec_page_count(sbi, F2FS_RD_DATA);
+
+	if (bio->bi_status)
+		dic->err = true;
+
+	if (refcount_dec_not_one(&dic->ref))
+		return;
+
+	trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
+				dic->cluster_size, fi->i_compress_algorithm);
+
+	/* submit partial compressed pages */
+	if (dic->err) {
+		ret = dic->err;
+		goto out_free_dic;
+	}
+
+	dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
+	if (!dic->rbuf) {
+		ret = -ENOMEM;
+		goto out_free_dic;
+	}
+
+	dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL);
+	if (!dic->cbuf) {
+		ret = -ENOMEM;
+		goto out_vunmap_rbuf;
+	}
+
+	dic->clen = le32_to_cpu(dic->cbuf->clen);
+	dic->rlen = PAGE_SIZE * dic->cluster_size;
+
+	if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
+		ret = -EFAULT;
+		goto out_vunmap_cbuf;
+	}
+
+	ret = cops->decompress_pages(dic);
+
+out_vunmap_cbuf:
+	vunmap(dic->cbuf);
+out_vunmap_rbuf:
+	vunmap(dic->rbuf);
+out_free_dic:
+	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
+	f2fs_free_dic(dic);
+
+	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
+							dic->clen, ret);
+}
+
+static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
+{
+	if (cc->cluster_idx == NULL_CLUSTER)
+		return true;
+	return cc->cluster_idx == cluster_idx(cc, index);
+}
+
+bool f2fs_cluster_is_empty(struct compress_ctx *cc)
+{
+	return cc->nr_rpages == 0;
+}
+
+static bool f2fs_cluster_is_full(struct compress_ctx *cc)
+{
+	return cc->cluster_size == cc->nr_rpages;
+}
+
+bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
+{
+	if (f2fs_cluster_is_empty(cc))
+		return true;
+	if (f2fs_cluster_is_full(cc))
+		return false;
+	return is_page_in_cluster(cc, index);
+}
+
+static bool __cluster_may_compress(struct compress_ctx *cc)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+	loff_t i_size = i_size_read(cc->inode);
+	const pgoff_t end_index = ((unsigned long long)i_size)
+					>> PAGE_SHIFT;
+	unsigned offset;
+	int i;
+
+	for (i = 0; i < cc->cluster_size; i++) {
+		struct page *page = cc->rpages[i];
+
+		f2fs_bug_on(sbi, !page);
+
+		if (unlikely(f2fs_cp_error(sbi)))
+			return false;
+		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+			return false;
+		if (f2fs_is_drop_cache(cc->inode))
+			return false;
+		if (f2fs_is_volatile_file(cc->inode))
+			return false;
+
+		offset = i_size & (PAGE_SIZE - 1);
+		if ((page->index > end_index) ||
+			(page->index == end_index && !offset))
+			return false;
+	}
+	return true;
+}
+
+int f2fs_is_cluster_existed(struct compress_ctx *cc)
+{
+	struct dnode_of_data dn;
+	unsigned int start_idx = start_idx_of_cluster(cc);
+	int ret;
+	int i;
+
+	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
+		block_t blkaddr = datablock_addr(dn.inode, dn.node_page,
+							dn.ofs_in_node);
+		if (blkaddr == COMPRESS_ADDR) {
+			ret = 1;
+			break;
+		}
+		if (__is_valid_data_blkaddr(blkaddr)) {
+			ret = 2;
+			break;
+		}
+	}
+	f2fs_put_dnode(&dn);
+	return ret;
+}
+
+static bool cluster_may_compress(struct compress_ctx *cc)
+{
+	if (!f2fs_compressed_file(cc->inode))
+		return false;
+	if (!f2fs_cluster_is_full(cc))
+		return false;
+	return __cluster_may_compress(cc);
+}
+
+void f2fs_reset_compress_ctx(struct compress_ctx *cc)
+{
+	if (cc->rpages)
+		memset(cc->rpages, 0, sizeof(struct page *) * cc->cluster_size);
+	cc->nr_rpages = 0;
+	cc->nr_cpages = 0;
+	cc->cluster_idx = NULL_CLUSTER;
+}
+
+static void set_cluster_writeback(struct compress_ctx *cc)
+{
+	int i;
+
+	for (i = 0; i < cc->cluster_size; i++)
+		set_page_writeback(cc->rpages[i]);
+}
+
+static void set_cluster_dirty(struct compress_ctx *cc)
+{
+	int i;
+
+	for (i = 0; i < cc->cluster_size; i++)
+		set_page_dirty(cc->rpages[i]);
+}
+
+int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
+					struct page **pagep, pgoff_t index,
+					void **fsdata, bool prealloc)
+{
+	struct inode *inode = cc->inode;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct bio *bio = NULL;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	struct dnode_of_data dn;
+	sector_t last_block_in_bio;
+	unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
+	unsigned int start_idx = cluster_idx(cc, index) * cc->cluster_size;
+	int i, idx;
+	int ret;
+
+	ret = f2fs_init_compress_ctx(cc);
+	if (ret)
+		goto out;
+retry:
+	/* keep page reference to avoid page reclaim */
+	for (i = 0; i < cc->cluster_size; i++) {
+		page = f2fs_pagecache_get_page(mapping, start_idx + i,
+							fgp_flag, GFP_NOFS);
+		if (!page) {
+			ret = -ENOMEM;
+			goto unlock_pages;
+		}
+
+		if (PageUptodate(page)) {
+			unlock_page(page);
+			continue;
+		}
+
+		ret = f2fs_compress_ctx_add_page(cc, page);
+		f2fs_bug_on(sbi, ret);
+	}
+
+	if (!f2fs_cluster_is_empty(cc)) {
+		ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
+						&last_block_in_bio, false);
+		if (ret)
+			goto out;
+
+		if (bio)
+			f2fs_submit_bio(sbi, bio, DATA);
+
+		ret = f2fs_init_compress_ctx(cc);
+		if (ret)
+			goto out;
+	}
+
+	for (i = 0; i < cc->cluster_size; i++) {
+		page = find_lock_page(mapping, start_idx + i);
+		f2fs_bug_on(sbi, !page);
+
+		f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+		cc->rpages[i] = page;
+		f2fs_put_page(page, 0);
+
+		if (!PageUptodate(page)) {
+			for (idx = i; idx >= 0; idx--) {
+				f2fs_put_page(cc->rpages[idx], 0);
+				f2fs_put_page(cc->rpages[idx], 1);
+			}
+			goto retry;
+		}
+
+	}
+
+	if (prealloc) {
+		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+
+		for (i = cc->cluster_size - 1; i > 0; i--) {
+			ret = f2fs_get_block(&dn, start_idx + i);
+			if (ret)
+				/* TODO: release preallocate blocks */
+				goto release_pages;
+
+			if (dn.data_blkaddr != NEW_ADDR)
+				break;
+		}
+
+		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+	}
+
+	*fsdata = cc->rpages;
+	*pagep = cc->rpages[offset_in_cluster(cc, index)];
+	return 0;
+unlock_pages:
+	for (idx = 0; idx < i; idx++) {
+		if (cc->rpages[idx])
+			unlock_page(cc->rpages[idx]);
+	}
+release_pages:
+	for (idx = 0; idx < i; idx++) {
+		page = find_lock_page(mapping, start_idx + idx);
+		f2fs_put_page(page, 0);
+		f2fs_put_page(page, 1);
+	}
+	f2fs_destroy_compress_ctx(cc);
+out:
+	return ret;
+}
+
+void f2fs_compress_write_end(struct inode *inode, void *fsdata,
+							bool written)
+{
+	struct compress_ctx cc = {
+		.cluster_size = F2FS_I(inode)->i_cluster_size,
+		.rpages = fsdata,
+	};
+	int i;
+
+	if (written)
+		set_cluster_dirty(&cc);
+
+	for (i = 0; i < cc.cluster_size; i++)
+		f2fs_put_page(cc.rpages[i], 1);
+}
+
+static int f2fs_write_compressed_pages(struct compress_ctx *cc,
+					int *submitted,
+					struct writeback_control *wbc,
+					enum iostat_type io_type)
+{
+	struct inode *inode = cc->inode;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_io_info fio = {
+		.sbi = sbi,
+		.ino = cc->inode->i_ino,
+		.type = DATA,
+		.op = REQ_OP_WRITE,
+		.op_flags = wbc_to_write_flags(wbc),
+		.old_blkaddr = NEW_ADDR,
+		.page = NULL,
+		.encrypted_page = NULL,
+		.compressed_page = NULL,
+		.submitted = false,
+		.need_lock = LOCK_RETRY,
+		.io_type = io_type,
+		.io_wbc = wbc,
+		.compressed = true,
+		.encrypted = f2fs_encrypted_file(cc->inode),
+	};
+	struct dnode_of_data dn;
+	struct node_info ni;
+	struct compress_io_ctx *cic;
+	unsigned int start_idx = start_idx_of_cluster(cc);
+	unsigned int last_index = cc->cluster_size - 1;
+	loff_t psize;
+	int pre_compressed_blocks = 0;
+	int i, err;
+
+	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+
+	f2fs_lock_op(sbi);
+
+	err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+	if (err)
+		goto out_unlock_op;
+
+	psize = (cc->rpages[last_index]->index + 1) << PAGE_SHIFT;
+
+	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
+	if (err)
+		goto out_put_dnode;
+
+	fio.version = ni.version;
+
+	cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS);
+	if (!cic)
+		goto out_put_dnode;
+
+	cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
+	cic->inode = inode;
+	refcount_set(&cic->ref, 1);
+	cic->rpages = cc->rpages;
+	cic->nr_rpages = cc->cluster_size;
+
+	for (i = 0; i < cc->nr_cpages; i++) {
+		f2fs_set_compressed_page(cc->cpages[i], inode,
+					cc->rpages[i + 1]->index,
+					cic, i ? &cic->ref : NULL);
+		fio.compressed_page = cc->cpages[i];
+		if (fio.encrypted) {
+			fio.page = cc->rpages[i + 1];
+			err = f2fs_encrypt_one_page(&fio);
+			if (err)
+				goto out_destroy_crypt;
+			cc->cpages[i] = fio.encrypted_page;
+		}
+	}
+
+	set_cluster_writeback(cc);
+
+	for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
+		block_t blkaddr;
+
+		blkaddr = datablock_addr(dn.inode, dn.node_page,
+							dn.ofs_in_node);
+
+		/* cluster header */
+		if (i == 0) {
+			if (blkaddr == COMPRESS_ADDR)
+				pre_compressed_blocks++;
+			if (__is_valid_data_blkaddr(blkaddr))
+				f2fs_invalidate_blocks(sbi, blkaddr);
+			f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR);
+			continue;
+		}
+
+		if (pre_compressed_blocks && __is_valid_data_blkaddr(blkaddr))
+			pre_compressed_blocks++;
+
+		if (i > cc->nr_cpages) {
+			if (__is_valid_data_blkaddr(blkaddr)) {
+				f2fs_invalidate_blocks(sbi, blkaddr);
+				f2fs_update_data_blkaddr(&dn, NEW_ADDR);
+			}
+			continue;
+		}
+
+		f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR);
+
+		fio.page = cc->rpages[i];
+		fio.old_blkaddr = blkaddr;
+
+		if (fio.encrypted)
+			fio.encrypted_page = cc->cpages[i - 1];
+		else if (fio.compressed)
+			fio.compressed_page = cc->cpages[i - 1];
+		else
+			f2fs_bug_on(sbi, 1);
+		cc->cpages[i - 1] = NULL;
+		f2fs_outplace_write_data(&dn, &fio);
+		(*submitted)++;
+	}
+
+	if (pre_compressed_blocks) {
+		stat_sub_compr_blocks(inode,
+			cc->cluster_size - pre_compressed_blocks + 1);
+		F2FS_I(inode)->i_compressed_blocks -=
+			(cc->cluster_size - pre_compressed_blocks + 1);
+	}
+	stat_add_compr_blocks(inode, cc->cluster_size - cc->nr_cpages);
+	F2FS_I(inode)->i_compressed_blocks += cc->cluster_size - cc->nr_cpages;
+	f2fs_mark_inode_dirty_sync(inode, true);
+
+	set_inode_flag(cc->inode, FI_APPEND_WRITE);
+	if (cc->cluster_idx == 0)
+		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+
+	f2fs_put_dnode(&dn);
+	f2fs_unlock_op(sbi);
+
+	f2fs_release_cluster_pages(cc);
+
+	cc->rpages = NULL;
+
+	if (err) {
+		file_set_keep_isize(inode);
+	} else {
+		down_write(&fi->i_sem);
+		if (fi->last_disk_size < psize)
+			fi->last_disk_size = psize;
+		up_write(&fi->i_sem);
+	}
+	return 0;
+out_destroy_crypt:
+	for (i -= 1; i >= 0; i--)
+		fscrypt_finalize_bounce_page(&cc->cpages[i]);
+	for (i = 0; i < cc->nr_cpages; i++) {
+		if (!cc->cpages[i])
+			continue;
+		f2fs_put_page(cc->cpages[i], 1);
+	}
+out_put_dnode:
+	f2fs_put_dnode(&dn);
+out_unlock_op:
+	f2fs_unlock_op(sbi);
+	return -EAGAIN;
+}
+
+void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
+{
+	struct f2fs_sb_info *sbi = bio->bi_private;
+	struct compress_io_ctx *cic =
+			(struct compress_io_ctx *)page_private(page);
+	int i;
+
+	if (unlikely(bio->bi_status))
+		mapping_set_error(cic->inode->i_mapping, -EIO);
+
+	f2fs_put_compressed_page(page);
+
+	dec_page_count(sbi, F2FS_WB_DATA);
+
+	if (refcount_dec_not_one(&cic->ref))
+		return;
+
+	for (i = 0; i < cic->nr_rpages; i++) {
+		clear_cold_data(cic->rpages[i]);
+		end_page_writeback(cic->rpages[i]);
+	}
+
+	kvfree(cic->rpages);
+	kvfree(cic);
+}
+
+static int f2fs_write_raw_pages(struct compress_ctx *cc,
+					int *submitted,
+					struct writeback_control *wbc,
+					enum iostat_type io_type)
+{
+	int i, _submitted;
+	int ret, err = 0;
+
+	for (i = 0; i < cc->cluster_size; i++) {
+		if (!cc->rpages[i])
+			continue;
+		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
+						NULL, NULL, wbc, io_type);
+		if (ret) {
+			if (ret == AOP_WRITEPAGE_ACTIVATE)
+				unlock_page(cc->rpages[i]);
+			err = ret;
+			goto out_fail;
+		}
+
+		*submitted += _submitted;
+	}
+	return 0;
+out_fail:
+	/* TODO: revoke partially updated block addresses */
+	for (i += 1; i < cc->cluster_size; i++) {
+		if (!cc->rpages[i])
+			continue;
+		redirty_page_for_writepage(wbc, cc->rpages[i]);
+		unlock_page(cc->rpages[i]);
+	}
+	return err;
+}
+
+int f2fs_write_multi_pages(struct compress_ctx *cc,
+					int *submitted,
+					struct writeback_control *wbc,
+					enum iostat_type io_type)
+{
+	struct f2fs_inode_info *fi = F2FS_I(cc->inode);
+	const struct f2fs_compress_ops *cops =
+			F2FS_I_SB(cc->inode)->cops[fi->i_compress_algorithm];
+	int err = -EAGAIN;
+
+	*submitted = 0;
+
+	if (cluster_may_compress(cc)) {
+		err = f2fs_compress_pages(cc);
+		if (err) {
+			err = -EAGAIN;
+			goto write;
+		}
+		err = f2fs_write_compressed_pages(cc, submitted,
+							wbc, io_type);
+		cops->destroy_compress_ctx(cc);
+	}
+write:
+	if (err == -EAGAIN) {
+		f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted);
+		err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
+		if (f2fs_is_cluster_existed(cc) == 1) {
+			stat_sub_compr_blocks(cc->inode, *submitted);
+			F2FS_I(cc->inode)->i_compressed_blocks -= *submitted;
+			f2fs_mark_inode_dirty_sync(cc->inode, true);
+		}
+	}
+	f2fs_reset_compress_ctx(cc);
+	return err;
+}
+
+int f2fs_is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
+{
+	struct dnode_of_data dn;
+	unsigned int start_idx = cluster_idx(cc, index) * cc->cluster_size;
+	int ret, i;
+
+	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+	if (ret) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto fail;
+	}
+	if (dn.data_blkaddr == COMPRESS_ADDR) {
+		ret = CLUSTER_IS_FULL;
+		for (i = 1; i < cc->cluster_size; i++) {
+			block_t blkaddr;
+
+			blkaddr = datablock_addr(dn.inode,
+					dn.node_page, dn.ofs_in_node + i);
+			if (blkaddr == NULL_ADDR) {
+				ret = CLUSTER_HAS_SPACE;
+				break;
+			}
+		}
+	}
+fail:
+	f2fs_put_dnode(&dn);
+	return ret;
+}
+
+struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+	struct decompress_io_ctx *dic;
+	unsigned int start_idx = start_idx_of_cluster(cc);
+	int i;
+
+	dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_KERNEL);
+	if (!dic)
+		goto out;
+
+	dic->inode = cc->inode;
+	refcount_set(&dic->ref, 1);
+	dic->cluster_idx = cc->cluster_idx;
+	dic->cluster_size = cc->cluster_size;
+	dic->nr_cpages = cc->nr_cpages;
+	dic->err = false;
+
+	dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+					dic->nr_cpages, GFP_KERNEL);
+	if (!dic->cpages)
+		goto out_free;
+
+	for (i = 0; i < dic->nr_cpages; i++) {
+		struct page *page;
+
+		page = f2fs_grab_page();
+		if (!page)
+			goto out_free;
+
+		f2fs_set_compressed_page(page, cc->inode,
+					start_idx + i + 1,
+					dic, i ? &dic->ref : NULL);
+		dic->cpages[i] = page;
+	}
+
+	dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+					dic->cluster_size, GFP_KERNEL);
+	if (!dic->tpages)
+		goto out_free;
+
+	for (i = 0; i < dic->cluster_size; i++) {
+		if (cc->rpages[i])
+			continue;
+
+		dic->tpages[i] = f2fs_grab_page();
+		if (!dic->tpages[i])
+			goto out_free;
+	}
+
+	for (i = 0; i < dic->cluster_size; i++) {
+		if (dic->tpages[i])
+			continue;
+		dic->tpages[i] = cc->rpages[i];
+	}
+
+	dic->rpages = cc->rpages;
+	dic->nr_rpages = cc->cluster_size;
+
+	cc->rpages = NULL;
+	return dic;
+out_free:
+	f2fs_free_dic(dic);
+out:
+	return ERR_PTR(-ENOMEM);
+}
+
+void f2fs_free_dic(struct decompress_io_ctx *dic)
+{
+	int i;
+
+	if (dic->tpages) {
+		for (i = 0; i < dic->cluster_size; i++) {
+			if (dic->rpages[i])
+				continue;
+			unlock_page(dic->tpages[i]);
+			put_page(dic->tpages[i]);
+		}
+		kvfree(dic->tpages);
+	}
+
+	if (dic->cpages) {
+		for (i = 0; i < dic->nr_cpages; i++) {
+			if (!dic->cpages[i])
+				continue;
+			f2fs_put_compressed_page(dic->cpages[i]);
+		}
+		kvfree(dic->cpages);
+	}
+
+	kvfree(dic->rpages);
+	kvfree(dic);
+}
+
+void f2fs_set_cluster_uptodate(struct page **rpages,
+			unsigned int cluster_size, bool err, bool verity)
+{
+	int i;
+
+	for (i = 0; i < cluster_size; i++) {
+		struct page *rpage = rpages[i];
+
+		if (!rpage)
+			continue;
+
+		if (err || PageError(rpage)) {
+			ClearPageUptodate(rpage);
+			ClearPageError(rpage);
+		} else {
+			if (!verity || fsverity_verify_page(rpage))
+				SetPageUptodate(rpage);
+			else
+				SetPageError(rpage);
+		}
+		unlock_page(rpage);
+	}
+}
+
+static void f2fs_init_compress_ops(struct f2fs_sb_info *sbi)
+{
+	sbi->cops[COMPRESS_LZO] = &f2fs_lzo_ops;
+	sbi->cops[COMPRESS_LZ4] = &f2fs_lz4_ops;
+}
+
+void f2fs_init_compress_info(struct f2fs_sb_info *sbi)
+{
+	if (!f2fs_sb_has_compression(sbi))
+		return;
+
+	f2fs_init_compress_ops(sbi);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ba3bcf4c7889..bac96c3a8bc9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -41,6 +41,9 @@ static bool __is_cp_guaranteed(struct page *page)
 	if (!mapping)
 		return false;
 
+	if (f2fs_is_compressed_page(page))
+		return false;
+
 	inode = mapping->host;
 	sbi = F2FS_I_SB(inode);
 
@@ -73,19 +76,19 @@ static enum count_type __read_io_type(struct page *page)
 
 /* postprocessing steps for read bios */
 enum bio_post_read_step {
-	STEP_INITIAL = 0,
 	STEP_DECRYPT,
+	STEP_DECOMPRESS,
 	STEP_VERITY,
 };
 
 struct bio_post_read_ctx {
 	struct bio *bio;
+	struct f2fs_sb_info *sbi;
 	struct work_struct work;
-	unsigned int cur_step;
 	unsigned int enabled_steps;
 };
 
-static void __read_end_io(struct bio *bio)
+static void __read_end_io(struct bio *bio, bool compr, bool verity)
 {
 	struct page *page;
 	struct bio_vec *bv;
@@ -94,6 +97,11 @@ static void __read_end_io(struct bio *bio)
 	bio_for_each_segment_all(bv, bio, iter_all) {
 		page = bv->bv_page;
 
+		if (compr && PagePrivate(page)) {
+			f2fs_decompress_pages(bio, page, verity);
+			continue;
+		}
+
 		/* PG_error was set if any post_read step failed */
 		if (bio->bi_status || PageError(page)) {
 			ClearPageUptodate(page);
@@ -110,60 +118,67 @@ static void __read_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
+static void f2fs_decompress_bio(struct bio *bio, bool verity)
+{
+	__read_end_io(bio, true, verity);
+}
+
 static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
 
-static void decrypt_work(struct work_struct *work)
+static void decrypt_work(struct bio_post_read_ctx *ctx)
 {
-	struct bio_post_read_ctx *ctx =
-		container_of(work, struct bio_post_read_ctx, work);
-
 	fscrypt_decrypt_bio(ctx->bio);
+}
+
+static void decompress_work(struct bio_post_read_ctx *ctx, bool verity)
+{
+	f2fs_decompress_bio(ctx->bio, verity);
+}
 
-	bio_post_read_processing(ctx);
+static void verity_work(struct bio_post_read_ctx *ctx)
+{
+	fsverity_verify_bio(ctx->bio);
 }
 
-static void verity_work(struct work_struct *work)
+static void f2fs_post_read_work(struct work_struct *work)
 {
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);
 
-	fsverity_verify_bio(ctx->bio);
+	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
+		decrypt_work(ctx);
 
-	bio_post_read_processing(ctx);
+	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
+		decompress_work(ctx,
+			ctx->enabled_steps & (1 << STEP_VERITY));
+		return;
+	}
+
+	if (ctx->enabled_steps & (1 << STEP_VERITY))
+		verity_work(ctx);
+
+	__read_end_io(ctx->bio, false, false);
+}
+
+static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
+						struct work_struct *work)
+{
+	queue_work(sbi->post_read_wq, work);
 }
 
 static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
 {
-	/*
-	 * We use different work queues for decryption and for verity because
-	 * verity may require reading metadata pages that need decryption, and
-	 * we shouldn't recurse to the same workqueue.
-	 */
-	switch (++ctx->cur_step) {
-	case STEP_DECRYPT:
-		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
-			INIT_WORK(&ctx->work, decrypt_work);
-			fscrypt_enqueue_decrypt_work(&ctx->work);
-			return;
-		}
-		ctx->cur_step++;
-		/* fall-through */
-	case STEP_VERITY:
-		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
-			INIT_WORK(&ctx->work, verity_work);
-			fsverity_enqueue_verify_work(&ctx->work);
-			return;
-		}
-		ctx->cur_step++;
-		/* fall-through */
-	default:
-		__read_end_io(ctx->bio);
+	if (ctx->enabled_steps) {
+		INIT_WORK(&ctx->work, f2fs_post_read_work);
+		f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
+		return;
 	}
+	__read_end_io(ctx->bio, false, false);
 }
 
 static bool f2fs_bio_post_read_required(struct bio *bio)
 {
-	return bio->bi_private && !bio->bi_status;
+	return bio->bi_private;
 }
 
 static void f2fs_read_end_io(struct bio *bio)
@@ -177,12 +192,11 @@ static void f2fs_read_end_io(struct bio *bio)
 	if (f2fs_bio_post_read_required(bio)) {
 		struct bio_post_read_ctx *ctx = bio->bi_private;
 
-		ctx->cur_step = STEP_INITIAL;
 		bio_post_read_processing(ctx);
 		return;
 	}
 
-	__read_end_io(bio);
+	__read_end_io(bio, false, false);
 }
 
 static void f2fs_write_end_io(struct bio *bio)
@@ -213,6 +227,11 @@ static void f2fs_write_end_io(struct bio *bio)
 
 		fscrypt_finalize_bounce_page(&page);
 
+		if (f2fs_is_compressed_page(page)) {
+			f2fs_compress_write_end_io(bio, page);
+			continue;
+		}
+
 		if (unlikely(bio->bi_status)) {
 			mapping_set_error(page->mapping, -EIO);
 			if (type == F2FS_WB_CP_DATA)
@@ -357,6 +376,12 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
 	submit_bio(bio);
 }
 
+void f2fs_submit_bio(struct f2fs_sb_info *sbi,
+				struct bio *bio, enum page_type type)
+{
+	__submit_bio(sbi, bio, type);
+}
+
 static void __submit_merged_bio(struct f2fs_bio_info *io)
 {
 	struct f2fs_io_info *fio = &io->fio;
@@ -379,7 +404,6 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
 						struct page *page, nid_t ino)
 {
 	struct bio_vec *bvec;
-	struct page *target;
 	struct bvec_iter_all iter_all;
 
 	if (!bio)
@@ -389,10 +413,12 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
 		return true;
 
 	bio_for_each_segment_all(bvec, bio, iter_all) {
+		struct page *target = bvec->bv_page;
 
-		target = bvec->bv_page;
 		if (fscrypt_is_bounce_page(target))
 			target = fscrypt_pagecache_page(target);
+		if (f2fs_is_compressed_page(target))
+			target = f2fs_compress_control_page(target);
 
 		if (inode && inode == target->mapping->host)
 			return true;
@@ -727,7 +753,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 
 	verify_fio_blkaddr(fio);
 
-	bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+	if (fio->encrypted_page)
+		bio_page = fio->encrypted_page;
+	else if (fio->compressed_page)
+		bio_page = fio->compressed_page;
+	else
+		bio_page = fio->page;
 
 	/* set submitted = true as a return value */
 	fio->submitted = true;
@@ -796,7 +827,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 
 	if (f2fs_encrypted_file(inode))
 		post_read_steps |= 1 << STEP_DECRYPT;
-
+	if (f2fs_compressed_file(inode))
+		post_read_steps |= 1 << STEP_DECOMPRESS;
 	if (f2fs_need_verity(inode, first_idx))
 		post_read_steps |= 1 << STEP_VERITY;
 
@@ -807,6 +839,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 			return ERR_PTR(-ENOMEM);
 		}
 		ctx->bio = bio;
+		ctx->sbi = sbi;
 		ctx->enabled_steps = post_read_steps;
 		bio->bi_private = ctx;
 	}
@@ -1871,6 +1904,142 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
 	return ret;
 }
 
+int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+				unsigned nr_pages, sector_t *last_block_in_bio,
+				bool is_readahead)
+{
+	struct dnode_of_data dn;
+	struct inode *inode = cc->inode;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct bio *bio = *bio_ret;
+	unsigned int start_idx = cc->cluster_idx * cc->cluster_size;
+	sector_t last_block_in_file;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	struct decompress_io_ctx *dic = NULL;
+	int i;
+	int ret = 0;
+
+	f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
+
+	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+
+	/* get rid of pages beyond EOF */
+	for (i = cc->nr_rpages - 1; i >= 0; i--) {
+		struct page *page = cc->rpages[i];
+
+		if (!page)
+			continue;
+		if ((sector_t)page->index < last_block_in_file)
+			break;
+
+		zero_user_segment(page, 0, PAGE_SIZE);
+		if (!PageUptodate(page))
+			SetPageUptodate(page);
+
+		unlock_page(page);
+		cc->rpages[i] = NULL;
+		cc->nr_rpages--;
+	}
+
+	/* we are done since all pages are beyond EOF */
+	if (f2fs_cluster_is_empty(cc))
+		goto out;
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+	if (ret)
+		goto out;
+
+	f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
+
+	for (i = 1; i < cc->cluster_size; i++) {
+		block_t blkaddr;
+
+		blkaddr = datablock_addr(dn.inode, dn.node_page,
+						dn.ofs_in_node + i);
+
+		if (!__is_valid_data_blkaddr(blkaddr))
+			break;
+
+		if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) {
+			ret = -EFAULT;
+			goto out_put_dnode;
+		}
+		cc->nr_cpages++;
+	}
+
+	/* nothing to decompress */
+	if (cc->nr_cpages == 0) {
+		ret = 0;
+		goto out_put_dnode;
+	}
+
+	dic = f2fs_alloc_dic(cc);
+	if (IS_ERR(dic)) {
+		ret = PTR_ERR(dic);
+		goto out_put_dnode;
+	}
+
+	for (i = 0; i < dic->nr_cpages; i++) {
+		struct page *page = dic->cpages[i];
+		block_t blkaddr;
+
+		blkaddr = datablock_addr(dn.inode, dn.node_page,
+						dn.ofs_in_node + i + 1);
+
+		if (bio && !page_is_mergeable(sbi, bio,
+					*last_block_in_bio, blkaddr)) {
+submit_and_realloc:
+			__submit_bio(sbi, bio, DATA);
+			bio = NULL;
+		}
+
+		if (!bio) {
+			bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
+					is_readahead ? REQ_RAHEAD : 0,
+					page->index);
+			if (IS_ERR(bio)) {
+				ret = PTR_ERR(bio);
+				bio = NULL;
+				dic->err = true;
+				if (refcount_sub_and_test(dic->nr_cpages - i,
+							&dic->ref))
+					f2fs_set_cluster_uptodate(dic->rpages,
+							cc->cluster_size, true,
+							false);
+				f2fs_free_dic(dic);
+				f2fs_put_dnode(&dn);
+				f2fs_reset_compress_ctx(cc);
+				*bio_ret = bio;
+				return ret;
+			}
+		}
+
+		f2fs_wait_on_block_writeback(inode, blkaddr);
+
+		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+			goto submit_and_realloc;
+
+		inc_page_count(sbi, F2FS_RD_DATA);
+		ClearPageError(page);
+		*last_block_in_bio = blkaddr;
+	}
+
+	f2fs_put_dnode(&dn);
+
+	f2fs_reset_compress_ctx(cc);
+	*bio_ret = bio;
+	return 0;
+out_put_dnode:
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_set_cluster_uptodate(cc->rpages, cc->cluster_size, true, false);
+	f2fs_reset_compress_ctx(cc);
+	*bio_ret = bio;
+	return ret;
+}
+
 /*
  * This function was originally taken from fs/mpage.c, and customized for f2fs.
  * Major change was from block_size == page_size in f2fs by default.
@@ -1880,7 +2049,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
  * use ->readpage() or do the necessary surgery to decouple ->readpages()
  * from read-ahead.
  */
-static int f2fs_mpage_readpages(struct address_space *mapping,
+int f2fs_mpage_readpages(struct address_space *mapping,
 			struct list_head *pages, struct page *page,
 			unsigned nr_pages, bool is_readahead)
 {
@@ -1888,6 +2057,16 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 	sector_t last_block_in_bio = 0;
 	struct inode *inode = mapping->host;
 	struct f2fs_map_blocks map;
+	struct compress_ctx cc = {
+		.inode = inode,
+		.cluster_size = F2FS_I(inode)->i_cluster_size,
+		.cluster_idx = NULL_CLUSTER,
+		.rpages = NULL,
+		.cpages = NULL,
+		.nr_rpages = 0,
+		.nr_cpages = 0,
+	};
+	unsigned max_nr_pages = nr_pages;
 	int ret = 0;
 
 	map.m_pblk = 0;
@@ -1911,9 +2090,36 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 				goto next_page;
 		}
 
-		ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio,
-					&last_block_in_bio, is_readahead);
+		if (f2fs_compressed_file(inode)) {
+			/* there are remaining compressed pages, submit them */
+			if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
+				ret = f2fs_read_multi_pages(&cc, &bio,
+							max_nr_pages,
+							&last_block_in_bio,
+							is_readahead);
+				if (ret)
+					goto set_error_page;
+			}
+			ret = f2fs_is_compressed_cluster(&cc, page->index);
+			if (ret < 0)
+				goto set_error_page;
+			else if (!ret)
+				goto read_single_page;
+
+			ret = f2fs_init_compress_ctx(&cc);
+			if (ret)
+				goto set_error_page;
+
+			ret = f2fs_compress_ctx_add_page(&cc, page);
+			f2fs_bug_on(F2FS_I_SB(inode), ret);
+
+			goto next_page;
+		}
+read_single_page:
+		ret = f2fs_read_single_page(inode, page, max_nr_pages, &map,
+					&bio, &last_block_in_bio, is_readahead);
 		if (ret) {
+set_error_page:
 			SetPageError(page);
 			zero_user_segment(page, 0, PAGE_SIZE);
 			unlock_page(page);
@@ -1921,6 +2127,15 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 next_page:
 		if (pages)
 			put_page(page);
+
+		if (f2fs_compressed_file(inode)) {
+			/* last page */
+			if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc))
+				ret = f2fs_read_multi_pages(&cc, &bio,
+							max_nr_pages,
+							&last_block_in_bio,
+							is_readahead);
+		}
 	}
 	BUG_ON(pages && !list_empty(pages));
 	if (bio)
@@ -1960,22 +2175,23 @@ static int f2fs_read_data_pages(struct file *file,
 	return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true);
 }
 
-static int encrypt_one_page(struct f2fs_io_info *fio)
+int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
 {
 	struct inode *inode = fio->page->mapping->host;
-	struct page *mpage;
+	struct page *mpage, *page;
 	gfp_t gfp_flags = GFP_NOFS;
 
 	if (!f2fs_encrypted_file(inode))
 		return 0;
 
+	page = fio->compressed_page ? fio->compressed_page : fio->page;
+
 	/* wait for GCed page writeback via META_MAPPING */
 	f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
 
 retry_encrypt:
-	fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page,
-							       PAGE_SIZE, 0,
-							       gfp_flags);
+	fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
+					PAGE_SIZE, 0, gfp_flags);
 	if (IS_ERR(fio->encrypted_page)) {
 		/* flush pending IOs and wait for a while in the ENOMEM case */
 		if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
@@ -2135,7 +2351,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
 	if (ipu_force ||
 		(__is_valid_data_blkaddr(fio->old_blkaddr) &&
 					need_inplace_update(fio))) {
-		err = encrypt_one_page(fio);
+		err = f2fs_encrypt_one_page(fio);
 		if (err)
 			goto out_writepage;
 
@@ -2171,7 +2387,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
 
 	fio->version = ni.version;
 
-	err = encrypt_one_page(fio);
+	err = f2fs_encrypt_one_page(fio);
 	if (err)
 		goto out_writepage;
 
@@ -2192,7 +2408,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
 	return err;
 }
 
-static int __write_data_page(struct page *page, bool *submitted,
+int f2fs_write_single_data_page(struct page *page, int *submitted,
 				struct bio **bio,
 				sector_t *last_block,
 				struct writeback_control *wbc,
@@ -2201,7 +2417,7 @@ static int __write_data_page(struct page *page, bool *submitted,
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = ((unsigned long long) i_size)
+	const pgoff_t end_index = ((unsigned long long)i_size)
 							>> PAGE_SHIFT;
 	loff_t psize = (page->index + 1) << PAGE_SHIFT;
 	unsigned offset = 0;
@@ -2330,7 +2546,7 @@ static int __write_data_page(struct page *page, bool *submitted,
 	}
 
 	if (submitted)
-		*submitted = fio.submitted;
+		*submitted = fio.submitted ? 1 : 0;
 
 	return 0;
 
@@ -2351,7 +2567,8 @@ static int __write_data_page(struct page *page, bool *submitted,
 static int f2fs_write_data_page(struct page *page,
 					struct writeback_control *wbc)
 {
-	return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO);
+	return f2fs_write_single_data_page(page, NULL, NULL, NULL,
+						wbc, FS_DATA_IO);
 }
 
 /*
@@ -2369,6 +2586,19 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
 	struct bio *bio = NULL;
 	sector_t last_block;
+	struct inode *inode = mapping->host;
+	struct compress_ctx cc = {
+		.inode = inode,
+		.cluster_size = F2FS_I(inode)->i_cluster_size,
+		.cluster_idx = NULL_CLUSTER,
+		.rpages = NULL,
+		.nr_rpages = 0,
+		.cpages = NULL,
+		.rbuf = NULL,
+		.cbuf = NULL,
+		.rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size,
+		.private = NULL,
+	};
 	int nr_pages;
 	pgoff_t uninitialized_var(writeback_index);
 	pgoff_t index;
@@ -2378,6 +2608,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 	int range_whole = 0;
 	xa_mark_t tag;
 	int nwritten = 0;
+	int submitted = 0;
+	int i;
 
 	pagevec_init(&pvec);
 
@@ -2411,8 +2643,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
-		int i;
-
 		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
 				tag);
 		if (nr_pages == 0)
@@ -2420,7 +2650,24 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			bool submitted = false;
+			bool need_readd = false;
+
+readd:
+			if (f2fs_compressed_file(inode)) {
+				ret = f2fs_init_compress_ctx(&cc);
+				if (ret) {
+					done = 1;
+					break;
+				}
+
+				if (!f2fs_cluster_can_merge_page(&cc,
+							page->index)) {
+					need_readd = true;
+					ret = f2fs_write_multi_pages(&cc,
+						&submitted, wbc, io_type);
+					goto result;
+				}
+			}
 
 			/* give a priority to WB_SYNC threads */
 			if (atomic_read(&sbi->wb_sync_req[DATA]) &&
@@ -2430,7 +2677,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 			}
 
 			done_index = page->index;
-retry_write:
+
 			lock_page(page);
 
 			if (unlikely(page->mapping != mapping)) {
@@ -2455,44 +2702,58 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
-			ret = __write_data_page(page, &submitted, &bio,
-					&last_block, wbc, io_type);
+			if (f2fs_compressed_file(mapping->host)) {
+				ret = f2fs_compress_ctx_add_page(&cc, page);
+				f2fs_bug_on(sbi, ret);
+				continue;
+			}
+			ret = f2fs_write_single_data_page(page, &submitted,
+					&bio, &last_block, wbc, io_type);
+			if (ret == AOP_WRITEPAGE_ACTIVATE)
+				unlock_page(page);
+result:
+			nwritten += submitted;
+			wbc->nr_to_write -= submitted;
+
 			if (unlikely(ret)) {
 				/*
 				 * keep nr_to_write, since vfs uses this to
 				 * get # of written pages.
 				 */
 				if (ret == AOP_WRITEPAGE_ACTIVATE) {
-					unlock_page(page);
 					ret = 0;
-					continue;
+					goto next;
 				} else if (ret == -EAGAIN) {
 					ret = 0;
-					if (wbc->sync_mode == WB_SYNC_ALL) {
-						cond_resched();
-						congestion_wait(BLK_RW_ASYNC,
-									HZ/50);
-						goto retry_write;
-					}
-					continue;
+					goto next;
 				}
 				done_index = page->index + 1;
 				done = 1;
 				break;
-			} else if (submitted) {
-				nwritten++;
 			}
 
-			if (--wbc->nr_to_write <= 0 &&
+			if (wbc->nr_to_write <= 0 &&
 					wbc->sync_mode == WB_SYNC_NONE) {
 				done = 1;
 				break;
 			}
+next:
+			if (need_readd)
+				goto readd;
 		}
+
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 
+	/* flush remained pages in compress cluster */
+	if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) {
+		ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type);
+		nwritten += submitted;
+		wbc->nr_to_write -= submitted;
+		/* TODO: error handling */
+	}
+
 	if (!cycled && !done) {
 		cycled = 1;
 		index = 0;
@@ -2509,6 +2770,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 	if (bio)
 		f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
 
+	f2fs_destroy_compress_ctx(&cc);
+
 	return ret;
 }
 
@@ -2517,6 +2780,8 @@ static inline bool __should_serialize_io(struct inode *inode,
 {
 	if (!S_ISREG(inode->i_mode))
 		return false;
+	if (f2fs_compressed_file(inode))
+		return true;
 	if (IS_NOQUOTA(inode))
 		return false;
 	/* to avoid deadlock in path of data flush */
@@ -2659,6 +2924,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
 		__do_map_lock(sbi, flag, true);
 		locked = true;
 	}
+
 restart:
 	/* check inline_data */
 	ipage = f2fs_get_node_page(sbi, inode->i_ino);
@@ -2749,6 +3015,30 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 		if (err)
 			goto fail;
 	}
+
+	if (f2fs_compressed_file(inode)) {
+		struct compress_ctx cc = {
+			.inode = inode,
+			.cluster_size = F2FS_I(inode)->i_cluster_size,
+			.cluster_idx = NULL_CLUSTER,
+			.rpages = NULL,
+			.nr_rpages = 0,
+		};
+
+		*fsdata = NULL;
+
+		err = f2fs_is_compressed_cluster(&cc, index);
+		if (err < 0)
+			goto fail;
+		if (!err)
+			goto repeat;
+
+		err = f2fs_prepare_compress_overwrite(&cc, pagep, index, fsdata,
+						err == CLUSTER_HAS_SPACE);
+		/* need to goto fail? */
+		return err;
+	}
+
 repeat:
 	/*
 	 * Do not use grab_cache_page_write_begin() to avoid deadlock due to
@@ -2761,6 +3051,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 		goto fail;
 	}
 
+	/* TODO: cluster can be compressed due to race with .writepage */
+
 	*pagep = page;
 
 	err = prepare_write_begin(sbi, page, pos, len,
@@ -2844,6 +3136,13 @@ static int f2fs_write_end(struct file *file,
 		else
 			SetPageUptodate(page);
 	}
+
+	/* overwrite compressed file */
+	if (f2fs_compressed_file(inode) && fsdata) {
+		f2fs_compress_write_end(inode, fsdata, copied);
+		goto update_time;
+	}
+
 	if (!copied)
 		goto unlock_out;
 
@@ -2854,6 +3153,7 @@ static int f2fs_write_end(struct file *file,
 		f2fs_i_size_write(inode, pos + copied);
 unlock_out:
 	f2fs_put_page(page, 1);
+update_time:
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return copied;
 }
@@ -3318,6 +3618,26 @@ void f2fs_destroy_post_read_processing(void)
 	kmem_cache_destroy(bio_post_read_ctx_cache);
 }
 
+int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
+{
+	if (!f2fs_sb_has_encrypt(sbi) &&
+		!f2fs_sb_has_compression(sbi))
+		return 0;
+
+	sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
+						 WQ_UNBOUND | WQ_HIGHPRI,
+						 num_online_cpus());
+	if (!sbi->post_read_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
+{
+	if (sbi->post_read_wq)
+		destroy_workqueue(sbi->post_read_wq);
+}
+
 int __init f2fs_init_bio_entry_cache(void)
 {
 	bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 9b0bedd82581..498207c9dbe2 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -94,6 +94,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->inline_xattr = atomic_read(&sbi->inline_xattr);
 	si->inline_inode = atomic_read(&sbi->inline_inode);
 	si->inline_dir = atomic_read(&sbi->inline_dir);
+	si->compr_inode = atomic_read(&sbi->compr_inode);
+	si->compr_blocks = atomic_read(&sbi->compr_blocks);
 	si->append = sbi->im[APPEND_INO].ino_num;
 	si->update = sbi->im[UPDATE_INO].ino_num;
 	si->orphans = sbi->im[ORPHAN_INO].ino_num;
@@ -315,6 +317,8 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->inline_inode);
 		seq_printf(s, "  - Inline_dentry Inode: %u\n",
 			   si->inline_dir);
+		seq_printf(s, "  - Compressed Inode: %u, Blocks: %u\n",
+			   si->compr_inode, si->compr_blocks);
 		seq_printf(s, "  - Orphan/Append/Update Inode: %u, %u, %u\n",
 			   si->orphans, si->append, si->update);
 		seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
@@ -491,6 +495,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 	atomic_set(&sbi->inline_xattr, 0);
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
+	atomic_set(&sbi->compr_inode, 0);
+	atomic_set(&sbi->compr_blocks, 0);
 	atomic_set(&sbi->inplace_count, 0);
 	for (i = META_CP; i < META_MAX; i++)
 		atomic_set(&sbi->meta_count[i], 0);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c681f51e351b..775c96291490 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -116,6 +116,8 @@ typedef u32 block_t;	/*
 			 */
 typedef u32 nid_t;
 
+#define COMPRESS_EXT_NUM		16
+
 struct f2fs_mount_info {
 	unsigned int opt;
 	int write_io_size_bits;		/* Write IO size bits */
@@ -140,6 +142,12 @@ struct f2fs_mount_info {
 	block_t unusable_cap;		/* Amount of space allowed to be
 					 * unusable when disabling checkpoint
 					 */
+
+	/* For compression */
+	unsigned char compress_algorithm;	/* algorithm type */
+	unsigned compress_log_size;		/* cluster log size */
+	unsigned char compress_ext_cnt;		/* extension count */
+	unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN];	/* extensions */
 };
 
 #define F2FS_FEATURE_ENCRYPT		0x0001
@@ -155,6 +163,7 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_VERITY		0x0400
 #define F2FS_FEATURE_SB_CHKSUM		0x0800
 #define F2FS_FEATURE_CASEFOLD		0x1000
+#define F2FS_FEATURE_COMPRESSION	0x2000
 
 #define __F2FS_HAS_FEATURE(raw_super, mask)				\
 	((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -712,6 +721,12 @@ struct f2fs_inode_info {
 	int i_inline_xattr_size;	/* inline xattr size */
 	struct timespec64 i_crtime;	/* inode creation time */
 	struct timespec64 i_disk_time[4];/* inode disk times */
+
+	/* for file compress */
+	u64 i_compressed_blocks;		/* # of compressed blocks */
+	unsigned char i_compress_algorithm;	/* algorithm type */
+	unsigned char i_log_cluster_size;	/* log of cluster size */
+	unsigned int i_cluster_size;		/* cluster size */
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -1056,12 +1071,15 @@ struct f2fs_io_info {
 	block_t old_blkaddr;	/* old block address before Cow */
 	struct page *page;	/* page to be written */
 	struct page *encrypted_page;	/* encrypted page */
+	struct page *compressed_page;	/* compressed page */
 	struct list_head list;		/* serialize IOs */
 	bool submitted;		/* indicate IO submission */
 	int need_lock;		/* indicate we need to lock cp_rwsem */
 	bool in_list;		/* indicate fio is in io_list */
 	bool is_por;		/* indicate IO is from recovery or not */
 	bool retry;		/* need to reallocate block address */
+	bool compressed;	/* indicate cluster is compressed */
+	bool encrypted;		/* indicate file is encrypted */
 	enum iostat_type io_type;	/* io type */
 	struct writeback_control *io_wbc; /* writeback control */
 	struct bio **bio;		/* bio for ipu */
@@ -1169,6 +1187,18 @@ enum fsync_mode {
 	FSYNC_MODE_NOBARRIER,	/* fsync behaves nobarrier based on posix */
 };
 
+/*
+ * this value is set in page as a private data which indicate that
+ * the page is atomically written, and it is in inmem_pages list.
+ */
+#define ATOMIC_WRITTEN_PAGE		((unsigned long)-1)
+#define DUMMY_WRITTEN_PAGE		((unsigned long)-2)
+
+#define IS_ATOMIC_WRITTEN_PAGE(page)			\
+		(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+#define IS_DUMMY_WRITTEN_PAGE(page)			\
+		(page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
+
 #ifdef CONFIG_FS_ENCRYPTION
 #define DUMMY_ENCRYPTION_ENABLED(sbi) \
 			(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
@@ -1176,6 +1206,67 @@ enum fsync_mode {
 #define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
 #endif
 
+/* For compression */
+enum compress_algorithm_type {
+	COMPRESS_LZO,
+	COMPRESS_LZ4,
+	COMPRESS_MAX,
+};
+
+struct compress_data {
+	__le32 clen;
+	__le32 chksum;
+	__le32 reserved[4];
+	char cdata[];
+};
+
+#define COMPRESS_HEADER_SIZE	(sizeof(struct compress_data))
+
+struct compress_ctx {
+	struct inode *inode;
+	unsigned int cluster_size;
+	unsigned int cluster_idx;
+	struct page **rpages;
+	unsigned int nr_rpages;
+	struct page **cpages;
+	unsigned int nr_cpages;
+	void *rbuf;
+	struct compress_data *cbuf;
+	size_t rlen;
+	size_t clen;
+	void *private;
+};
+
+#define F2FS_COMPRESSED_PAGE_MAGIC	0xF5F2C000
+struct compress_io_ctx {
+	u32 magic;
+	struct inode *inode;
+	refcount_t ref;
+	struct page **rpages;
+	unsigned int nr_rpages;
+};
+
+struct decompress_io_ctx {
+	struct inode *inode;
+	refcount_t ref;
+	struct page **rpages;		/* raw pages from page cache */
+	unsigned int nr_rpages;
+	struct page **cpages;		/* pages contain compressed data */
+	unsigned int nr_cpages;
+	struct page **tpages;		/* temp pages to pad hole in cluster */
+	void *rbuf;
+	struct compress_data *cbuf;
+	size_t rlen;
+	size_t clen;
+	unsigned int cluster_idx;
+	unsigned int cluster_size;
+	bool err;
+};
+
+#define NULL_CLUSTER			((unsigned int)(~0))
+#define MIN_COMPRESS_LOG_SIZE		2
+#define MAX_COMPRESS_LOG_SIZE		8
+
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
@@ -1326,6 +1417,8 @@ struct f2fs_sb_info {
 	atomic_t inline_xattr;			/* # of inline_xattr inodes */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
+	atomic_t compr_inode;			/* # of compressed inodes */
+	atomic_t compr_blocks;			/* # of compressed blocks */
 	atomic_t aw_cnt;			/* # of atomic writes */
 	atomic_t vw_cnt;			/* # of volatile writes */
 	atomic_t max_aw_cnt;			/* max # of atomic writes */
@@ -1364,6 +1457,11 @@ struct f2fs_sb_info {
 
 	/* Precomputed FS UUID checksum for seeding other checksums */
 	__u32 s_chksum_seed;
+
+	struct workqueue_struct *post_read_wq;	/* post read workqueue */
+
+	/* For file compress */
+	const struct f2fs_compress_ops *cops[COMPRESS_MAX];
 };
 
 struct f2fs_private_dio {
@@ -2375,6 +2473,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 /*
  * On-disk inode flags (f2fs_inode::i_flags)
  */
+#define F2FS_COMPR_FL			0x00000004 /* Compress file */
+#define F2FS_NOCOMP_FL			0x00000400 /* Don't compress */
 #define F2FS_SYNC_FL			0x00000008 /* Synchronous updates */
 #define F2FS_IMMUTABLE_FL		0x00000010 /* Immutable file */
 #define F2FS_APPEND_FL			0x00000020 /* writes to file may only append */
@@ -2388,7 +2488,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 /* Flags that should be inherited by new inodes from their parent. */
 #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \
 			   F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
-			   F2FS_CASEFOLD_FL)
+			   F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL)
 
 /* Flags that are appropriate for regular files (all but dir-specific ones). */
 #define F2FS_REG_FLMASK		(~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
@@ -2440,6 +2540,7 @@ enum {
 	FI_PIN_FILE,		/* indicate file should not be gced */
 	FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
 	FI_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
+	FI_COMPRESSED_FILE,	/* indicate file's data can be compressed */
 };
 
 static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -2456,6 +2557,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
 	case FI_DATA_EXIST:
 	case FI_INLINE_DOTS:
 	case FI_PIN_FILE:
+	case FI_COMPRESSED_FILE:
 		f2fs_mark_inode_dirty_sync(inode, true);
 	}
 }
@@ -2611,16 +2713,27 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
 	return is_inode_flag_set(inode, FI_INLINE_XATTR);
 }
 
+static inline int f2fs_compressed_file(struct inode *inode)
+{
+	return S_ISREG(inode->i_mode) &&
+		is_inode_flag_set(inode, FI_COMPRESSED_FILE);
+}
+
 static inline unsigned int addrs_per_inode(struct inode *inode)
 {
 	unsigned int addrs = CUR_ADDRS_PER_INODE(inode) -
 				get_inline_xattr_addrs(inode);
-	return ALIGN_DOWN(addrs, 1);
+
+	if (!f2fs_compressed_file(inode))
+		return addrs;
+	return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size);
 }
 
 static inline unsigned int addrs_per_block(struct inode *inode)
 {
-	return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, 1);
+	if (!f2fs_compressed_file(inode))
+		return DEF_ADDRS_PER_BLOCK;
+	return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, F2FS_I(inode)->i_cluster_size);
 }
 
 static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
@@ -2780,7 +2893,8 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (!test_opt(sbi, EXTENT_CACHE) ||
-			is_inode_flag_set(inode, FI_NO_EXTENT))
+			is_inode_flag_set(inode, FI_NO_EXTENT) ||
+			is_inode_flag_set(inode, FI_COMPRESSED_FILE))
 		return false;
 
 	/*
@@ -2900,7 +3014,8 @@ static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
 
 static inline bool __is_valid_data_blkaddr(block_t blkaddr)
 {
-	if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+	if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR ||
+			blkaddr == COMPRESS_ADDR)
 		return false;
 	return true;
 }
@@ -3202,10 +3317,10 @@ void f2fs_destroy_checkpoint_caches(void);
 /*
  * data.c
  */
-int f2fs_init_post_read_processing(void);
-void f2fs_destroy_post_read_processing(void);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
+void f2fs_submit_bio(struct f2fs_sb_info *sbi,
+				struct bio *bio, enum page_type type);
 void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
 void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
 				struct inode *inode, struct page *page,
@@ -3226,6 +3341,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
 int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
 int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from);
 int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
+int f2fs_mpage_readpages(struct address_space *mapping,
+			struct list_head *pages, struct page *page,
+			unsigned nr_pages, bool is_readahead);
 struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
 			int op_flags, bool for_write);
 struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
@@ -3239,8 +3357,13 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 			int create, int flag);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			u64 start, u64 len);
+int f2fs_encrypt_one_page(struct f2fs_io_info *fio);
 bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio);
 bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio);
+int f2fs_write_single_data_page(struct page *page, int *submitted,
+				struct bio **bio, sector_t *last_block,
+				struct writeback_control *wbc,
+				enum iostat_type io_type);
 void f2fs_invalidate_page(struct page *page, unsigned int offset,
 			unsigned int length);
 int f2fs_release_page(struct page *page, gfp_t wait);
@@ -3250,6 +3373,10 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
 #endif
 bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
 void f2fs_clear_page_cache_dirty_tag(struct page *page);
+int f2fs_init_post_read_processing(void);
+void f2fs_destroy_post_read_processing(void);
+int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi);
+void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi);
 
 /*
  * gc.c
@@ -3296,6 +3423,7 @@ struct f2fs_stat_info {
 	int nr_discard_cmd;
 	unsigned int undiscard_blks;
 	int inline_xattr, inline_inode, inline_dir, append, update, orphans;
+	int compr_inode, compr_blocks;
 	int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
 	unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
 	unsigned int bimodal, avg_vblocks;
@@ -3366,6 +3494,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 		if (f2fs_has_inline_dentry(inode))			\
 			(atomic_dec(&F2FS_I_SB(inode)->inline_dir));	\
 	} while (0)
+#define stat_inc_compr_inode(inode)					\
+	do {								\
+		if (f2fs_compressed_file(inode))			\
+			(atomic_inc(&F2FS_I_SB(inode)->compr_inode));	\
+	} while (0)
+#define stat_dec_compr_inode(inode)					\
+	do {								\
+		if (f2fs_compressed_file(inode))			\
+			(atomic_dec(&F2FS_I_SB(inode)->compr_inode));	\
+	} while (0)
+#define stat_add_compr_blocks(inode, blocks)				\
+		(atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks))
+#define stat_sub_compr_blocks(inode, blocks)				\
+		(atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks))
 #define stat_inc_meta_count(sbi, blkaddr)				\
 	do {								\
 		if (blkaddr < SIT_I(sbi)->sit_base_addr)		\
@@ -3460,8 +3602,12 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_inline_inode(inode)			do { } while (0)
 #define stat_inc_inline_dir(inode)			do { } while (0)
 #define stat_dec_inline_dir(inode)			do { } while (0)
+#define stat_inc_compr_inode(inode)			do { } while (0)
+#define stat_dec_compr_inode(inode)			do { } while (0)
 #define stat_inc_atomic_write(inode)			do { } while (0)
 #define stat_dec_atomic_write(inode)			do { } while (0)
+#define stat_add_compr_blocks(inode, blocks)		do { } while (0)
+#define stat_sub_compr_blocks(inode, blocks)		do { } while (0)
 #define stat_update_max_atomic_write(inode)		do { } while (0)
 #define stat_inc_volatile_write(inode)			do { } while (0)
 #define stat_dec_volatile_write(inode)			do { } while (0)
@@ -3599,9 +3745,42 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
  */
 static inline bool f2fs_post_read_required(struct inode *inode)
 {
-	return f2fs_encrypted_file(inode) || fsverity_active(inode);
+	return f2fs_encrypted_file(inode) || fsverity_active(inode) ||
+		f2fs_compressed_file(inode);
 }
 
+/*
+ * compress.c
+ */
+bool f2fs_is_compressed_page(struct page *page);
+struct page *f2fs_compress_control_page(struct page *page);
+void f2fs_reset_compress_ctx(struct compress_ctx *cc);
+int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
+					struct page **page_ret, pgoff_t index,
+					void **fsdata, bool prealloc);
+void f2fs_compress_write_end(struct inode *inode, void *fsdata,
+							bool written);
+void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
+void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
+bool f2fs_cluster_is_empty(struct compress_ctx *cc);
+bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
+int f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
+int f2fs_write_multi_pages(struct compress_ctx *cc,
+						int *submitted,
+						struct writeback_control *wbc,
+						enum iostat_type io_type);
+int f2fs_is_compressed_cluster(struct compress_ctx *cc, pgoff_t index);
+int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+				unsigned nr_pages, sector_t *last_block_in_bio,
+				bool is_readahead);
+struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
+void f2fs_free_dic(struct decompress_io_ctx *dic);
+void f2fs_set_cluster_uptodate(struct page **rpages,
+			unsigned int cluster_size, bool err, bool verity);
+int f2fs_init_compress_ctx(struct compress_ctx *cc);
+void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
+void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
+
 #define F2FS_FEATURE_FUNCS(name, flagname) \
 static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
 { \
@@ -3620,6 +3799,7 @@ F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
 F2FS_FEATURE_FUNCS(verity, VERITY);
 F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
 F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
+F2FS_FEATURE_FUNCS(compression, COMPRESSION);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
@@ -3701,6 +3881,15 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
 #endif
 }
 
+static inline bool f2fs_may_compress(struct inode *inode)
+{
+	if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
+				f2fs_is_atomic_file(inode) ||
+				f2fs_is_volatile_file(inode))
+		return false;
+	return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
+}
+
 static inline int block_unaligned_IO(struct inode *inode,
 				struct kiocb *iocb, struct iov_iter *iter)
 {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f6c038e8a6a7..8a92e8fd648c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -518,6 +518,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 	int nr_free = 0, ofs = dn->ofs_in_node, len = count;
 	__le32 *addr;
 	int base = 0;
+	bool compressed_cluster = false;
+	int cluster_index = 0, valid_blocks = 0;
+	int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
 
 	if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
 		base = get_extra_isize(dn->inode);
@@ -525,26 +528,51 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 	raw_node = F2FS_NODE(dn->node_page);
 	addr = blkaddr_in_node(raw_node) + base + ofs;
 
-	for (; count > 0; count--, addr++, dn->ofs_in_node++) {
+	/* Assumption: truncation starts with cluster */
+	for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
 		block_t blkaddr = le32_to_cpu(*addr);
 
+		if (f2fs_compressed_file(dn->inode) &&
+					!(cluster_index % cluster_size)) {
+			if (compressed_cluster) {
+				int compr_blocks = cluster_size - valid_blocks;
+
+				stat_sub_compr_blocks(dn->inode, compr_blocks);
+				F2FS_I(dn->inode)->i_compressed_blocks -=
+								compr_blocks;
+			}
+			compressed_cluster = (blkaddr == COMPRESS_ADDR);
+			valid_blocks = 0;
+		}
+
 		if (blkaddr == NULL_ADDR)
 			continue;
 
 		dn->data_blkaddr = NULL_ADDR;
 		f2fs_set_data_blkaddr(dn);
 
-		if (__is_valid_data_blkaddr(blkaddr) &&
-			!f2fs_is_valid_blkaddr(sbi, blkaddr,
+		if (__is_valid_data_blkaddr(blkaddr)) {
+			if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
 					DATA_GENERIC_ENHANCE))
-			continue;
+				continue;
+			if (compressed_cluster)
+				valid_blocks++;
+		}
 
-		f2fs_invalidate_blocks(sbi, blkaddr);
 		if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
 			clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
+
+		f2fs_invalidate_blocks(sbi, blkaddr);
 		nr_free++;
 	}
 
+	if (compressed_cluster) {
+		int compr_blocks = cluster_size - valid_blocks;
+
+		stat_sub_compr_blocks(dn->inode, compr_blocks);
+		F2FS_I(dn->inode)->i_compressed_blocks -= compr_blocks;
+	}
+
 	if (nr_free) {
 		pgoff_t fofs;
 		/*
@@ -587,6 +615,9 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 		return 0;
 	}
 
+	if (f2fs_compressed_file(inode))
+		return 0;
+
 	page = f2fs_get_lock_data_page(inode, index, true);
 	if (IS_ERR(page))
 		return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
@@ -602,7 +633,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 	return 0;
 }
 
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
+static int do_truncate_blocks(struct inode *inode, u64 from, bool lock)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dnode_of_data dn;
@@ -667,6 +698,24 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
 	return err;
 }
 
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
+{
+	u64 free_from = from;
+
+	/*
+	 * for compressed file, only support cluster size
+	 * aligned truncation.
+	 */
+	if (f2fs_compressed_file(inode)) {
+		size_t cluster_size = F2FS_I(inode)->i_cluster_size * PAGE_SIZE;
+
+		free_from = (from + cluster_size - 1) /
+					cluster_size * cluster_size;
+	}
+
+	return do_truncate_blocks(inode, free_from, lock);
+}
+
 int f2fs_truncate(struct inode *inode)
 {
 	int err;
@@ -1023,8 +1072,8 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr,
 	} else if (ret == -ENOENT) {
 		if (dn.max_level == 0)
 			return -ENOENT;
-		done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - dn.ofs_in_node,
-									len);
+		done = min((pgoff_t)ADDRS_PER_BLOCK(inode) -
+						dn.ofs_in_node, len);
 		blkaddr += done;
 		do_replace += done;
 		goto next;
@@ -1624,6 +1673,11 @@ static long f2fs_fallocate(struct file *file, int mode,
 		(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
 		return -EOPNOTSUPP;
 
+	if (f2fs_compressed_file(inode) &&
+		(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+			FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)))
+		return -EOPNOTSUPP;
+
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 			FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
 			FALLOC_FL_INSERT_RANGE))
@@ -1713,7 +1767,44 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 			return -ENOTEMPTY;
 	}
 
+	if (iflags & (FS_COMPR_FL | FS_NOCOMP_FL)) {
+		if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+			return -EOPNOTSUPP;
+		if ((iflags & FS_COMPR_FL) && (iflags & FS_NOCOMP_FL))
+			return -EINVAL;
+	}
+
+	if ((iflags ^ fi->i_flags) & FS_COMPR_FL) {
+		if (S_ISREG(inode->i_mode) &&
+			(fi->i_flags & FS_COMPR_FL || i_size_read(inode) ||
+						F2FS_HAS_BLOCKS(inode)))
+			return -EINVAL;
+		if (iflags & FS_NOCOMP_FL)
+			return -EINVAL;
+		if (S_ISREG(inode->i_mode))
+			clear_inode_flag(inode, FI_INLINE_DATA);
+	}
+	if ((iflags ^ fi->i_flags) & FS_NOCOMP_FL) {
+		if (fi->i_flags & FS_COMPR_FL)
+			return -EINVAL;
+	}
+
 	fi->i_flags = iflags | (fi->i_flags & ~mask);
+	f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & FS_COMPR_FL) &&
+					(fi->i_flags & FS_NOCOMP_FL));
+
+	if (fi->i_flags & FS_COMPR_FL) {
+		int err = f2fs_convert_inline_inode(inode);
+
+		if (err)
+			return err;
+
+		if (!f2fs_may_compress(inode))
+			return -EINVAL;
+
+		set_inode_flag(inode, FI_COMPRESSED_FILE);
+		stat_inc_compr_inode(inode);
+	}
 
 	if (fi->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
@@ -1748,9 +1839,13 @@ static const struct {
 	{ F2FS_DIRSYNC_FL,	FS_DIRSYNC_FL },
 	{ F2FS_PROJINHERIT_FL,	FS_PROJINHERIT_FL },
 	{ F2FS_CASEFOLD_FL,	FS_CASEFOLD_FL },
+	{ F2FS_COMPR_FL,	FS_COMPR_FL },
+	{ F2FS_NOCOMP_FL,	FS_NOCOMP_FL },
 };
 
 #define F2FS_GETTABLE_FS_FL (		\
+		FS_COMPR_FL |		\
+		FS_NOCOMP_FL |		\
 		FS_SYNC_FL |		\
 		FS_IMMUTABLE_FL |	\
 		FS_APPEND_FL |		\
@@ -1766,6 +1861,8 @@ static const struct {
 		FS_CASEFOLD_FL)
 
 #define F2FS_SETTABLE_FS_FL (		\
+		FS_COMPR_FL |		\
+		FS_NOCOMP_FL |		\
 		FS_SYNC_FL |		\
 		FS_IMMUTABLE_FL |	\
 		FS_APPEND_FL |		\
@@ -3091,6 +3188,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 		ret = -EAGAIN;
 		goto out;
 	}
+
+	if (f2fs_compressed_file(inode)) {
+		if (F2FS_HAS_BLOCKS(inode) || i_size_read(inode)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+		F2FS_I(inode)->i_flags &= ~FS_COMPR_FL;
+		clear_inode_flag(inode, FI_COMPRESSED_FILE);
+		stat_dec_compr_inode(inode);
+	}
+
 	ret = f2fs_convert_inline_inode(inode);
 	if (ret)
 		goto out;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 386ad54c13c3..e84ef90ffdee 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -407,6 +407,20 @@ static int do_read_inode(struct inode *inode)
 		fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
 	}
 
+	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi)) {
+		if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
+					i_log_cluster_size)) {
+			fi->i_compressed_blocks =
+					le64_to_cpu(ri->i_compressed_blocks);
+			fi->i_compress_algorithm = ri->i_compress_algorithm;
+			fi->i_log_cluster_size = ri->i_log_cluster_size;
+			fi->i_cluster_size = 1 << fi->i_log_cluster_size;
+		}
+
+		if ((fi->i_flags & FS_COMPR_FL) && f2fs_may_compress(inode))
+			set_inode_flag(inode, FI_COMPRESSED_FILE);
+	}
+
 	F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
 	F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
 	F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
@@ -416,6 +430,8 @@ static int do_read_inode(struct inode *inode)
 	stat_inc_inline_xattr(inode);
 	stat_inc_inline_inode(inode);
 	stat_inc_inline_dir(inode);
+	stat_inc_compr_inode(inode);
+	stat_add_compr_blocks(inode, F2FS_I(inode)->i_compressed_blocks);
 
 	return 0;
 }
@@ -569,6 +585,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
 			ri->i_crtime_nsec =
 				cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec);
 		}
+
+		if (f2fs_sb_has_compression(F2FS_I_SB(inode)) &&
+			F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
+							i_log_cluster_size)) {
+			ri->i_compressed_blocks =
+				cpu_to_le64(F2FS_I(inode)->i_compressed_blocks);
+			ri->i_compress_algorithm =
+				F2FS_I(inode)->i_compress_algorithm;
+			ri->i_log_cluster_size =
+				F2FS_I(inode)->i_log_cluster_size;
+		}
 	}
 
 	__set_inode_rdev(inode, ri);
@@ -711,6 +738,8 @@ void f2fs_evict_inode(struct inode *inode)
 	stat_dec_inline_xattr(inode);
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
+	stat_dec_compr_inode(inode);
+	stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compressed_blocks);
 
 	if (likely(!f2fs_cp_error(sbi) &&
 				!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 4faf06e8bf89..9f37e95c4a4b 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -119,6 +119,20 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
 
+	if (f2fs_sb_has_compression(sbi)) {
+		F2FS_I(inode)->i_compress_algorithm =
+				F2FS_OPTION(sbi).compress_algorithm;
+		F2FS_I(inode)->i_log_cluster_size =
+				F2FS_OPTION(sbi).compress_log_size;
+		F2FS_I(inode)->i_cluster_size =
+				1 << F2FS_I(inode)->i_log_cluster_size;
+
+		/* Inherit the compression flag in directory */
+		if ((F2FS_I(inode)->i_flags & FS_COMPR_FL) &&
+					f2fs_may_compress(inode))
+			set_inode_flag(inode, FI_COMPRESSED_FILE);
+	}
+
 	f2fs_set_inode_flags(inode);
 
 	trace_f2fs_new_inode(inode, 0);
@@ -149,6 +163,9 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
 	size_t sublen = strlen(sub);
 	int i;
 
+	if (sublen == 1 && *sub == '*')
+		return 1;
+
 	/*
 	 * filename format of multimedia file should be defined as:
 	 * "filename + '.' + extension + (optional: '.' + temp extension)".
@@ -262,6 +279,33 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
 	return 0;
 }
 
+static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
+						const unsigned char *name)
+{
+	unsigned char (*ext)[F2FS_EXTENSION_LEN];
+	unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+	int i, cold_count, hot_count;
+
+	if (!f2fs_sb_has_compression(sbi) ||
+			is_inode_flag_set(inode, FI_COMPRESSED_FILE) ||
+			F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
+			!f2fs_may_compress(inode))
+		return;
+
+	ext = F2FS_OPTION(sbi).extensions;
+
+	cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+	hot_count = sbi->raw_super->hot_ext_count;
+
+	for (i = 0; i < ext_cnt; i++) {
+		if (is_extension_exist(name, ext[i]) && !file_is_hot(inode)) {
+			F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
+			set_inode_flag(inode, FI_COMPRESSED_FILE);
+			return;
+		}
+	}
+}
+
 static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 						bool excl)
 {
@@ -286,6 +330,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
 		set_file_temperature(sbi, inode, dentry->d_name.name);
 
+	set_compress_inode(sbi, inode, dentry->d_name.name);
+
 	inode->i_op = &f2fs_file_inode_operations;
 	inode->i_fop = &f2fs_file_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -297,6 +343,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		goto out;
 	f2fs_unlock_op(sbi);
 
+	stat_inc_compr_inode(inode);
 	f2fs_alloc_nid_done(sbi, ino);
 
 	d_instantiate_new(dentry, inode);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 253d72c2663c..0faa97876f81 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2214,7 +2214,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 	struct sit_info *sit_i = SIT_I(sbi);
 
 	f2fs_bug_on(sbi, addr == NULL_ADDR);
-	if (addr == NEW_ADDR)
+	if (addr == NEW_ADDR || addr == COMPRESS_ADDR)
 		return;
 
 	invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
@@ -3021,7 +3021,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
 	if (fio->type == DATA) {
 		struct inode *inode = fio->page->mapping->host;
 
-		if (is_cold_data(fio->page) || file_is_cold(inode))
+		if (is_cold_data(fio->page) || file_is_cold(inode) ||
+				f2fs_compressed_file(inode))
 			return CURSEG_COLD_DATA;
 		if (file_is_hot(inode) ||
 				is_inode_flag_set(inode, FI_HOT_DATA) ||
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index a95467b202ea..a1b3951367cd 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -200,18 +200,6 @@ struct segment_allocation {
 	void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
 };
 
-/*
- * this value is set in page as a private data which indicate that
- * the page is atomically written, and it is in inmem_pages list.
- */
-#define ATOMIC_WRITTEN_PAGE		((unsigned long)-1)
-#define DUMMY_WRITTEN_PAGE		((unsigned long)-2)
-
-#define IS_ATOMIC_WRITTEN_PAGE(page)			\
-		(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
-#define IS_DUMMY_WRITTEN_PAGE(page)			\
-		(page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
-
 #define MAX_SKIP_GC_COUNT			16
 
 struct inmem_pages {
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c02a47ce551b..de516b7a147a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -141,6 +141,9 @@ enum {
 	Opt_checkpoint_disable_cap,
 	Opt_checkpoint_disable_cap_perc,
 	Opt_checkpoint_enable,
+	Opt_compress_algorithm,
+	Opt_compress_log_size,
+	Opt_compress_extension,
 	Opt_err,
 };
 
@@ -203,6 +206,9 @@ static match_table_t f2fs_tokens = {
 	{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
 	{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
 	{Opt_checkpoint_enable, "checkpoint=enable"},
+	{Opt_compress_algorithm, "compress_algorithm=%s"},
+	{Opt_compress_log_size, "compress_log_size=%u"},
+	{Opt_compress_extension, "compress_extension=%s"},
 	{Opt_err, NULL},
 };
 
@@ -391,8 +397,9 @@ static int parse_options(struct super_block *sb, char *options)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	substring_t args[MAX_OPT_ARGS];
+	unsigned char (*ext)[F2FS_EXTENSION_LEN];
 	char *p, *name;
-	int arg = 0;
+	int arg = 0, ext_cnt;
 	kuid_t uid;
 	kgid_t gid;
 #ifdef CONFIG_QUOTA
@@ -810,6 +817,66 @@ static int parse_options(struct super_block *sb, char *options)
 		case Opt_checkpoint_enable:
 			clear_opt(sbi, DISABLE_CHECKPOINT);
 			break;
+		case Opt_compress_algorithm:
+			if (!f2fs_sb_has_compression(sbi)) {
+				f2fs_err(sbi, "Compression feature is off");
+				return -EINVAL;
+			}
+			name = match_strdup(&args[0]);
+			if (!name)
+				return -ENOMEM;
+			if (strlen(name) == 3 && !strncmp(name, "lzo", 3)) {
+				F2FS_OPTION(sbi).compress_algorithm =
+								COMPRESS_LZO;
+			} else if (strlen(name) == 3 &&
+					!strncmp(name, "lz4", 3)) {
+				F2FS_OPTION(sbi).compress_algorithm =
+								COMPRESS_LZ4;
+			} else {
+				kvfree(name);
+				return -EINVAL;
+			}
+			kvfree(name);
+			break;
+		case Opt_compress_log_size:
+			if (!f2fs_sb_has_compression(sbi)) {
+				f2fs_err(sbi, "Compression feature is off");
+				return -EINVAL;
+			}
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			if (arg < MIN_COMPRESS_LOG_SIZE ||
+				arg > MAX_COMPRESS_LOG_SIZE) {
+				f2fs_err(sbi,
+					"Compress cluster log size is out of range");
+				return -EINVAL;
+			}
+			F2FS_OPTION(sbi).compress_log_size = arg;
+			break;
+		case Opt_compress_extension:
+			if (!f2fs_sb_has_compression(sbi)) {
+				f2fs_err(sbi, "Compression feature is off");
+				return -EINVAL;
+			}
+			name = match_strdup(&args[0]);
+			if (!name)
+				return -ENOMEM;
+
+			ext = F2FS_OPTION(sbi).extensions;
+			ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+
+			if (strlen(name) >= F2FS_EXTENSION_LEN ||
+				ext_cnt >= COMPRESS_EXT_NUM) {
+				f2fs_err(sbi,
+					"invalid extension length/number");
+				kvfree(name);
+				return -EINVAL;
+			}
+
+			strcpy(ext[ext_cnt], name);
+			F2FS_OPTION(sbi).compress_ext_cnt++;
+			kvfree(name);
+			break;
 		default:
 			f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
 				 p);
@@ -1125,6 +1192,8 @@ static void f2fs_put_super(struct super_block *sb)
 	f2fs_destroy_node_manager(sbi);
 	f2fs_destroy_segment_manager(sbi);
 
+	f2fs_destroy_post_read_wq(sbi);
+
 	kvfree(sbi->ckpt);
 
 	f2fs_unregister_sysfs(sbi);
@@ -1332,6 +1401,35 @@ static inline void f2fs_show_quota_options(struct seq_file *seq,
 #endif
 }
 
+static inline void f2fs_show_compress_options(struct seq_file *seq,
+							struct super_block *sb)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	char *algtype = "";
+	int i;
+
+	if (!f2fs_sb_has_compression(sbi))
+		return;
+
+	switch (F2FS_OPTION(sbi).compress_algorithm) {
+	case COMPRESS_LZO:
+		algtype = "lzo";
+		break;
+	case COMPRESS_LZ4:
+		algtype = "lz4";
+		break;
+	}
+	seq_printf(seq, ",compress_algorithm=%s", algtype);
+
+	seq_printf(seq, ",compress_log_size=%u",
+			F2FS_OPTION(sbi).compress_log_size);
+
+	for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) {
+		seq_printf(seq, ",compress_extension=%s",
+			F2FS_OPTION(sbi).extensions[i]);
+	}
+}
+
 static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
@@ -1454,6 +1552,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",fsync_mode=%s", "strict");
 	else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER)
 		seq_printf(seq, ",fsync_mode=%s", "nobarrier");
+
+	f2fs_show_compress_options(seq, sbi->sb);
 	return 0;
 }
 
@@ -1468,6 +1568,9 @@ static void default_options(struct f2fs_sb_info *sbi)
 	F2FS_OPTION(sbi).test_dummy_encryption = false;
 	F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
 	F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
+	F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO;
+	F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
+	F2FS_OPTION(sbi).compress_ext_cnt = 0;
 
 	set_opt(sbi, BG_GC);
 	set_opt(sbi, INLINE_XATTR);
@@ -3397,6 +3500,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_devices;
 	}
 
+	err = f2fs_init_post_read_wq(sbi);
+	if (err) {
+		f2fs_err(sbi, "Failed to initialize post read workqueue");
+		goto free_devices;
+	}
+
+	/* compression initialization */
+	f2fs_init_compress_info(sbi);
+
 	sbi->total_valid_node_count =
 				le32_to_cpu(sbi->ckpt->valid_node_count);
 	percpu_counter_set(&sbi->total_valid_inode_count,
@@ -3618,6 +3730,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	f2fs_destroy_node_manager(sbi);
 free_sm:
 	f2fs_destroy_segment_manager(sbi);
+	f2fs_destroy_post_read_wq(sbi);
 free_devices:
 	destroy_device_list(sbi);
 	kvfree(sbi->ckpt);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index f164959e4224..612a2b16d55c 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -154,6 +154,9 @@ static ssize_t features_show(struct f2fs_attr *a,
 	if (f2fs_sb_has_casefold(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "casefold");
+	if (f2fs_sb_has_compression(sbi))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "compression");
 	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "pin_file");
 	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
@@ -389,6 +392,7 @@ enum feat_id {
 	FEAT_VERITY,
 	FEAT_SB_CHECKSUM,
 	FEAT_CASEFOLD,
+	FEAT_COMPRESSION,
 };
 
 static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -408,6 +412,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
 	case FEAT_VERITY:
 	case FEAT_SB_CHECKSUM:
 	case FEAT_CASEFOLD:
+	case FEAT_COMPRESSION:
 		return snprintf(buf, PAGE_SIZE, "supported\n");
 	}
 	return 0;
@@ -502,6 +507,7 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
 #endif
 F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
 F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -571,6 +577,7 @@ static struct attribute *f2fs_feat_attrs[] = {
 #endif
 	ATTR_LIST(sb_checksum),
 	ATTR_LIST(casefold),
+	ATTR_LIST(compression),
 	NULL,
 };
 ATTRIBUTE_GROUPS(f2fs_feat);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 284738996028..a9e7e37fcb8a 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -21,8 +21,12 @@
 #define F2FS_EXTENSION_LEN		8	/* max size of extension */
 #define F2FS_BLK_ALIGN(x)	(((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS)
 
+#define CLUSTER_IS_FULL			1
+#define	CLUSTER_HAS_SPACE		2
+
 #define NULL_ADDR		((block_t)0)	/* used as block_t addresses */
 #define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
+#define COMPRESS_ADDR		((block_t)-2)	/* used as compressed data flag */
 
 #define F2FS_BYTES_TO_BLK(bytes)	((bytes) >> F2FS_BLKSIZE_BITS)
 #define F2FS_BLK_TO_BYTES(blk)		((blk) << F2FS_BLKSIZE_BITS)
@@ -271,6 +275,10 @@ struct f2fs_inode {
 			__le32 i_inode_checksum;/* inode meta checksum */
 			__le64 i_crtime;	/* creation time */
 			__le32 i_crtime_nsec;	/* creation time in nano scale */
+			__le64 i_compressed_blocks;	/* # of compressed blocks */
+			__u8 i_compress_algorithm;	/* compress algorithm */
+			__u8 i_log_cluster_size;	/* log of cluster size */
+			__le16 i_padding;		/* padding */
 			__le32 i_extra_end[0];	/* for attribute size calculation */
 		} __packed;
 		__le32 i_addr[DEF_ADDRS_PER_INODE];	/* Pointers to data blocks */
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 1796ff99c3e9..cb51ea00dbc7 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -148,6 +148,11 @@ TRACE_DEFINE_ENUM(CP_TRIMMED);
 		{ F2FS_GOING_DOWN_METAFLUSH,	"meta flush" },		\
 		{ F2FS_GOING_DOWN_NEED_FSCK,	"need fsck" })
 
+#define show_compress_algorithm(type)					\
+	__print_symbolic(type,						\
+		{ COMPRESS_LZO,		"LZO" },			\
+		{ COMPRESS_LZ4,		"LZ4" })
+
 struct f2fs_sb_info;
 struct f2fs_io_info;
 struct extent_info;
@@ -1710,6 +1715,100 @@ TRACE_EVENT(f2fs_shutdown,
 		__entry->ret)
 );
 
+DECLARE_EVENT_CLASS(f2fs_zip_start,
+
+	TP_PROTO(struct inode *inode, unsigned int cluster_idx,
+			unsigned int cluster_size, unsigned char algtype),
+
+	TP_ARGS(inode, cluster_idx, cluster_size, algtype),
+
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(ino_t,	ino)
+		__field(unsigned int, idx)
+		__field(unsigned int, size)
+		__field(unsigned int, algtype)
+	),
+
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->idx = cluster_idx;
+		__entry->size = cluster_size;
+		__entry->algtype = algtype;
+	),
+
+	TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%u, "
+		"cluster_size = %u, algorithm = %s",
+		show_dev_ino(__entry),
+		__entry->idx,
+		__entry->size,
+		show_compress_algorithm(__entry->algtype))
+);
+
+DECLARE_EVENT_CLASS(f2fs_zip_end,
+
+	TP_PROTO(struct inode *inode, unsigned int cluster_idx,
+			unsigned int compressed_size, int ret),
+
+	TP_ARGS(inode, cluster_idx, compressed_size, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(ino_t,	ino)
+		__field(unsigned int, idx)
+		__field(unsigned int, size)
+		__field(unsigned int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->idx = cluster_idx;
+		__entry->size = compressed_size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%u, "
+		"compressed_size = %u, ret = %d",
+		show_dev_ino(__entry),
+		__entry->idx,
+		__entry->size,
+		__entry->ret)
+);
+
+DEFINE_EVENT(f2fs_zip_start, f2fs_compress_pages_start,
+
+	TP_PROTO(struct inode *inode, unsigned int cluster_idx,
+		unsigned int cluster_size, unsigned char algtype),
+
+	TP_ARGS(inode, cluster_idx, cluster_size, algtype)
+);
+
+DEFINE_EVENT(f2fs_zip_start, f2fs_decompress_pages_start,
+
+	TP_PROTO(struct inode *inode, unsigned int cluster_idx,
+		unsigned int cluster_size, unsigned char algtype),
+
+	TP_ARGS(inode, cluster_idx, cluster_size, algtype)
+);
+
+DEFINE_EVENT(f2fs_zip_end, f2fs_compress_pages_end,
+
+	TP_PROTO(struct inode *inode, unsigned int cluster_idx,
+			unsigned int compressed_size, int ret),
+
+	TP_ARGS(inode, cluster_idx, compressed_size, ret)
+);
+
+DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end,
+
+	TP_PROTO(struct inode *inode, unsigned int cluster_idx,
+			unsigned int compressed_size, int ret),
+
+	TP_ARGS(inode, cluster_idx, compressed_size, ret)
+);
+
 #endif /* _TRACE_F2FS_H */
 
  /* This part must be outside protection */
-- 
2.19.0.605.g01d371f741-goog



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
@ 2019-10-22 17:53   ` Ju Hyung Park
  2019-10-24  9:10     ` Chao Yu
  2019-10-23  5:24   ` Eric Biggers
  2019-10-27 22:50   ` Eric Biggers
  2 siblings, 1 reply; 32+ messages in thread
From: Ju Hyung Park @ 2019-10-22 17:53 UTC (permalink / raw)
  To: Jaegeuk Kim, Chao Yu; +Cc: linux-f2fs-devel

Hi Jaegeuk and Chao,

Nice to see this finally getting into shape :) Great work
I'm excited to see possible use-cases for this in the future.

Would f2fs compress files automatically like how btrfs' "compress" option works?
Or is it per-extension basis for now?

On Wed, Oct 23, 2019 at 2:16 AM Jaegeuk Kim <jaegeuk@kernel.org> wrote:
> +compress_algorithm=%s  Control compress algorithm, currently f2fs supports "lzo"
> +                       and "lz4" algorithm.

I see absolutely no reason to support the regular lzo variant at this time.
Everyone should use lz4 instead of lzo. If one wants zlib-level
compression, they should use zstd.

However, there's recent conversation on new lzo-rle and how it could
be a better candidate than lz4.

Since the mainline now has lz4, zstd and lzo-rle, I don't think
supporting lzo is a good idea.

> diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
> index 652fd2e2b23d..c12854c3b1a1 100644
> --- a/fs/f2fs/Kconfig
> +++ b/fs/f2fs/Kconfig
> @@ -6,6 +6,10 @@ config F2FS_FS
>         select CRYPTO
>         select CRYPTO_CRC32
>         select F2FS_FS_XATTR if FS_ENCRYPTION
> +       select LZO_COMPRESS
> +       select LZO_DECOMPRESS
> +       select LZ4_COMPRESS
> +       select LZ4_DECOMPRESS

This is a bad idea.
This unnecessarily increases the kernel binary image size when the
user does not intend to change the defaults.

For example, my Android kernel doesn't use lzo anywhere and this
wouldn't be welcome.

> diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
> new file mode 100644
> index 000000000000..f276d82a67aa
> --- /dev/null
> +++ b/fs/f2fs/compress.c
> @@ -0,0 +1,1066 @@
> +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index)
> +static unsigned int cluster_idx(struct compress_ctx *cc, pgoff_t index)
> +static unsigned int start_idx_of_cluster(struct compress_ctx *cc)

Looks like these would be better if they were explicitly marked as inline.

> +static void f2fs_init_compress_ops(struct f2fs_sb_info *sbi)
> +{
> +       sbi->cops[COMPRESS_LZO] = &f2fs_lzo_ops;
> +       sbi->cops[COMPRESS_LZ4] = &f2fs_lz4_ops;
> +}

Would it be possible for f2fs to use generic crypto compression APIs?
Hardcoding for lzo/lz4 would make it harder to explore different options in the future.

Have a look at mm/zswap.c:__zswap_pool_create_fallback().

> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index c681f51e351b..775c96291490 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -155,6 +163,7 @@ struct f2fs_mount_info {
>  #define F2FS_FEATURE_VERITY            0x0400
>  #define F2FS_FEATURE_SB_CHKSUM         0x0800
>  #define F2FS_FEATURE_CASEFOLD          0x1000
> +#define F2FS_FEATURE_COMPRESSION       0x2000

How would older versions of f2fs behave if an image was used by the
latest f2fs and has compressed pages?
I hope fail-safes are in place.

Thanks.

> --
> 2.19.0.605.g01d371f741-goog
>
>
>
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
  2019-10-22 17:53   ` Ju Hyung Park
@ 2019-10-23  5:24   ` Eric Biggers
  2019-10-23 17:28     ` Jaegeuk Kim
  2019-10-25  9:07     ` Chao Yu
  2019-10-27 22:50   ` Eric Biggers
  2 siblings, 2 replies; 32+ messages in thread
From: Eric Biggers @ 2019-10-23  5:24 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

On Tue, Oct 22, 2019 at 10:16:02AM -0700, Jaegeuk Kim wrote:
> From: Chao Yu <yuchao0@huawei.com>
> 
> This patch tries to support compression in f2fs.
> 
> - New term named cluster is defined as basic unit of compression, file can
> be divided into multiple clusters logically. One cluster includes 4 << n
> (n >= 0) logical pages, compression size is also cluster size, each of
> cluster can be compressed or not.
> 
> - In cluster metadata layout, one special flag is used to indicate cluster
> is compressed one or normal one, for compressed cluster, following metadata
> maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
> data including compress header and compressed data.
> 
> - In order to eliminate write amplification during overwrite, F2FS only
> support compression on write-once file, data can be compressed only when
> all logical blocks in file are valid and cluster compress ratio is lower
> than specified threshold.
> 
> - To enable compression on regular inode, there are three ways:
> * chattr +c file
> * chattr +c dir; touch dir/file
> * mount w/ -o compress_extension=ext; touch file.ext
> 
> Compress metadata layout:
>                              [Dnode Structure]
>              +-----------------------------------------------+
>              | cluster 1 | cluster 2 | ......... | cluster N |
>              +-----------------------------------------------+
>              .           .                       .           .
>        .                       .                .                      .
>   .         Compressed Cluster       .        .        Normal Cluster            .
> +----------+---------+---------+---------+  +---------+---------+---------+---------+
> |compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
> +----------+---------+---------+---------+  +---------+---------+---------+---------+
>            .                             .
>          .                                           .
>        .                                                           .
>       +-------------+-------------+----------+----------------------------+
>       | data length | data chksum | reserved |      compressed data       |
>       +-------------+-------------+----------+----------------------------+
> 
> Changelog:
> 
> 20190326:
> - fix error handling of read_end_io().
> - remove unneeded comments in f2fs_encrypt_one_page().
> 
> 20190327:
> - fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
> - don't jump into loop directly to avoid uninitialized variables.
> - add TODO tag in error path of f2fs_write_cache_pages().
> 
> 20190328:
> - fix wrong merge condition in f2fs_read_multi_pages().
> - check compressed file in f2fs_post_read_required().
> 
> 20190401
> - allow overwrite on non-compressed cluster.
> - check cluster meta before writing compressed data.
> 
> 20190402
> - don't preallocate blocks for compressed file.
> 
> - add lz4 compress algorithm
> - process multiple post read works in one workqueue
>   Now f2fs supports processing post read work in multiple workqueue,
>   it shows low performance due to schedule overhead of multiple
>   workqueue executing orderly.
> 
> - compress: support buffered overwrite
> C: compress cluster flag
> V: valid block address
> N: NEW_ADDR
> 
> One cluster contain 4 blocks
> 
>  before overwrite   after overwrite
> 
> - VVVV		->	CVNN
> - CVNN		->	VVVV
> 
> - CVNN		->	CVNN
> - CVNN		->	CVVV
> 
> - CVVV		->	CVNN
> - CVVV		->	CVVV
> 
> [Jaegeuk Kim]
> - add tracepoint for f2fs_{,de}compress_pages()
> - fix many bugs and add some compression stats
> 
> Signed-off-by: Chao Yu <yuchao0@huawei.com>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

How was this tested?  Shouldn't there a mount option analogous to
test_dummy_encryption that causes all files to be auto-compressed, so that a
full run of xfstests can be done with compression?  I see "compress_extension",
but apparently it's only for a file extension?  Also, since reads can involve
any combination of decryption, compression, and verity, it's important to test
as many combinations as possible, including all at once.  Has that been done?

I also tried running the fs-verity xfstests on this with
'kvm-xfstests -c f2fs -g verity', but the kernel immediately crashes:

BUG: kernel NULL pointer dereference, address: 0000000000000182
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0 
Oops: 0000 [#1] SMP
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.4.0-rc1-00119-g60f351f4c50f #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20191013_105130-anatol 04/01/2014
RIP: 0010:__queue_work+0x3e/0x5f0 kernel/workqueue.c:1409
Code: d4 53 48 83 ec 18 89 7d d4 8b 3d c1 bf 2a 01 85 ff 74 17 65 48 8b 04 25 80 5d 01 00 8b b0 0c 07 00 00 85 f6 0f 84 1
RSP: 0018:ffffc900000a8db0 EFLAGS: 00010046
RAX: ffff88807d94e340 RBX: 0000000000000246 RCX: 0000000000000000
RDX: ffff88807d9e0be8 RSI: 0000000000000000 RDI: 0000000000000001
RBP: ffffc900000a8df0 R08: 0000000000000000 R09: 0000000000000001
R10: ffff888075f2bc68 R11: 0000000000000000 R12: ffff88807d9e0be8
R13: 0000000000000000 R14: 0000000000000030 R15: ffff88807c2c6780
FS:  0000000000000000(0000) GS:ffff88807fd00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000182 CR3: 00000000757e3000 CR4: 00000000003406e0
Call Trace:
 <IRQ>
 queue_work_on+0x67/0x70 kernel/workqueue.c:1518
 queue_work include/linux/workqueue.h:494 [inline]
 f2fs_enqueue_post_read_work fs/f2fs/data.c:166 [inline]
 bio_post_read_processing fs/f2fs/data.c:173 [inline]
 f2fs_read_end_io+0xcb/0xe0 fs/f2fs/data.c:195
 bio_endio+0xa4/0x1a0 block/bio.c:1818
 req_bio_endio block/blk-core.c:242 [inline]
 blk_update_request+0xf6/0x310 block/blk-core.c:1462
 blk_mq_end_request+0x1c/0x130 block/blk-mq.c:568
 virtblk_request_done+0x32/0x80 drivers/block/virtio_blk.c:226
 blk_done_softirq+0x98/0xc0 block/blk-softirq.c:37
 __do_softirq+0xc1/0x40d kernel/softirq.c:292
 invoke_softirq kernel/softirq.c:373 [inline]
 irq_exit+0xb3/0xc0 kernel/softirq.c:413
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 do_IRQ+0x5b/0x110 arch/x86/kernel/irq.c:263
 common_interrupt+0xf/0xf arch/x86/entry/entry_64.S:607
 </IRQ>
RIP: 0010:native_safe_halt arch/x86/include/asm/irqflags.h:60 [inline]
RIP: 0010:arch_safe_halt arch/x86/include/asm/irqflags.h:103 [inline]
RIP: 0010:default_idle+0x29/0x160 arch/x86/kernel/process.c:580
Code: 90 55 48 89 e5 41 55 41 54 65 44 8b 25 70 64 76 7e 53 0f 1f 44 00 00 e8 95 13 88 ff e9 07 00 00 00 0f 00 2d 8b c0 b
RSP: 0018:ffffc90000073e78 EFLAGS: 00000202 ORIG_RAX: ffffffffffffffdc
RAX: ffff88807d94e340 RBX: 0000000000000001 RCX: 0000000000000000
RDX: 0000000000000046 RSI: 0000000000000006 RDI: ffff88807d94e340
RBP: ffffc90000073e90 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: ffff88807d94e340 R14: 0000000000000000 R15: 0000000000000000
 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571
 default_idle_call+0x1e/0x30 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x1e4/0x210 kernel/sched/idle.c:263
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355
 start_secondary+0x151/0x1a0 arch/x86/kernel/smpboot.c:264
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241
CR2: 0000000000000182
---[ end trace 86328090a3179142 ]---
RIP: 0010:__queue_work+0x3e/0x5f0 kernel/workqueue.c:1409
Code: d4 53 48 83 ec 18 89 7d d4 8b 3d c1 bf 2a 01 85 ff 74 17 65 48 8b 04 25 80 5d 01 00 8b b0 0c 07 00 00 85 f6 0f 84 1
RSP: 0018:ffffc900000a8db0 EFLAGS: 00010046
RAX: ffff88807d94e340 RBX: 0000000000000246 RCX: 0000000000000000
RDX: ffff88807d9e0be8 RSI: 0000000000000000 RDI: 0000000000000001
RBP: ffffc900000a8df0 R08: 0000000000000000 R09: 0000000000000001
R10: ffff888075f2bc68 R11: 0000000000000000 R12: ffff88807d9e0be8
R13: 0000000000000000 R14: 0000000000000030 R15: ffff88807c2c6780
FS:  0000000000000000(0000) GS:ffff88807fd00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000182 CR3: 00000000757e3000 CR4: 00000000003406e0
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: disabled
Rebooting in 5 seconds..


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-23  5:24   ` Eric Biggers
@ 2019-10-23 17:28     ` Jaegeuk Kim
  2019-10-25  9:07     ` Chao Yu
  1 sibling, 0 replies; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-23 17:28 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel

On 10/22, Eric Biggers wrote:
> On Tue, Oct 22, 2019 at 10:16:02AM -0700, Jaegeuk Kim wrote:
> > From: Chao Yu <yuchao0@huawei.com>
> > 
> > This patch tries to support compression in f2fs.
> > 
> > - New term named cluster is defined as basic unit of compression, file can
> > be divided into multiple clusters logically. One cluster includes 4 << n
> > (n >= 0) logical pages, compression size is also cluster size, each of
> > cluster can be compressed or not.
> > 
> > - In cluster metadata layout, one special flag is used to indicate cluster
> > is compressed one or normal one, for compressed cluster, following metadata
> > maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
> > data including compress header and compressed data.
> > 
> > - In order to eliminate write amplification during overwrite, F2FS only
> > support compression on write-once file, data can be compressed only when
> > all logical blocks in file are valid and cluster compress ratio is lower
> > than specified threshold.
> > 
> > - To enable compression on regular inode, there are three ways:
> > * chattr +c file
> > * chattr +c dir; touch dir/file
> > * mount w/ -o compress_extension=ext; touch file.ext
> > 
> > Compress metadata layout:
> >                              [Dnode Structure]
> >              +-----------------------------------------------+
> >              | cluster 1 | cluster 2 | ......... | cluster N |
> >              +-----------------------------------------------+
> >              .           .                       .           .
> >        .                       .                .                      .
> >   .         Compressed Cluster       .        .        Normal Cluster            .
> > +----------+---------+---------+---------+  +---------+---------+---------+---------+
> > |compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
> > +----------+---------+---------+---------+  +---------+---------+---------+---------+
> >            .                             .
> >          .                                           .
> >        .                                                           .
> >       +-------------+-------------+----------+----------------------------+
> >       | data length | data chksum | reserved |      compressed data       |
> >       +-------------+-------------+----------+----------------------------+
> > 
> > Changelog:
> > 
> > 20190326:
> > - fix error handling of read_end_io().
> > - remove unneeded comments in f2fs_encrypt_one_page().
> > 
> > 20190327:
> > - fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
> > - don't jump into loop directly to avoid uninitialized variables.
> > - add TODO tag in error path of f2fs_write_cache_pages().
> > 
> > 20190328:
> > - fix wrong merge condition in f2fs_read_multi_pages().
> > - check compressed file in f2fs_post_read_required().
> > 
> > 20190401
> > - allow overwrite on non-compressed cluster.
> > - check cluster meta before writing compressed data.
> > 
> > 20190402
> > - don't preallocate blocks for compressed file.
> > 
> > - add lz4 compress algorithm
> > - process multiple post read works in one workqueue
> >   Now f2fs supports processing post read work in multiple workqueue,
> >   it shows low performance due to schedule overhead of multiple
> >   workqueue executing orderly.
> > 
> > - compress: support buffered overwrite
> > C: compress cluster flag
> > V: valid block address
> > N: NEW_ADDR
> > 
> > One cluster contain 4 blocks
> > 
> >  before overwrite   after overwrite
> > 
> > - VVVV		->	CVNN
> > - CVNN		->	VVVV
> > 
> > - CVNN		->	CVNN
> > - CVNN		->	CVVV
> > 
> > - CVVV		->	CVNN
> > - CVVV		->	CVVV
> > 
> > [Jaegeuk Kim]
> > - add tracepoint for f2fs_{,de}compress_pages()
> > - fix many bugs and add some compression stats
> > 
> > Signed-off-by: Chao Yu <yuchao0@huawei.com>
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> 
> How was this tested?  Shouldn't there a mount option analogous to
> test_dummy_encryption that causes all files to be auto-compressed, so that a
> full run of xfstests can be done with compression?  I see "compress_extension",
> but apparently it's only for a file extension?  Also, since reads can involve
> any combination of decryption, compression, and verity, it's important to test
> as many combinations as possible, including all at once.  Has that been done?

This patch should be RFC which requires as many tests as possible. I posted it
quite early in order to get some reviews and feedback as well.

What I've done so far would look like:
- mkfs.f2fs -f -O encrypt -O quota -O compression -O extra_attr /dev/sdb1
- mount -t f2fs /dev/sdb1 /mnt/test
- mkdir /mnt/test/comp_dir
- f2fs_io setflags compression /mnt/test/comp_dir
- cd /mnt/test/comp_dir
- git clone kernel.git
- compile kernel
- or, fsstress on top of it

> 
> I also tried running the fs-verity xfstests on this with
> 'kvm-xfstests -c f2fs -g verity', but the kernel immediately crashes:

I didn't check verity yet. I'll take a look at this soon.

> 
> BUG: kernel NULL pointer dereference, address: 0000000000000182
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0 
> Oops: 0000 [#1] SMP
> CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.4.0-rc1-00119-g60f351f4c50f #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20191013_105130-anatol 04/01/2014
> RIP: 0010:__queue_work+0x3e/0x5f0 kernel/workqueue.c:1409
> Code: d4 53 48 83 ec 18 89 7d d4 8b 3d c1 bf 2a 01 85 ff 74 17 65 48 8b 04 25 80 5d 01 00 8b b0 0c 07 00 00 85 f6 0f 84 1
> RSP: 0018:ffffc900000a8db0 EFLAGS: 00010046
> RAX: ffff88807d94e340 RBX: 0000000000000246 RCX: 0000000000000000
> RDX: ffff88807d9e0be8 RSI: 0000000000000000 RDI: 0000000000000001
> RBP: ffffc900000a8df0 R08: 0000000000000000 R09: 0000000000000001
> R10: ffff888075f2bc68 R11: 0000000000000000 R12: ffff88807d9e0be8
> R13: 0000000000000000 R14: 0000000000000030 R15: ffff88807c2c6780
> FS:  0000000000000000(0000) GS:ffff88807fd00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000182 CR3: 00000000757e3000 CR4: 00000000003406e0
> Call Trace:
>  <IRQ>
>  queue_work_on+0x67/0x70 kernel/workqueue.c:1518
>  queue_work include/linux/workqueue.h:494 [inline]
>  f2fs_enqueue_post_read_work fs/f2fs/data.c:166 [inline]
>  bio_post_read_processing fs/f2fs/data.c:173 [inline]
>  f2fs_read_end_io+0xcb/0xe0 fs/f2fs/data.c:195
>  bio_endio+0xa4/0x1a0 block/bio.c:1818
>  req_bio_endio block/blk-core.c:242 [inline]
>  blk_update_request+0xf6/0x310 block/blk-core.c:1462
>  blk_mq_end_request+0x1c/0x130 block/blk-mq.c:568
>  virtblk_request_done+0x32/0x80 drivers/block/virtio_blk.c:226
>  blk_done_softirq+0x98/0xc0 block/blk-softirq.c:37
>  __do_softirq+0xc1/0x40d kernel/softirq.c:292
>  invoke_softirq kernel/softirq.c:373 [inline]
>  irq_exit+0xb3/0xc0 kernel/softirq.c:413
>  exiting_irq arch/x86/include/asm/apic.h:536 [inline]
>  do_IRQ+0x5b/0x110 arch/x86/kernel/irq.c:263
>  common_interrupt+0xf/0xf arch/x86/entry/entry_64.S:607
>  </IRQ>
> RIP: 0010:native_safe_halt arch/x86/include/asm/irqflags.h:60 [inline]
> RIP: 0010:arch_safe_halt arch/x86/include/asm/irqflags.h:103 [inline]
> RIP: 0010:default_idle+0x29/0x160 arch/x86/kernel/process.c:580
> Code: 90 55 48 89 e5 41 55 41 54 65 44 8b 25 70 64 76 7e 53 0f 1f 44 00 00 e8 95 13 88 ff e9 07 00 00 00 0f 00 2d 8b c0 b
> RSP: 0018:ffffc90000073e78 EFLAGS: 00000202 ORIG_RAX: ffffffffffffffdc
> RAX: ffff88807d94e340 RBX: 0000000000000001 RCX: 0000000000000000
> RDX: 0000000000000046 RSI: 0000000000000006 RDI: ffff88807d94e340
> RBP: ffffc90000073e90 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
> R13: ffff88807d94e340 R14: 0000000000000000 R15: 0000000000000000
>  arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571
>  default_idle_call+0x1e/0x30 kernel/sched/idle.c:94
>  cpuidle_idle_call kernel/sched/idle.c:154 [inline]
>  do_idle+0x1e4/0x210 kernel/sched/idle.c:263
>  cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355
>  start_secondary+0x151/0x1a0 arch/x86/kernel/smpboot.c:264
>  secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241
> CR2: 0000000000000182
> ---[ end trace 86328090a3179142 ]---
> RIP: 0010:__queue_work+0x3e/0x5f0 kernel/workqueue.c:1409
> Code: d4 53 48 83 ec 18 89 7d d4 8b 3d c1 bf 2a 01 85 ff 74 17 65 48 8b 04 25 80 5d 01 00 8b b0 0c 07 00 00 85 f6 0f 84 1
> RSP: 0018:ffffc900000a8db0 EFLAGS: 00010046
> RAX: ffff88807d94e340 RBX: 0000000000000246 RCX: 0000000000000000
> RDX: ffff88807d9e0be8 RSI: 0000000000000000 RDI: 0000000000000001
> RBP: ffffc900000a8df0 R08: 0000000000000000 R09: 0000000000000001
> R10: ffff888075f2bc68 R11: 0000000000000000 R12: ffff88807d9e0be8
> R13: 0000000000000000 R14: 0000000000000030 R15: ffff88807c2c6780
> FS:  0000000000000000(0000) GS:ffff88807fd00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000182 CR3: 00000000757e3000 CR4: 00000000003406e0
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: disabled
> Rebooting in 5 seconds..


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-22 17:16 [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Jaegeuk Kim
  2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
@ 2019-10-24  8:21 ` Chao Yu
  2019-10-25 18:18   ` Jaegeuk Kim
  2019-11-07 19:14 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
  2 siblings, 1 reply; 32+ messages in thread
From: Chao Yu @ 2019-10-24  8:21 UTC (permalink / raw)
  To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

On 2019/10/23 1:16, Jaegeuk Kim wrote:
> This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
> by allocating fully valid 2MB segment.
> 
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
>  fs/f2fs/f2fs.h     |  4 +++-
>  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
>  fs/f2fs/recovery.c |  2 +-
>  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
>  fs/f2fs/segment.h  |  2 ++
>  fs/f2fs/super.c    |  1 +
>  fs/f2fs/sysfs.c    |  2 ++
>  7 files changed, 63 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index ca342f4c7db1..c681f51e351b 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -890,6 +890,7 @@ enum {
>  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
>  	CURSEG_COLD_NODE,	/* indirect node blocks */
>  	NO_CHECK_TYPE,
> +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
>  };
>  
>  struct flush_cmd {
> @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
>  
>  	/* threshold for gc trials on pinned files */
>  	u64 gc_pin_file_threshold;
> +	struct rw_semaphore pin_sem;
>  
>  	/* maximum # of trials to find a victim segment for SSR and GC */
>  	unsigned int max_victim_search;
> @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
>  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
>  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>  					unsigned int start, unsigned int end);
> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
>  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
>  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
>  					struct cp_control *cpc);
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 29bc0a542759..f6c038e8a6a7 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
>  	if (off_end)
>  		map.m_len++;
>  
> -	if (f2fs_is_pinned_file(inode))
> -		map.m_seg_type = CURSEG_COLD_DATA;
> +	if (!map.m_len)
> +		return 0;
> +
> +	if (f2fs_is_pinned_file(inode)) {
> +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
> +					sbi->log_blocks_per_seg;
> +		block_t done = 0;
> +
> +		if (map.m_len % sbi->blocks_per_seg)
> +			len += sbi->blocks_per_seg;
>  
> -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
> -						F2FS_GET_BLOCK_PRE_DIO :
> -						F2FS_GET_BLOCK_PRE_AIO));
> +		map.m_len = sbi->blocks_per_seg;
> +next_alloc:
> +		mutex_lock(&sbi->gc_mutex);
> +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
> +		if (err && err != -ENODATA && err != -EAGAIN)
> +			goto out_err;

To grab enough free space?

Shouldn't we call

	if (has_not_enough_free_secs(sbi, 0, 0)) {
		mutex_lock(&sbi->gc_mutex);
		f2fs_gc(sbi, false, false, NULL_SEGNO);
	}

> +
> +		down_write(&sbi->pin_sem);
> +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
> +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
> +		up_write(&sbi->pin_sem);
> +
> +		done += map.m_len;
> +		len -= map.m_len;
> +		map.m_lblk += map.m_len;
> +		if (!err && len)
> +			goto next_alloc;
> +
> +		map.m_len = done;
> +	} else {
> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
> +	}
> +out_err:
>  	if (err) {
>  		pgoff_t last_off;
>  
> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> index 783773e4560d..76477f71d4ee 100644
> --- a/fs/f2fs/recovery.c
> +++ b/fs/f2fs/recovery.c
> @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
>  		f2fs_put_page(page, 1);
>  	}
>  	if (!err)
> -		f2fs_allocate_new_segments(sbi);
> +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
>  	return err;
>  }
>  
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 25c750cd0272..253d72c2663c 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>  	up_read(&SM_I(sbi)->curseg_lock);
>  }
>  
> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
>  {
>  	struct curseg_info *curseg;
>  	unsigned int old_segno;
> @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>  	down_write(&SIT_I(sbi)->sentry_lock);
>  
>  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
> +		if (type != NO_CHECK_TYPE && i != type)
> +			continue;
> +
>  		curseg = CURSEG_I(sbi, i);
>  		old_segno = curseg->segno;
>  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
> @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>  {
>  	struct sit_info *sit_i = SIT_I(sbi);
>  	struct curseg_info *curseg = CURSEG_I(sbi, type);
> +	bool put_pin_sem = false;
> +
> +	if (type == CURSEG_COLD_DATA) {
> +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
> +		if (down_read_trylock(&sbi->pin_sem)) {
> +			put_pin_sem = true;
> +		} else {
> +			type = CURSEG_WARM_DATA;
> +			curseg = CURSEG_I(sbi, type);

It will mix pending cold data into warm area... rather than recovering curseg to
write pointer of last cold segment?

I know maybe that fallocate aligned address could be corner case, but I guess
there should be some better solutions can handle race case more effectively.

One solution could be: allocating a virtual log header to select free segment as
2m-aligned space target.

Thanks,

> +		}
> +	} else if (type == CURSEG_COLD_DATA_PINNED) {
> +		type = CURSEG_COLD_DATA;
> +	}
>  
>  	down_read(&SM_I(sbi)->curseg_lock);
>  
> @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>  	mutex_unlock(&curseg->curseg_mutex);
>  
>  	up_read(&SM_I(sbi)->curseg_lock);
> +
> +	if (put_pin_sem)
> +		up_read(&sbi->pin_sem);
>  }
>  
>  static void update_device_state(struct f2fs_io_info *fio)
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 325781a1ae4d..a95467b202ea 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -313,6 +313,8 @@ struct sit_entry_set {
>   */
>  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
>  {
> +	if (type == CURSEG_COLD_DATA_PINNED)
> +		type = CURSEG_COLD_DATA;
>  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
>  }
>  
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index f320fd11db48..c02a47ce551b 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
>  	spin_lock_init(&sbi->dev_lock);
>  
>  	init_rwsem(&sbi->sb_lock);
> +	init_rwsem(&sbi->pin_sem);
>  }
>  
>  static int init_percpu_info(struct f2fs_sb_info *sbi)
> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
> index b558b64a4c9c..f164959e4224 100644
> --- a/fs/f2fs/sysfs.c
> +++ b/fs/f2fs/sysfs.c
> @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
>  	if (f2fs_sb_has_casefold(sbi))
>  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>  				len ? ", " : "", "casefold");
> +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> +				len ? ", " : "", "pin_file");
>  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
>  	return len;
>  }
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-22 17:53   ` Ju Hyung Park
@ 2019-10-24  9:10     ` Chao Yu
  0 siblings, 0 replies; 32+ messages in thread
From: Chao Yu @ 2019-10-24  9:10 UTC (permalink / raw)
  To: Ju Hyung Park, Jaegeuk Kim; +Cc: linux-f2fs-devel

Hi Ju Hyung,

On 2019/10/23 1:53, Ju Hyung Park wrote:
> Hi Jaegeuk and Chao,
> 
> Nice to see this finally getting into shape :) Great work
> I'm excited to see possible use-cases for this in the future.
> 
> Would f2fs compress files automatically like how btrfs' "compress" option works?
> Or is it per-extension basis for now?

We support three ways to activate file compression:

Quoted:

- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext

> 
> On Wed, Oct 23, 2019 at 2:16 AM Jaegeuk Kim <jaegeuk@kernel.org> wrote:
>> +compress_algorithm=%s  Control compress algorithm, currently f2fs supports "lzo"
>> +                       and "lz4" algorithm.
> 
> I see absolutely no reason to support regular lzo variant at this time.
> Everyone should use lz4 instead of lzo. If one wants zlib-level
> compression, they should use zstd.
> 
> However, there's recent conversation on new lzo-rle and how it could
> be a better candidate than lz4.
> 
> Since the mainline now have lz4, zstd and lzo-rle, I don't think
> supporting lzo is a good idea.

This is just an RFC version, so we can change it anytime; let's decide whether
to delete it before the final version.

> 
>> diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
>> index 652fd2e2b23d..c12854c3b1a1 100644
>> --- a/fs/f2fs/Kconfig
>> +++ b/fs/f2fs/Kconfig
>> @@ -6,6 +6,10 @@ config F2FS_FS
>>         select CRYPTO
>>         select CRYPTO_CRC32
>>         select F2FS_FS_XATTR if FS_ENCRYPTION
>> +       select LZO_COMPRESS
>> +       select LZO_DECOMPRESS
>> +       select LZ4_COMPRESS
>> +       select LZ4_DECOMPRESS
> 
> This is a bad idea.
> This unnecessarily increases kernel binary image when no the user
> intends to change the defaults.
> 
> For example, my Android kernel doesn't use lzo anywhere and this
> wouldn't be welcome.

Agreed, maybe we need a kconfig entry for compress.c as well.

> 
>> diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
>> new file mode 100644
>> index 000000000000..f276d82a67aa
>> --- /dev/null
>> +++ b/fs/f2fs/compress.c
>> @@ -0,0 +1,1066 @@
>> +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index)
>> +static unsigned int cluster_idx(struct compress_ctx *cc, pgoff_t index)
>> +static unsigned int start_idx_of_cluster(struct compress_ctx *cc)
> 
> Looks like these would be better if they were explicitly marked as inline.
> 
>> +static void f2fs_init_compress_ops(struct f2fs_sb_info *sbi)
>> +{
>> +       sbi->cops[COMPRESS_LZO] = &f2fs_lzo_ops;
>> +       sbi->cops[COMPRESS_LZ4] = &f2fs_lz4_ops;
>> +}
> 
> Would it be possible for f2fs to use generic crypto compression APIs?
> Hardcoding for lzo/lz4 would make it harder to venture future different options.
> 
> Have a look at mm/zswap.c:__zswap_pool_create_fallback().

Not sure, I think I could look into it later, now Jaegeuk and I have to
stabilize codes first.

Thanks for your advice anyway.

> 
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index c681f51e351b..775c96291490 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -155,6 +163,7 @@ struct f2fs_mount_info {
>>  #define F2FS_FEATURE_VERITY            0x0400
>>  #define F2FS_FEATURE_SB_CHKSUM         0x0800
>>  #define F2FS_FEATURE_CASEFOLD          0x1000
>> +#define F2FS_FEATURE_COMPRESSION       0x2000
> 
> How would older versions of f2fs behave if an image was used by the
> latest f2fs and have compressed pages?
> I hope fail-safes are in place.

That patch hasn't been merged yet, since there are implementation details which
are still under discussion.

Thanks,

> 
> Thanks.
> 
>> --
>> 2.19.0.605.g01d371f741-goog
>>
>>
>>
>> _______________________________________________
>> Linux-f2fs-devel mailing list
>> Linux-f2fs-devel@lists.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-23  5:24   ` Eric Biggers
  2019-10-23 17:28     ` Jaegeuk Kim
@ 2019-10-25  9:07     ` Chao Yu
  1 sibling, 0 replies; 32+ messages in thread
From: Chao Yu @ 2019-10-25  9:07 UTC (permalink / raw)
  To: Eric Biggers; +Cc: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2019/10/23 13:24, Eric Biggers wrote:
> How was this tested?  Shouldn't there a mount option analogous to

This should be a pre-RFC version... I have only done a simple test on it; I will
do more testing later in combination with the other features.

Thanks,


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-24  8:21 ` [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Chao Yu
@ 2019-10-25 18:18   ` Jaegeuk Kim
  2019-10-26  1:31     ` Chao Yu
  0 siblings, 1 reply; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-25 18:18 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On 10/24, Chao Yu wrote:
> Hi Jaegeuk,
> 
> On 2019/10/23 1:16, Jaegeuk Kim wrote:
> > This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
> > by allocating fully valid 2MB segment.
> > 
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > ---
> >  fs/f2fs/f2fs.h     |  4 +++-
> >  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
> >  fs/f2fs/recovery.c |  2 +-
> >  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
> >  fs/f2fs/segment.h  |  2 ++
> >  fs/f2fs/super.c    |  1 +
> >  fs/f2fs/sysfs.c    |  2 ++
> >  7 files changed, 63 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index ca342f4c7db1..c681f51e351b 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -890,6 +890,7 @@ enum {
> >  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
> >  	CURSEG_COLD_NODE,	/* indirect node blocks */
> >  	NO_CHECK_TYPE,
> > +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
> >  };
> >  
> >  struct flush_cmd {
> > @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
> >  
> >  	/* threshold for gc trials on pinned files */
> >  	u64 gc_pin_file_threshold;
> > +	struct rw_semaphore pin_sem;
> >  
> >  	/* maximum # of trials to find a victim segment for SSR and GC */
> >  	unsigned int max_victim_search;
> > @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
> >  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
> >  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
> >  					unsigned int start, unsigned int end);
> > -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
> > +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
> >  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
> >  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
> >  					struct cp_control *cpc);
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index 29bc0a542759..f6c038e8a6a7 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
> >  	if (off_end)
> >  		map.m_len++;
> >  
> > -	if (f2fs_is_pinned_file(inode))
> > -		map.m_seg_type = CURSEG_COLD_DATA;
> > +	if (!map.m_len)
> > +		return 0;
> > +
> > +	if (f2fs_is_pinned_file(inode)) {
> > +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
> > +					sbi->log_blocks_per_seg;
> > +		block_t done = 0;
> > +
> > +		if (map.m_len % sbi->blocks_per_seg)
> > +			len += sbi->blocks_per_seg;
> >  
> > -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
> > -						F2FS_GET_BLOCK_PRE_DIO :
> > -						F2FS_GET_BLOCK_PRE_AIO));
> > +		map.m_len = sbi->blocks_per_seg;
> > +next_alloc:
> > +		mutex_lock(&sbi->gc_mutex);
> > +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
> > +		if (err && err != -ENODATA && err != -EAGAIN)
> > +			goto out_err;
> 
> To grab enough free space?
> 
> Shouldn't we call
> 
> 	if (has_not_enough_free_secs(sbi, 0, 0)) {
> 		mutex_lock(&sbi->gc_mutex);
> 		f2fs_gc(sbi, false, false, NULL_SEGNO);
> 	}

The above calls gc all the time. Do we need this?

> 
> > +
> > +		down_write(&sbi->pin_sem);
> > +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
> > +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
> > +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
> > +		up_write(&sbi->pin_sem);
> > +
> > +		done += map.m_len;
> > +		len -= map.m_len;
> > +		map.m_lblk += map.m_len;
> > +		if (!err && len)
> > +			goto next_alloc;
> > +
> > +		map.m_len = done;
> > +	} else {
> > +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
> > +	}
> > +out_err:
> >  	if (err) {
> >  		pgoff_t last_off;
> >  
> > diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> > index 783773e4560d..76477f71d4ee 100644
> > --- a/fs/f2fs/recovery.c
> > +++ b/fs/f2fs/recovery.c
> > @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
> >  		f2fs_put_page(page, 1);
> >  	}
> >  	if (!err)
> > -		f2fs_allocate_new_segments(sbi);
> > +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
> >  	return err;
> >  }
> >  
> > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> > index 25c750cd0272..253d72c2663c 100644
> > --- a/fs/f2fs/segment.c
> > +++ b/fs/f2fs/segment.c
> > @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
> >  	up_read(&SM_I(sbi)->curseg_lock);
> >  }
> >  
> > -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> > +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
> >  {
> >  	struct curseg_info *curseg;
> >  	unsigned int old_segno;
> > @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> >  	down_write(&SIT_I(sbi)->sentry_lock);
> >  
> >  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
> > +		if (type != NO_CHECK_TYPE && i != type)
> > +			continue;
> > +
> >  		curseg = CURSEG_I(sbi, i);
> >  		old_segno = curseg->segno;
> >  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
> > @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
> >  {
> >  	struct sit_info *sit_i = SIT_I(sbi);
> >  	struct curseg_info *curseg = CURSEG_I(sbi, type);
> > +	bool put_pin_sem = false;
> > +
> > +	if (type == CURSEG_COLD_DATA) {
> > +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
> > +		if (down_read_trylock(&sbi->pin_sem)) {
> > +			put_pin_sem = true;
> > +		} else {
> > +			type = CURSEG_WARM_DATA;
> > +			curseg = CURSEG_I(sbi, type);
> 
> It will mix pending cold data into warm area... rather than recovering curseg to
> write pointer of last cold segment?
> 
> I know maybe that fallocate aligned address could be corner case, but I guess
> there should be some better solutions can handle race case more effectively.
> 
> One solution could be: allocating a virtual log header to select free segment as
> 2m-aligned space target.

I thought about that, but concluded to avoid too much changes.

> 
> Thanks,
> 
> > +		}
> > +	} else if (type == CURSEG_COLD_DATA_PINNED) {
> > +		type = CURSEG_COLD_DATA;
> > +	}
> >  
> >  	down_read(&SM_I(sbi)->curseg_lock);
> >  
> > @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
> >  	mutex_unlock(&curseg->curseg_mutex);
> >  
> >  	up_read(&SM_I(sbi)->curseg_lock);
> > +
> > +	if (put_pin_sem)
> > +		up_read(&sbi->pin_sem);
> >  }
> >  
> >  static void update_device_state(struct f2fs_io_info *fio)
> > diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> > index 325781a1ae4d..a95467b202ea 100644
> > --- a/fs/f2fs/segment.h
> > +++ b/fs/f2fs/segment.h
> > @@ -313,6 +313,8 @@ struct sit_entry_set {
> >   */
> >  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
> >  {
> > +	if (type == CURSEG_COLD_DATA_PINNED)
> > +		type = CURSEG_COLD_DATA;
> >  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
> >  }
> >  
> > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> > index f320fd11db48..c02a47ce551b 100644
> > --- a/fs/f2fs/super.c
> > +++ b/fs/f2fs/super.c
> > @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
> >  	spin_lock_init(&sbi->dev_lock);
> >  
> >  	init_rwsem(&sbi->sb_lock);
> > +	init_rwsem(&sbi->pin_sem);
> >  }
> >  
> >  static int init_percpu_info(struct f2fs_sb_info *sbi)
> > diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
> > index b558b64a4c9c..f164959e4224 100644
> > --- a/fs/f2fs/sysfs.c
> > +++ b/fs/f2fs/sysfs.c
> > @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
> >  	if (f2fs_sb_has_casefold(sbi))
> >  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> >  				len ? ", " : "", "casefold");
> > +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> > +				len ? ", " : "", "pin_file");
> >  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
> >  	return len;
> >  }
> > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-25 18:18   ` Jaegeuk Kim
@ 2019-10-26  1:31     ` Chao Yu
  2019-10-30 16:09       ` Jaegeuk Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Chao Yu @ 2019-10-26  1:31 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

On 2019/10/26 2:18, Jaegeuk Kim wrote:
> On 10/24, Chao Yu wrote:
>> Hi Jaegeuk,
>>
>> On 2019/10/23 1:16, Jaegeuk Kim wrote:
>>> This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
>>> by allocating fully valid 2MB segment.
>>>
>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>>> ---
>>>  fs/f2fs/f2fs.h     |  4 +++-
>>>  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
>>>  fs/f2fs/recovery.c |  2 +-
>>>  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
>>>  fs/f2fs/segment.h  |  2 ++
>>>  fs/f2fs/super.c    |  1 +
>>>  fs/f2fs/sysfs.c    |  2 ++
>>>  7 files changed, 63 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>> index ca342f4c7db1..c681f51e351b 100644
>>> --- a/fs/f2fs/f2fs.h
>>> +++ b/fs/f2fs/f2fs.h
>>> @@ -890,6 +890,7 @@ enum {
>>>  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
>>>  	CURSEG_COLD_NODE,	/* indirect node blocks */
>>>  	NO_CHECK_TYPE,
>>> +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
>>>  };
>>>  
>>>  struct flush_cmd {
>>> @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
>>>  
>>>  	/* threshold for gc trials on pinned files */
>>>  	u64 gc_pin_file_threshold;
>>> +	struct rw_semaphore pin_sem;
>>>  
>>>  	/* maximum # of trials to find a victim segment for SSR and GC */
>>>  	unsigned int max_victim_search;
>>> @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
>>>  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
>>>  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>>>  					unsigned int start, unsigned int end);
>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
>>>  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
>>>  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
>>>  					struct cp_control *cpc);
>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>> index 29bc0a542759..f6c038e8a6a7 100644
>>> --- a/fs/f2fs/file.c
>>> +++ b/fs/f2fs/file.c
>>> @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
>>>  	if (off_end)
>>>  		map.m_len++;
>>>  
>>> -	if (f2fs_is_pinned_file(inode))
>>> -		map.m_seg_type = CURSEG_COLD_DATA;
>>> +	if (!map.m_len)
>>> +		return 0;
>>> +
>>> +	if (f2fs_is_pinned_file(inode)) {
>>> +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
>>> +					sbi->log_blocks_per_seg;
>>> +		block_t done = 0;
>>> +
>>> +		if (map.m_len % sbi->blocks_per_seg)
>>> +			len += sbi->blocks_per_seg;
>>>  
>>> -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
>>> -						F2FS_GET_BLOCK_PRE_DIO :
>>> -						F2FS_GET_BLOCK_PRE_AIO));
>>> +		map.m_len = sbi->blocks_per_seg;
>>> +next_alloc:
>>> +		mutex_lock(&sbi->gc_mutex);
>>> +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
>>> +		if (err && err != -ENODATA && err != -EAGAIN)
>>> +			goto out_err;
>>
>> To grab enough free space?
>>
>> Shouldn't we call
>>
>> 	if (has_not_enough_free_secs(sbi, 0, 0)) {
>> 		mutex_lock(&sbi->gc_mutex);
>> 		f2fs_gc(sbi, false, false, NULL_SEGNO);
>> 	}
> 
> The above calls gc all the time. Do we need this?

Hmmm... my concern is why we need to run foreground GC even if there is enough
free space..

> 
>>
>>> +
>>> +		down_write(&sbi->pin_sem);
>>> +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
>>> +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
>>> +		up_write(&sbi->pin_sem);
>>> +
>>> +		done += map.m_len;
>>> +		len -= map.m_len;
>>> +		map.m_lblk += map.m_len;
>>> +		if (!err && len)
>>> +			goto next_alloc;
>>> +
>>> +		map.m_len = done;
>>> +	} else {
>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
>>> +	}
>>> +out_err:
>>>  	if (err) {
>>>  		pgoff_t last_off;
>>>  
>>> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
>>> index 783773e4560d..76477f71d4ee 100644
>>> --- a/fs/f2fs/recovery.c
>>> +++ b/fs/f2fs/recovery.c
>>> @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
>>>  		f2fs_put_page(page, 1);
>>>  	}
>>>  	if (!err)
>>> -		f2fs_allocate_new_segments(sbi);
>>> +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
>>>  	return err;
>>>  }
>>>  
>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>> index 25c750cd0272..253d72c2663c 100644
>>> --- a/fs/f2fs/segment.c
>>> +++ b/fs/f2fs/segment.c
>>> @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>>>  	up_read(&SM_I(sbi)->curseg_lock);
>>>  }
>>>  
>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
>>>  {
>>>  	struct curseg_info *curseg;
>>>  	unsigned int old_segno;
>>> @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>>>  	down_write(&SIT_I(sbi)->sentry_lock);
>>>  
>>>  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
>>> +		if (type != NO_CHECK_TYPE && i != type)
>>> +			continue;
>>> +
>>>  		curseg = CURSEG_I(sbi, i);
>>>  		old_segno = curseg->segno;
>>>  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
>>> @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>>>  {
>>>  	struct sit_info *sit_i = SIT_I(sbi);
>>>  	struct curseg_info *curseg = CURSEG_I(sbi, type);
>>> +	bool put_pin_sem = false;
>>> +
>>> +	if (type == CURSEG_COLD_DATA) {
>>> +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
>>> +		if (down_read_trylock(&sbi->pin_sem)) {
>>> +			put_pin_sem = true;
>>> +		} else {
>>> +			type = CURSEG_WARM_DATA;
>>> +			curseg = CURSEG_I(sbi, type);
>>
>> It will mix pending cold data into warm area... rather than recovering curseg to
>> write pointer of last cold segment?
>>
>> I know maybe that fallocate aligned address could be corner case, but I guess
>> there should be some better solutions can handle race case more effectively.
>>
>> One solution could be: allocating a virtual log header to select free segment as
>> 2m-aligned space target.
> 
> I thought about that, but concluded to avoid too much changes.

We have an unupstreamed feature which is based on a virtual log header. I can
introduce that basic virtual log framework, which can be used for aligned
allocation and for later new features — would you like to check that?

Thanks,

> 
>>
>> Thanks,
>>
>>> +		}
>>> +	} else if (type == CURSEG_COLD_DATA_PINNED) {
>>> +		type = CURSEG_COLD_DATA;
>>> +	}
>>>  
>>>  	down_read(&SM_I(sbi)->curseg_lock);
>>>  
>>> @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>>>  	mutex_unlock(&curseg->curseg_mutex);
>>>  
>>>  	up_read(&SM_I(sbi)->curseg_lock);
>>> +
>>> +	if (put_pin_sem)
>>> +		up_read(&sbi->pin_sem);
>>>  }
>>>  
>>>  static void update_device_state(struct f2fs_io_info *fio)
>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>> index 325781a1ae4d..a95467b202ea 100644
>>> --- a/fs/f2fs/segment.h
>>> +++ b/fs/f2fs/segment.h
>>> @@ -313,6 +313,8 @@ struct sit_entry_set {
>>>   */
>>>  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
>>>  {
>>> +	if (type == CURSEG_COLD_DATA_PINNED)
>>> +		type = CURSEG_COLD_DATA;
>>>  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
>>>  }
>>>  
>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>> index f320fd11db48..c02a47ce551b 100644
>>> --- a/fs/f2fs/super.c
>>> +++ b/fs/f2fs/super.c
>>> @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
>>>  	spin_lock_init(&sbi->dev_lock);
>>>  
>>>  	init_rwsem(&sbi->sb_lock);
>>> +	init_rwsem(&sbi->pin_sem);
>>>  }
>>>  
>>>  static int init_percpu_info(struct f2fs_sb_info *sbi)
>>> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
>>> index b558b64a4c9c..f164959e4224 100644
>>> --- a/fs/f2fs/sysfs.c
>>> +++ b/fs/f2fs/sysfs.c
>>> @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
>>>  	if (f2fs_sb_has_casefold(sbi))
>>>  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>>>  				len ? ", " : "", "casefold");
>>> +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>>> +				len ? ", " : "", "pin_file");
>>>  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
>>>  	return len;
>>>  }
>>>
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
  2019-10-22 17:53   ` Ju Hyung Park
  2019-10-23  5:24   ` Eric Biggers
@ 2019-10-27 22:50   ` Eric Biggers
  2019-10-28  2:33     ` Chao Yu
  2019-10-29  8:33     ` Chao Yu
  2 siblings, 2 replies; 32+ messages in thread
From: Eric Biggers @ 2019-10-27 22:50 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

On Tue, Oct 22, 2019 at 10:16:02AM -0700, Jaegeuk Kim wrote:
> diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
> index 29020af0cff9..d1accf665c86 100644
> --- a/Documentation/filesystems/f2fs.txt
> +++ b/Documentation/filesystems/f2fs.txt
> @@ -235,6 +235,13 @@ checkpoint=%s[:%u[%]]     Set to "disable" to turn off checkpointing. Set to "en
>                         hide up to all remaining free space. The actual space that
>                         would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
>                         This space is reclaimed once checkpoint=enable.
> +compress_algorithm=%s  Control compress algorithm, currently f2fs supports "lzo"
> +                       and "lz4" algorithm.
> +compress_log_size=%u   Support configuring compress cluster size, the size will
> +                       be 4kb * (1 << %u), 16kb is minimum size, also it's
> +                       default size.

kb means kilobits, not kilobytes.

> +compress_extension=%s  Support adding specified extension, so that f2fs can
> +                       enable compression on those corresponding file.

What does "Support adding specified extension" mean?  And does "so that f2fs can
enable compression on those corresponding file" mean that f2fs can't enable
compression on other files?  Please be clear about what this option does.

>  
>  ================================================================================
>  DEBUGFS ENTRIES
> @@ -837,3 +844,44 @@ zero or random data, which is useful to the below scenario where:
>   4. address = fibmap(fd, offset)
>   5. open(blkdev)
>   6. write(blkdev, address)
> +
> +Compression implementation
> +--------------------------
> +
> +- New term named cluster is defined as basic unit of compression, file can
> +be divided into multiple clusters logically. One cluster includes 4 << n
> +(n >= 0) logical pages, compression size is also cluster size, each of
> +cluster can be compressed or not.
> +
> +- In cluster metadata layout, one special flag is used to indicate cluster
> +is compressed one or normal one, for compressed cluster, following metadata
> +maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
> +data including compress header and compressed data.

In the code it's actually a special block address, not a "flag".

> +
> +- In order to eliminate write amplification during overwrite, F2FS only
> +support compression on write-once file, data can be compressed only when
> +all logical blocks in file are valid and cluster compress ratio is lower
> +than specified threshold.
> +
> +- To enable compression on regular inode, there are three ways:
> +* chattr +c file
> +* chattr +c dir; touch dir/file
> +* mount w/ -o compress_extension=ext; touch file.ext
> +
> +Compress metadata layout:
> +                             [Dnode Structure]
> +             +-----------------------------------------------+
> +             | cluster 1 | cluster 2 | ......... | cluster N |
> +             +-----------------------------------------------+
> +             .           .                       .           .
> +       .                       .                .                      .
> +  .         Compressed Cluster       .        .        Normal Cluster            .
> ++----------+---------+---------+---------+  +---------+---------+---------+---------+
> +|compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
> ++----------+---------+---------+---------+  +---------+---------+---------+---------+
> +           .                             .
> +         .                                           .
> +       .                                                           .
> +      +-------------+-------------+----------+----------------------------+
> +      | data length | data chksum | reserved |      compressed data       |
> +      +-------------+-------------+----------+----------------------------+
> diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
> index 652fd2e2b23d..c12854c3b1a1 100644
> --- a/fs/f2fs/Kconfig
> +++ b/fs/f2fs/Kconfig
> @@ -6,6 +6,10 @@ config F2FS_FS
>  	select CRYPTO
>  	select CRYPTO_CRC32
>  	select F2FS_FS_XATTR if FS_ENCRYPTION
> +	select LZO_COMPRESS
> +	select LZO_DECOMPRESS
> +	select LZ4_COMPRESS
> +	select LZ4_DECOMPRESS

As someone else suggested, there's not much need to support LZO, since LZ4 is
usually better.  Also, compression support should be behind a kconfig option, so
it doesn't cause bloat or extra attack surface for people who don't want it.

> diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
> new file mode 100644
> index 000000000000..f276d82a67aa
> --- /dev/null
> +++ b/fs/f2fs/compress.c
> @@ -0,0 +1,1066 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * f2fs compress support
> + *
> + * Copyright (c) 2019 Chao Yu <chao@kernel.org>
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/f2fs_fs.h>
> +#include <linux/writeback.h>
> +#include <linux/lzo.h>
> +#include <linux/lz4.h>
> +
> +#include "f2fs.h"
> +#include "node.h"
> +#include <trace/events/f2fs.h>
> +
> +struct f2fs_compress_ops {
> +	int (*init_compress_ctx)(struct compress_ctx *cc);
> +	void (*destroy_compress_ctx)(struct compress_ctx *cc);
> +	int (*compress_pages)(struct compress_ctx *cc);
> +	int (*decompress_pages)(struct decompress_io_ctx *dic);
> +};
> +
> +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index)
> +{
> +	return index % cc->cluster_size;
> +}
> +
> +static unsigned int cluster_idx(struct compress_ctx *cc, pgoff_t index)
> +{
> +	return index / cc->cluster_size;
> +}

% and / are slow on values that aren't power-of-2 constants.  Since cluster_size
is always a power of 2, how about also keeping cluster_size_bits and doing:

	index & (cc->cluster_size - 1)

and
	index >> cc->cluster_size_bits

> +
> +static unsigned int start_idx_of_cluster(struct compress_ctx *cc)
> +{
> +	return cc->cluster_idx * cc->cluster_size;
> +}

and here:

	cc->cluster_idx << cc->cluster_size_bits

> +bool f2fs_is_compressed_page(struct page *page)
> +{
> +	if (!page_private(page))
> +		return false;
> +	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
> +		return false;
> +	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
> +}

This code implies that there can be multiple page private structures each of
which has a different magic number.  But I only see F2FS_COMPRESSED_PAGE_MAGIC.
Where in the code is the other one(s)?

> +
> +static void f2fs_set_compressed_page(struct page *page,
> +		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
> +{
> +	SetPagePrivate(page);
> +	set_page_private(page, (unsigned long)data);
> +
> +	/* i_crypto_info and iv index */
> +	page->index = index;
> +	page->mapping = inode->i_mapping;
> +	if (r)
> +		refcount_inc(r);
> +}

It isn't really appropriate to create fake pagecache pages like this.  Did you
consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?

> +
> +static void f2fs_put_compressed_page(struct page *page)
> +{
> +	set_page_private(page, (unsigned long)NULL);
> +	ClearPagePrivate(page);
> +	page->mapping = NULL;
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +struct page *f2fs_compress_control_page(struct page *page)
> +{
> +	return ((struct compress_io_ctx *)page_private(page))->rpages[0];
> +}
> +
> +int f2fs_init_compress_ctx(struct compress_ctx *cc)
> +{
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
> +
> +	if (cc->rpages)
> +		return 0;
> +	cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) * cc->cluster_size,
> +								GFP_KERNEL);
> +	if (!cc->rpages)
> +		return -ENOMEM;
> +	return 0;
> +}

Is it really okay to be using GFP_KERNEL from ->writepages()?

> +
> +void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
> +{
> +	kvfree(cc->rpages);
> +}

The memory is allocated with kzalloc(), so why is it freed with kvfree() and not
just kfree()?

> +
> +int f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
> +{
> +	unsigned int cluster_ofs;
> +
> +	if (!f2fs_cluster_can_merge_page(cc, page->index))
> +		return -EAGAIN;

All callers do f2fs_bug_on() if this error is hit, so why not do the
f2fs_bug_on() here instead?

> +
> +	cluster_ofs = offset_in_cluster(cc, page->index);
> +	cc->rpages[cluster_ofs] = page;
> +	cc->nr_rpages++;
> +	cc->cluster_idx = cluster_idx(cc, page->index);
> +	return 0;
> +}
> +
> +static int lzo_init_compress_ctx(struct compress_ctx *cc)
> +{
> +	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
> +				LZO1X_MEM_COMPRESS, GFP_KERNEL);
> +	if (!cc->private)
> +		return -ENOMEM;
> +
> +	cc->clen = lzo1x_worst_compress(PAGE_SIZE * cc->cluster_size);
> +	return 0;
> +}
> +
> +static void lzo_destroy_compress_ctx(struct compress_ctx *cc)
> +{
> +	kvfree(cc->private);
> +	cc->private = NULL;
> +}
> +
> +static int lzo_compress_pages(struct compress_ctx *cc)
> +{
> +	int ret;
> +
> +	ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
> +					&cc->clen, cc->private);
> +	if (ret != LZO_E_OK) {
> +		printk_ratelimited("%sF2FS-fs: lzo compress failed, ret:%d\n",
> +								KERN_ERR, ret);
> +		return -EIO;
> +	}
> +	return 0;
> +}

Why not using f2fs_err()?  Same in lots of other places.

> +
> +static int lzo_decompress_pages(struct decompress_io_ctx *dic)
> +{
> +	int ret;
> +
> +	ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
> +						dic->rbuf, &dic->rlen);
> +	if (ret != LZO_E_OK) {
> +		printk_ratelimited("%sF2FS-fs: lzo decompress failed, ret:%d\n",
> +								KERN_ERR, ret);
> +		return -EIO;
> +	}
> +
> +	if (dic->rlen != PAGE_SIZE * dic->cluster_size) {
> +		printk_ratelimited("%sF2FS-fs: lzo invalid rlen:%zu, "
> +					"expected:%lu\n", KERN_ERR, dic->rlen,
> +					PAGE_SIZE * dic->cluster_size);
> +		return -EIO;
> +	}
> +	return 0;
> +}
> +
> +static const struct f2fs_compress_ops f2fs_lzo_ops = {
> +	.init_compress_ctx	= lzo_init_compress_ctx,
> +	.destroy_compress_ctx	= lzo_destroy_compress_ctx,
> +	.compress_pages		= lzo_compress_pages,
> +	.decompress_pages	= lzo_decompress_pages,
> +};
> +
> +static int lz4_init_compress_ctx(struct compress_ctx *cc)
> +{
> +	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
> +				LZO1X_MEM_COMPRESS, GFP_KERNEL);

Why is it using LZO1X_MEM_COMPRESS for LZ4?

> +	if (!cc->private)
> +		return -ENOMEM;
> +
> +	cc->clen = LZ4_compressBound(PAGE_SIZE * cc->cluster_size);
> +	return 0;
> +}
> +
> +static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
> +{
> +	kvfree(cc->private);
> +	cc->private = NULL;
> +}
> +
> +static int lz4_compress_pages(struct compress_ctx *cc)
> +{
> +	int len;
> +
> +	len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
> +						cc->clen, cc->private);
> +	if (!len) {
> +		printk_ratelimited("%sF2FS-fs: lz4 compress failed\n",
> +								KERN_ERR);
> +		return -EIO;
> +	}
> +	cc->clen = len;
> +	return 0;
> +}
> +
> +static int lz4_decompress_pages(struct decompress_io_ctx *dic)
> +{
> +	int ret;
> +
> +	ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
> +						dic->clen, dic->rlen);
> +	if (ret < 0) {
> +		printk_ratelimited("%sF2FS-fs: lz4 decompress failed, ret:%d\n",
> +								KERN_ERR, ret);
> +		return -EIO;
> +	}
> +
> +	if (ret != PAGE_SIZE * dic->cluster_size) {
> +		printk_ratelimited("%sF2FS-fs: lz4 invalid rlen:%zu, "
> +					"expected:%lu\n", KERN_ERR, dic->rlen,
> +					PAGE_SIZE * dic->cluster_size);
> +		return -EIO;
> +	}
> +	return 0;
> +}
> +
> +static const struct f2fs_compress_ops f2fs_lz4_ops = {
> +	.init_compress_ctx	= lz4_init_compress_ctx,
> +	.destroy_compress_ctx	= lz4_destroy_compress_ctx,
> +	.compress_pages		= lz4_compress_pages,
> +	.decompress_pages	= lz4_decompress_pages,
> +};
> +
> +static void f2fs_release_cluster_pages(struct compress_ctx *cc)
> +{
> +	int i;
> +
> +	for (i = 0; i < cc->nr_rpages; i++) {
> +		inode_dec_dirty_pages(cc->inode);
> +		unlock_page(cc->rpages[i]);
> +	}
> +}
> +
> +static struct page *f2fs_grab_page(void)
> +{
> +	struct page *page;
> +
> +	page = alloc_pages(GFP_KERNEL, 0);

This should use alloc_page(), not alloc_pages().

> +	if (!page)
> +		return NULL;
> +	lock_page(page);
> +	return page;
> +}
> +
> +static int f2fs_compress_pages(struct compress_ctx *cc)
> +{
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
> +	struct f2fs_inode_info *fi = F2FS_I(cc->inode);
> +	const struct f2fs_compress_ops *cops =
> +				sbi->cops[fi->i_compress_algorithm];
> +	unsigned int max_len, nr_cpages;
> +	int i, ret;
> +
> +	trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
> +				cc->cluster_size, fi->i_compress_algorithm);
> +
> +	ret = cops->init_compress_ctx(cc);
> +	if (ret)
> +		goto out;
> +
> +	max_len = COMPRESS_HEADER_SIZE + cc->clen;
> +	cc->nr_cpages = roundup(max_len, PAGE_SIZE) / PAGE_SIZE;
> +
> +	cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
> +					cc->nr_cpages, GFP_KERNEL);
> +	if (!cc->cpages) {
> +		ret = -ENOMEM;
> +		goto destroy_compress_ctx;
> +	}
> +
> +	for (i = 0; i < cc->nr_cpages; i++) {
> +		cc->cpages[i] = f2fs_grab_page();
> +		if (!cc->cpages[i]) {
> +			ret = -ENOMEM;
> +			goto out_free_cpages;
> +		}

If this fails, then at out_free_cpages it will dereference a NULL pointer in
cc->cpages[i].

> +	}
> +
> +	cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL);
> +	if (!cc->rbuf) {
> +		ret = -ENOMEM;
> +		goto destroy_compress_ctx;
> +	}

Wrong error label.  Should be out_free_cpages.

> +
> +	cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL);
> +	if (!cc->cbuf) {
> +		ret = -ENOMEM;
> +		goto out_vunmap_rbuf;
> +	}

It would be sufficient to map these pages read-only, i.e. use PAGE_KERNEL_RO.

> +
> +	ret = cops->compress_pages(cc);
> +	if (ret)
> +		goto out_vunmap_cbuf;
> +
> +	max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE;
> +
> +	if (cc->clen > max_len) {
> +		ret = -EAGAIN;
> +		goto out_vunmap_cbuf;
> +	}

Since we already know the max length we're willing to compress to (the max
length for any space to be saved), why is more space than that being allocated?
LZ4_compress_default() will return an error if there isn't enough space, so that
error could just be used as the indication to store the data uncompressed.

> +
> +	cc->cbuf->clen = cpu_to_le32(cc->clen);
> +	cc->cbuf->chksum = 0;

What is the point of the chksum field?  It's always set to 0 and never checked.

> +
> +	vunmap(cc->cbuf);
> +	vunmap(cc->rbuf);
> +
> +	nr_cpages = roundup(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE) /
> +								PAGE_SIZE;
> +
> +	for (i = nr_cpages; i < cc->nr_cpages; i++) {
> +		f2fs_put_compressed_page(cc->cpages[i]);
> +		cc->cpages[i] = NULL;
> +	}
> +
> +	cc->nr_cpages = nr_cpages;
> +
> +	trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
> +							cc->clen, ret);
> +	return 0;
> +out_vunmap_cbuf:
> +	vunmap(cc->cbuf);
> +out_vunmap_rbuf:
> +	vunmap(cc->rbuf);
> +out_free_cpages:
> +	for (i = 0; i < cc->nr_cpages; i++)
> +		f2fs_put_compressed_page(cc->cpages[i]);
> +	kvfree(cc->cpages);
> +	cc->cpages = NULL;
> +destroy_compress_ctx:
> +	cops->destroy_compress_ctx(cc);
> +out:
> +	trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
> +							cc->clen, ret);
> +	return ret;
> +}
> +
> +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
> +{
> +	struct decompress_io_ctx *dic =
> +			(struct decompress_io_ctx *)page_private(page);
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
> +	struct f2fs_inode_info *fi= F2FS_I(dic->inode);
> +	const struct f2fs_compress_ops *cops =
> +			sbi->cops[fi->i_compress_algorithm];

Where is it checked that i_compress_algorithm is a valid compression algorithm?

> +	int ret;
> +
> +	dec_page_count(sbi, F2FS_RD_DATA);
> +
> +	if (bio->bi_status)
> +		dic->err = true;
> +
> +	if (refcount_dec_not_one(&dic->ref))
> +		return;
> +
> +	trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
> +				dic->cluster_size, fi->i_compress_algorithm);
> +
> +	/* submit partial compressed pages */
> +	if (dic->err) {
> +		ret = dic->err;

This sets 'ret' to a bool, whereas elsewhere it's set to a negative error value.
Which one is it?

> +		goto out_free_dic;
> +	}
> +
> +	dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
> +	if (!dic->rbuf) {
> +		ret = -ENOMEM;
> +		goto out_free_dic;
> +	}
> +
> +	dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL);
> +	if (!dic->cbuf) {
> +		ret = -ENOMEM;
> +		goto out_vunmap_rbuf;
> +	}

It would be sufficient to map the source pages read-only.

> +
> +	dic->clen = le32_to_cpu(dic->cbuf->clen);
> +	dic->rlen = PAGE_SIZE * dic->cluster_size;

Shouldn't it also be verified that the reserved header fields are 0?
Otherwise, it may be difficult to use them for anything in the future.

> +
> +	if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
> +		ret = -EFAULT;
> +		goto out_vunmap_cbuf;
> +	}

EFAULT isn't an appropriate error code for corrupt on-disk data.  It should be
EFSCORRUPTED.

> +
> +	ret = cops->decompress_pages(dic);
> +
> +out_vunmap_cbuf:
> +	vunmap(dic->cbuf);
> +out_vunmap_rbuf:
> +	vunmap(dic->rbuf);
> +out_free_dic:
> +	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);

This is passing a -errno value to a function that takes a bool.

> +	f2fs_free_dic(dic);
> +
> +	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
> +							dic->clen, ret);

This is freeing 'dic' and then immediately using it again...

> +
> +static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
> +{
> +	if (cc->cluster_idx == NULL_CLUSTER)
> +		return true;
> +	return cc->cluster_idx == cluster_idx(cc, index);
> +}
> +
> +bool f2fs_cluster_is_empty(struct compress_ctx *cc)
> +{
> +	return cc->nr_rpages == 0;
> +}
> +
> +static bool f2fs_cluster_is_full(struct compress_ctx *cc)
> +{
> +	return cc->cluster_size == cc->nr_rpages;
> +}
> +
> +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
> +{
> +	if (f2fs_cluster_is_empty(cc))
> +		return true;
> +	if (f2fs_cluster_is_full(cc))
> +		return false;
> +	return is_page_in_cluster(cc, index);
> +}

Why is the f2fs_cluster_is_full() check needed in f2fs_cluster_can_merge_page()?
If all pages of the cluster have already been added, then the next one can't be
in the same cluster.

> +
> +static bool __cluster_may_compress(struct compress_ctx *cc)
> +{
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
> +	loff_t i_size = i_size_read(cc->inode);
> +	const pgoff_t end_index = ((unsigned long long)i_size)
> +					>> PAGE_SHIFT;
> +	unsigned offset;
> +	int i;
> +
> +	for (i = 0; i < cc->cluster_size; i++) {
> +		struct page *page = cc->rpages[i];
> +
> +		f2fs_bug_on(sbi, !page);
> +
> +		if (unlikely(f2fs_cp_error(sbi)))
> +			return false;
> +		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
> +			return false;
> +		if (f2fs_is_drop_cache(cc->inode))
> +			return false;
> +		if (f2fs_is_volatile_file(cc->inode))
> +			return false;
> +
> +		offset = i_size & (PAGE_SIZE - 1);
> +		if ((page->index > end_index) ||
> +			(page->index == end_index && !offset))
> +			return false;

No need to have a special case for when i_size is a multiple of the page size.
Just replace end_index with 'nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE)' and
check for page->index >= nr_pages.

> +	}
> +	return true;
> +}
> +
> +int f2fs_is_cluster_existed(struct compress_ctx *cc)
> +{

This function name doesn't make sense.  "is" is present tense whereas "existed"
is past tense.  Also, the name implies it returns a bool, whereas actually it
returns a negative errno value, 1, or 2.

> +out_fail:
> +	/* TODO: revoke partially updated block addresses */
> +	for (i += 1; i < cc->cluster_size; i++) {
> +		if (!cc->rpages[i])
> +			continue;
> +		redirty_page_for_writepage(wbc, cc->rpages[i]);
> +		unlock_page(cc->rpages[i]);
> +	}
> +	return err;

Un-addressed TODO.

> +static void f2fs_init_compress_ops(struct f2fs_sb_info *sbi)
> +{
> +	sbi->cops[COMPRESS_LZO] = &f2fs_lzo_ops;
> +	sbi->cops[COMPRESS_LZ4] = &f2fs_lz4_ops;
> +}

Why are the compression operations a per-superblock thing?  Seems this should be
a global table.

> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index ba3bcf4c7889..bac96c3a8bc9 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -41,6 +41,9 @@ static bool __is_cp_guaranteed(struct page *page)
>  	if (!mapping)
>  		return false;
>  
> +	if (f2fs_is_compressed_page(page))
> +		return false;
> +
>  	inode = mapping->host;
>  	sbi = F2FS_I_SB(inode);
>  
> @@ -73,19 +76,19 @@ static enum count_type __read_io_type(struct page *page)
>  
>  /* postprocessing steps for read bios */
>  enum bio_post_read_step {
> -	STEP_INITIAL = 0,
>  	STEP_DECRYPT,
> +	STEP_DECOMPRESS,
>  	STEP_VERITY,
>  };
>  
>  struct bio_post_read_ctx {
>  	struct bio *bio;
> +	struct f2fs_sb_info *sbi;
>  	struct work_struct work;
> -	unsigned int cur_step;
>  	unsigned int enabled_steps;
>  };
>  
> -static void __read_end_io(struct bio *bio)
> +static void __read_end_io(struct bio *bio, bool compr, bool verity)
>  {
>  	struct page *page;
>  	struct bio_vec *bv;
> @@ -94,6 +97,11 @@ static void __read_end_io(struct bio *bio)
>  	bio_for_each_segment_all(bv, bio, iter_all) {
>  		page = bv->bv_page;
>  
> +		if (compr && PagePrivate(page)) {
> +			f2fs_decompress_pages(bio, page, verity);
> +			continue;
> +		}
> +
>  		/* PG_error was set if any post_read step failed */
>  		if (bio->bi_status || PageError(page)) {
>  			ClearPageUptodate(page);
> @@ -110,60 +118,67 @@ static void __read_end_io(struct bio *bio)
>  	bio_put(bio);
>  }
>  
> +static void f2fs_decompress_bio(struct bio *bio, bool verity)
> +{
> +	__read_end_io(bio, true, verity);
> +}
> +
>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
>  
> -static void decrypt_work(struct work_struct *work)
> +static void decrypt_work(struct bio_post_read_ctx *ctx)
>  {
> -	struct bio_post_read_ctx *ctx =
> -		container_of(work, struct bio_post_read_ctx, work);
> -
>  	fscrypt_decrypt_bio(ctx->bio);
> +}
> +
> +static void decompress_work(struct bio_post_read_ctx *ctx, bool verity)
> +{
> +	f2fs_decompress_bio(ctx->bio, verity);
> +}
>  
> -	bio_post_read_processing(ctx);
> +static void verity_work(struct bio_post_read_ctx *ctx)
> +{
> +	fsverity_verify_bio(ctx->bio);
>  }
>  
> -static void verity_work(struct work_struct *work)
> +static void f2fs_post_read_work(struct work_struct *work)
>  {
>  	struct bio_post_read_ctx *ctx =
>  		container_of(work, struct bio_post_read_ctx, work);
>  
> -	fsverity_verify_bio(ctx->bio);
> +	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
> +		decrypt_work(ctx);
>  
> -	bio_post_read_processing(ctx);
> +	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
> +		decompress_work(ctx,
> +			ctx->enabled_steps & (1 << STEP_VERITY));
> +		return;
> +	}
> +
> +	if (ctx->enabled_steps & (1 << STEP_VERITY))
> +		verity_work(ctx);
> +
> +	__read_end_io(ctx->bio, false, false);
> +}
> +
> +static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
> +						struct work_struct *work)
> +{
> +	queue_work(sbi->post_read_wq, work);
>  }
>  
>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
>  {
> -	/*
> -	 * We use different work queues for decryption and for verity because
> -	 * verity may require reading metadata pages that need decryption, and
> -	 * we shouldn't recurse to the same workqueue.
> -	 */

Why is it okay (i.e., no deadlocks) to no longer use different work queues for
decryption and for verity?  See the comment above which is being deleted.

> +			bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
> +					is_readahead ? REQ_RAHEAD : 0,
> +					page->index);
> +			if (IS_ERR(bio)) {
> +				ret = PTR_ERR(bio);
> +				bio = NULL;
> +				dic->err = true;

'err' conventionally means a -errno value.  Please call this 'failed' instead.

> +	/* TODO: cluster can be compressed due to race with .writepage */
> +

Another un-addressed TODO.

> +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
> +{
> +	if (!f2fs_sb_has_encrypt(sbi) &&
> +		!f2fs_sb_has_compression(sbi))
> +		return 0;
> +
> +	sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
> +						 WQ_UNBOUND | WQ_HIGHPRI,
> +						 num_online_cpus());

post_read_wq is also needed if verity is enabled.

> +/* For compression */
> +enum compress_algrithm_type {
> +	COMPRESS_LZO,
> +	COMPRESS_LZ4,
> +	COMPRESS_MAX,
> +};

"algorithm" is misspelled.

> +
> +struct compress_data {
> +	__le32 clen;
> +	__le32 chksum;
> +	__le32 reserved[4];
> +	char cdata[];
> +};

cdata is binary, not a string.  So it should be 'u8', not 'char'.

> +
> +struct compress_ctx {
> +	struct inode *inode;
> +	unsigned int cluster_size;
> +	unsigned int cluster_idx;
> +	struct page **rpages;
> +	unsigned int nr_rpages;
> +	struct page **cpages;
> +	unsigned int nr_cpages;
> +	void *rbuf;
> +	struct compress_data *cbuf;
> +	size_t rlen;
> +	size_t clen;
> +	void *private;
> +};
> +
> +#define F2FS_COMPRESSED_PAGE_MAGIC	0xF5F2C000
> +struct compress_io_ctx {
> +	u32 magic;
> +	struct inode *inode;
> +	refcount_t ref;
> +	struct page **rpages;
> +	unsigned int nr_rpages;
> +};
> +
> +struct decompress_io_ctx {
> +	struct inode *inode;
> +	refcount_t ref;
> +	struct page **rpages;		/* raw pages from page cache */
> +	unsigned int nr_rpages;
> +	struct page **cpages;		/* pages contain compressed data */
> +	unsigned int nr_cpages;
> +	struct page **tpages;		/* temp pages to pad hole in cluster */
> +	void *rbuf;
> +	struct compress_data *cbuf;
> +	size_t rlen;
> +	size_t clen;
> +	unsigned int cluster_idx;
> +	unsigned int cluster_size;
> +	bool err;
> +};

Please add comments properly documenting these structures.

>  struct f2fs_private_dio {
> @@ -2375,6 +2473,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
>  /*
>   * On-disk inode flags (f2fs_inode::i_flags)
>   */
> +#define F2FS_COMPR_FL			0x00000004 /* Compress file */
> +#define F2FS_NOCOMP_FL			0x00000400 /* Don't compress */
>  #define F2FS_SYNC_FL			0x00000008 /* Synchronous updates */
>  #define F2FS_IMMUTABLE_FL		0x00000010 /* Immutable file */
>  #define F2FS_APPEND_FL			0x00000020 /* writes to file may only append */

Please keep these in numerical order.

> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index 386ad54c13c3..e84ef90ffdee 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -407,6 +407,20 @@ static int do_read_inode(struct inode *inode)
>  		fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
>  	}
>  
> +	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi)) {
> +		if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
> +					i_log_cluster_size)) {
> +			fi->i_compressed_blocks =
> +					le64_to_cpu(ri->i_compressed_blocks);
> +			fi->i_compress_algorithm = ri->i_compress_algorithm;
> +			fi->i_log_cluster_size = ri->i_log_cluster_size;
> +			fi->i_cluster_size = 1 << fi->i_log_cluster_size;
> +		}
> +
> +		if ((fi->i_flags & FS_COMPR_FL) && f2fs_may_compress(inode))
> +			set_inode_flag(inode, FI_COMPRESSED_FILE);
> +	}

Need to validate that these fields are valid.

> @@ -119,6 +119,20 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
>  	if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
>  		set_inode_flag(inode, FI_PROJ_INHERIT);
>  
> +	if (f2fs_sb_has_compression(sbi)) {
> +		F2FS_I(inode)->i_compress_algorithm =
> +				F2FS_OPTION(sbi).compress_algorithm;
> +		F2FS_I(inode)->i_log_cluster_size =
> +				F2FS_OPTION(sbi).compress_log_size;
> +		F2FS_I(inode)->i_cluster_size =
> +				1 << F2FS_I(inode)->i_log_cluster_size;
> +

Why are these compression fields being set on uncompressed files?

> @@ -810,6 +817,66 @@ static int parse_options(struct super_block *sb, char *options)
>  		case Opt_checkpoint_enable:
>  			clear_opt(sbi, DISABLE_CHECKPOINT);
>  			break;
> +		case Opt_compress_algorithm:
> +			if (!f2fs_sb_has_compression(sbi)) {
> +				f2fs_err(sbi, "Compression feature if off");
> +				return -EINVAL;

"if off" => "is off"

> +			}
> +			name = match_strdup(&args[0]);
> +			if (!name)
> +				return -ENOMEM;
> +			if (strlen(name) == 3 && !strncmp(name, "lzo", 3)) {

!strcmp(name, "lzo")

> +				F2FS_OPTION(sbi).compress_algorithm =
> +								COMPRESS_LZO;
> +			} else if (strlen(name) == 3 &&
> +					!strncmp(name, "lz4", 3)) {

!strcmp(name, "lz4")

> +				F2FS_OPTION(sbi).compress_algorithm =
> +								COMPRESS_LZ4;
> +			} else {
> +				kvfree(name);

Why not kfree()?

> +				return -EINVAL;
> +			}
> +			kvfree(name);
> +		case Opt_compress_log_size:
> +			if (!f2fs_sb_has_compression(sbi)) {
> +				f2fs_err(sbi, "Compression feature if off");
> +				return -EINVAL;
> +			}

"if off" => "is off"

> +			if (args->from && match_int(args, &arg))
> +				return -EINVAL;
> +			if (arg < MIN_COMPRESS_LOG_SIZE ||
> +				arg > MAX_COMPRESS_LOG_SIZE) {
> +				f2fs_err(sbi,
> +					"Compress cluster log size if out of range");

"if out of range" => "is out of range"

> +				return -EINVAL;
> +			}
> +			F2FS_OPTION(sbi).compress_log_size = arg;
> +			break;
> +		case Opt_compress_extension:
> +			if (!f2fs_sb_has_compression(sbi)) {
> +				f2fs_err(sbi, "Compression feature if off");

"if off" => "is off"

> +				return -EINVAL;
> +			}
> +			name = match_strdup(&args[0]);
> +			if (!name)
> +				return -ENOMEM;
> +
> +			ext = F2FS_OPTION(sbi).extensions;
> +			ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
> +
> +			if (strlen(name) >= F2FS_EXTENSION_LEN ||
> +				ext_cnt >= COMPRESS_EXT_NUM) {
> +				f2fs_err(sbi,
> +					"invalid extension length/number");
> +				kvfree(name);
> +				return -EINVAL;
> +			}
> +
> +			strcpy(ext[ext_cnt], name);
> +			F2FS_OPTION(sbi).compress_ext_cnt++;
> +			kvfree(name);

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-27 22:50   ` Eric Biggers
@ 2019-10-28  2:33     ` Chao Yu
  2019-10-29  8:33     ` Chao Yu
  1 sibling, 0 replies; 32+ messages in thread
From: Chao Yu @ 2019-10-28  2:33 UTC (permalink / raw)
  To: Jaegeuk Kim, Eric Biggers; +Cc: linux-kernel, linux-f2fs-devel

Eric, Jaegeuk,

On 2019/10/28 6:50, Eric Biggers wrote:
> On Tue, Oct 22, 2019 at 10:16:02AM -0700, Jaegeuk Kim wrote:

Let me update the patch according to comments.

Thanks,


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-27 22:50   ` Eric Biggers
  2019-10-28  2:33     ` Chao Yu
@ 2019-10-29  8:33     ` Chao Yu
  2019-10-30  2:55       ` Eric Biggers
  1 sibling, 1 reply; 32+ messages in thread
From: Chao Yu @ 2019-10-29  8:33 UTC (permalink / raw)
  To: Eric Biggers; +Cc: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2019/10/28 6:50, Eric Biggers wrote:
>> +bool f2fs_is_compressed_page(struct page *page)
>> +{
>> +	if (!page_private(page))
>> +		return false;
>> +	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
>> +		return false;
>> +	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
>> +}
> 
> This code implies that there can be multiple page private structures each of
> which has a different magic number.  But I only see F2FS_COMPRESSED_PAGE_MAGIC.
> Where in the code is the other one(s)?

I'm not sure I understood you correctly; did you mean we need to introduce
f2fs_is_atomic_written_page() and f2fs_is_dummy_written_page() like
f2fs_is_compressed_page()?

> 
>> +
>> +static void f2fs_set_compressed_page(struct page *page,
>> +		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
>> +{
>> +	SetPagePrivate(page);
>> +	set_page_private(page, (unsigned long)data);
>> +
>> +	/* i_crypto_info and iv index */
>> +	page->index = index;
>> +	page->mapping = inode->i_mapping;
>> +	if (r)
>> +		refcount_inc(r);
>> +}
> 
> It isn't really appropriate to create fake pagecache pages like this.  Did you
> consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?

We need to store i_crypto_info and iv index somewhere, in order to pass them to
fscrypt_decrypt_block_inplace(), where did you suggest to store them?

>> +
>> +void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
>> +{
>> +	kvfree(cc->rpages);
>> +}
> 
> The memory is allocated with kzalloc(), so why is it freed with kvfree() and not
> just kfree()?

It was allocated by f2fs_*alloc(), which will fall back to kvmalloc() once
kmalloc() fails.

>> +static int lzo_compress_pages(struct compress_ctx *cc)
>> +{
>> +	int ret;
>> +
>> +	ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
>> +					&cc->clen, cc->private);
>> +	if (ret != LZO_E_OK) {
>> +		printk_ratelimited("%sF2FS-fs: lzo compress failed, ret:%d\n",
>> +								KERN_ERR, ret);
>> +		return -EIO;
>> +	}
>> +	return 0;
>> +}
> 
> Why not using f2fs_err()?  Same in lots of other places.

We use printk_ratelimited at some points where we can afford to lose logs;
otherwise we use f2fs_{err,warn,...} to record as much info as possible for
troubleshooting.

>> +
>> +	ret = cops->compress_pages(cc);
>> +	if (ret)
>> +		goto out_vunmap_cbuf;
>> +
>> +	max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE;
>> +
>> +	if (cc->clen > max_len) {
>> +		ret = -EAGAIN;
>> +		goto out_vunmap_cbuf;
>> +	}
> 
> Since we already know the max length we're willing to compress to (the max
> length for any space to be saved), why is more space than that being allocated?
> LZ4_compress_default() will return an error if there isn't enough space, so that
> error could just be used as the indication to store the data uncompressed.

AFAIK, there is no common error code returned by all compression
algorithms indicating there is no room for the limited target size; however, we
need that information in order to fall back to writing raw pages. Any better idea?

> 
>> +
>> +	cc->cbuf->clen = cpu_to_le32(cc->clen);
>> +	cc->cbuf->chksum = 0;
> 
> What is the point of the chksum field?  It's always set to 0 and never checked.

When I wrote the initial code, I worried that I might miss checking some SPO corner
cases where we failed to write the whole cluster, so I added that field to help
catch such cases; however, I didn't have time to cover those cases, which left
unfinished code there... :( I'm okay with deleting it in the formal version.

BTW, for data checksum feature, I guess we need to reconstruct dnode layout to
cover both compressed/non-compressed data.

> 
>> +
>> +static bool __cluster_may_compress(struct compress_ctx *cc)
>> +{
>> +	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
>> +	loff_t i_size = i_size_read(cc->inode);
>> +	const pgoff_t end_index = ((unsigned long long)i_size)
>> +					>> PAGE_SHIFT;
>> +	unsigned offset;
>> +	int i;
>> +
>> +	for (i = 0; i < cc->cluster_size; i++) {
>> +		struct page *page = cc->rpages[i];
>> +
>> +		f2fs_bug_on(sbi, !page);
>> +
>> +		if (unlikely(f2fs_cp_error(sbi)))
>> +			return false;
>> +		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
>> +			return false;
>> +		if (f2fs_is_drop_cache(cc->inode))
>> +			return false;
>> +		if (f2fs_is_volatile_file(cc->inode))
>> +			return false;
>> +
>> +		offset = i_size & (PAGE_SIZE - 1);
>> +		if ((page->index > end_index) ||
>> +			(page->index == end_index && !offset))
>> +			return false;
> 
> No need to have a special case for when i_size is a multiple of the page size.
> Just replace end_index with 'nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE)' and
> check for page->index >= nr_pages.

That is copied from f2fs_write_data_page(); let's clean it up in a separate patch.

> 
>> +out_fail:
>> +	/* TODO: revoke partially updated block addresses */
>> +	for (i += 1; i < cc->cluster_size; i++) {
>> +		if (!cc->rpages[i])
>> +			continue;
>> +		redirty_page_for_writepage(wbc, cc->rpages[i]);
>> +		unlock_page(cc->rpages[i]);
>> +	}
>> +	return err;
> 
> Un-addressed TODO.

Will fix a little later.

>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
>>  {
>> -	/*
>> -	 * We use different work queues for decryption and for verity because
>> -	 * verity may require reading metadata pages that need decryption, and
>> -	 * we shouldn't recurse to the same workqueue.
>> -	 */
> 
> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
> decryption and for verity?  See the comment above which is being deleted.

Could you explain more about how deadlock happen? or share me a link address if
you have described that case somewhere?

> 
>> +	/* TODO: cluster can be compressed due to race with .writepage */
>> +
> 
> Another un-addressed TODO.

Will fix a little later.

> 
>> +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
>> +{
>> +	if (!f2fs_sb_has_encrypt(sbi) &&
>> +		!f2fs_sb_has_compression(sbi))
>> +		return 0;
>> +
>> +	sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
>> +						 WQ_UNBOUND | WQ_HIGHPRI,
>> +						 num_online_cpus());
> 
> post_read_wq is also needed if verity is enabled.

Yes, we missed this, as verity had not been merged yet when we implemented this...

Thanks,


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-29  8:33     ` Chao Yu
@ 2019-10-30  2:55       ` Eric Biggers
  2019-10-30  8:43         ` Chao Yu
  0 siblings, 1 reply; 32+ messages in thread
From: Eric Biggers @ 2019-10-30  2:55 UTC (permalink / raw)
  To: Chao Yu; +Cc: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On Tue, Oct 29, 2019 at 04:33:36PM +0800, Chao Yu wrote:
> On 2019/10/28 6:50, Eric Biggers wrote:
> >> +bool f2fs_is_compressed_page(struct page *page)
> >> +{
> >> +	if (!page_private(page))
> >> +		return false;
> >> +	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
> >> +		return false;
> >> +	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
> >> +}
> > 
> > This code implies that there can be multiple page private structures each of
> > which has a different magic number.  But I only see F2FS_COMPRESSED_PAGE_MAGIC.
> > Where in the code is the other one(s)?
> 
> I'm not sure I understood you correctly, did you mean it needs to introduce
> f2fs_is_atomic_written_page() and f2fs_is_dummy_written_page() like
> f2fs_is_compressed_page()?
> 

No, I'm asking what is the case where the line

	*((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC

returns false?

> > 
> >> +
> >> +static void f2fs_set_compressed_page(struct page *page,
> >> +		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
> >> +{
> >> +	SetPagePrivate(page);
> >> +	set_page_private(page, (unsigned long)data);
> >> +
> >> +	/* i_crypto_info and iv index */
> >> +	page->index = index;
> >> +	page->mapping = inode->i_mapping;
> >> +	if (r)
> >> +		refcount_inc(r);
> >> +}
> > 
> > It isn't really appropriate to create fake pagecache pages like this.  Did you
> > consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?
> 
> We need to store i_crypto_info and iv index somewhere, in order to pass them to
> fscrypt_decrypt_block_inplace(), where did you suggest to store them?
> 

The same place where the pages are stored.

> >> +
> >> +void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
> >> +{
> >> +	kvfree(cc->rpages);
> >> +}
> > 
> > The memory is allocated with kzalloc(), so why is it freed with kvfree() and not
> > just kfree()?
> 
> It was allocated by f2fs_*alloc() which will fallback to kvmalloc() once
> kmalloc() failed.

This seems to be a bug in f2fs_kmalloc() -- it inappropriately falls back to
kvmalloc().  As per its name, it should only use kmalloc().  f2fs_kvmalloc()
already exists, so it can be used when the fallback is wanted.

> 
> >> +static int lzo_compress_pages(struct compress_ctx *cc)
> >> +{
> >> +	int ret;
> >> +
> >> +	ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
> >> +					&cc->clen, cc->private);
> >> +	if (ret != LZO_E_OK) {
> >> +		printk_ratelimited("%sF2FS-fs: lzo compress failed, ret:%d\n",
> >> +								KERN_ERR, ret);
> >> +		return -EIO;
> >> +	}
> >> +	return 0;
> >> +}
> > 
> > Why not using f2fs_err()?  Same in lots of other places.
> 
> We use printk_ratelimited at some points where we can afford to lose logs,
> otherwise we use f2fs_{err,warn...} to record info as much as possible for
> troubleshoot.
> 

It used to be the case that f2fs_msg() was ratelimited.  What stops it from
spamming the logs now?

The problem with a bare printk is that it doesn't show which filesystem instance
the message is coming from.

> >> +
> >> +	ret = cops->compress_pages(cc);
> >> +	if (ret)
> >> +		goto out_vunmap_cbuf;
> >> +
> >> +	max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE;
> >> +
> >> +	if (cc->clen > max_len) {
> >> +		ret = -EAGAIN;
> >> +		goto out_vunmap_cbuf;
> >> +	}
> > 
> > Since we already know the max length we're willing to compress to (the max
> > length for any space to be saved), why is more space than that being allocated?
> > LZ4_compress_default() will return an error if there isn't enough space, so that
> > error could just be used as the indication to store the data uncompressed.
> 
> AFAIK, there is no such common error code returned from all compression
> algorithms indicating there is no room for limited target size, however we need
> that information to fallback to write raw pages. Any better idea?
> 

"Not enough room" is the only reasonable way for compression to fail, so all
that's needed is the ability for compression to report errors at all.  What
specifically prevents this approach from working?

> >>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
> >>  {
> >> -	/*
> >> -	 * We use different work queues for decryption and for verity because
> >> -	 * verity may require reading metadata pages that need decryption, and
> >> -	 * we shouldn't recurse to the same workqueue.
> >> -	 */
> > 
> > Why is it okay (i.e., no deadlocks) to no longer use different work queues for
> > decryption and for verity?  See the comment above which is being deleted.
> 
> Could you explain more about how deadlock happen? or share me a link address if
> you have described that case somewhere?
> 

The verity work can read pages from the file which require decryption.  I'm
concerned that it could deadlock if the work is scheduled on the same workqueue.
Granted, I'm not an expert in Linux workqueues, so if you've investigated this
and determined that it's safe, can you explain why?

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30  2:55       ` Eric Biggers
@ 2019-10-30  8:43         ` Chao Yu
  2019-10-30 16:50           ` Eric Biggers
  2019-10-30 17:02           ` Eric Biggers
  0 siblings, 2 replies; 32+ messages in thread
From: Chao Yu @ 2019-10-30  8:43 UTC (permalink / raw)
  To: Eric Biggers; +Cc: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2019/10/30 10:55, Eric Biggers wrote:
> On Tue, Oct 29, 2019 at 04:33:36PM +0800, Chao Yu wrote:
>> On 2019/10/28 6:50, Eric Biggers wrote:
>>>> +bool f2fs_is_compressed_page(struct page *page)
>>>> +{
>>>> +	if (!page_private(page))
>>>> +		return false;
>>>> +	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
>>>> +		return false;
>>>> +	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
>>>> +}
>>>
>>> This code implies that there can be multiple page private structures each of
>>> which has a different magic number.  But I only see F2FS_COMPRESSED_PAGE_MAGIC.
>>> Where in the code is the other one(s)?
>>
>> I'm not sure I understood you correctly, did you mean it needs to introduce
>> f2fs_is_atomic_written_page() and f2fs_is_dummy_written_page() like
>> f2fs_is_compressed_page()?
>>
> 
> No, I'm asking what is the case where the line
> 
> 	*((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC
> 
> returns false?

Should be this?

if (!page_private(page))
	return false;
f2fs_bug_on(*((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC)
return true;

> 
>>>
>>>> +
>>>> +static void f2fs_set_compressed_page(struct page *page,
>>>> +		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
>>>> +{
>>>> +	SetPagePrivate(page);
>>>> +	set_page_private(page, (unsigned long)data);
>>>> +
>>>> +	/* i_crypto_info and iv index */
>>>> +	page->index = index;
>>>> +	page->mapping = inode->i_mapping;
>>>> +	if (r)
>>>> +		refcount_inc(r);
>>>> +}
>>>
>>> It isn't really appropriate to create fake pagecache pages like this.  Did you
>>> consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?
>>
>> We need to store i_crypto_info and iv index somewhere, in order to pass them to
>> fscrypt_decrypt_block_inplace(), where did you suggest to store them?
>>
> 
> The same place where the pages are stored.

Still, we would need to allocate space for those fields; is there any strong reason to do so?

> 
>>>> +
>>>> +void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
>>>> +{
>>>> +	kvfree(cc->rpages);
>>>> +}
>>>
>>> The memory is allocated with kzalloc(), so why is it freed with kvfree() and not
>>> just kfree()?
>>
>> It was allocated by f2fs_*alloc() which will fallback to kvmalloc() once
>> kmalloc() failed.
> 
> This seems to be a bug in f2fs_kmalloc() -- it inappropriately falls back to
> kvmalloc().  As per its name, it should only use kmalloc().  f2fs_kvmalloc()
> already exists, so it can be used when the fallback is wanted.

We can introduce f2fs_memalloc() to wrap f2fs_kmalloc() and f2fs_kvmalloc() as
below:

f2fs_memalloc()
{
	mem = f2fs_kmalloc();
	if (mem)
		return mem;
	return f2fs_kvmalloc();
}

It can be used in the specific places where we really need it, like the place
described in 5222595d093e ("f2fs: use kvmalloc, if kmalloc is failed"), where
we introduced the original logic.

> 
>>
>>>> +static int lzo_compress_pages(struct compress_ctx *cc)
>>>> +{
>>>> +	int ret;
>>>> +
>>>> +	ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
>>>> +					&cc->clen, cc->private);
>>>> +	if (ret != LZO_E_OK) {
>>>> +		printk_ratelimited("%sF2FS-fs: lzo compress failed, ret:%d\n",
>>>> +								KERN_ERR, ret);
>>>> +		return -EIO;
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>
>>> Why not using f2fs_err()?  Same in lots of other places.
>>
>> We use printk_ratelimited at some points where we can afford to lose logs,
>> otherwise we use f2fs_{err,warn...} to record info as much as possible for
>> troubleshoot.
>>
> 
> It used to be the case that f2fs_msg() was ratelimited.  What stops it from
> spamming the logs now?

https://lore.kernel.org/patchwork/patch/973837/

> 
> The problem with a bare printk is that it doesn't show which filesystem instance
> the message is coming from.

We can additionally print sbi->sb->s_id, as f2fs_printk() does.

> 
>>>> +
>>>> +	ret = cops->compress_pages(cc);
>>>> +	if (ret)
>>>> +		goto out_vunmap_cbuf;
>>>> +
>>>> +	max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE;
>>>> +
>>>> +	if (cc->clen > max_len) {
>>>> +		ret = -EAGAIN;
>>>> +		goto out_vunmap_cbuf;
>>>> +	}
>>>
>>> Since we already know the max length we're willing to compress to (the max
>>> length for any space to be saved), why is more space than that being allocated?
>>> LZ4_compress_default() will return an error if there isn't enough space, so that
>>> error could just be used as the indication to store the data uncompressed.
>>
>> AFAIK, there is no such common error code returned from all compression
>> algorithms indicating there is no room for limited target size, however we need
>> that information to fallback to write raw pages. Any better idea?
>>
> 
> "Not enough room" is the only reasonable way for compression to fail, so all

At a glance, the compression comments do say it only fails due to running out of
space in dst_buf, but it can actually fail for other reasons as well, as I found
when checking the code:
a) dst_buf is too small
b) src_buf is too large/small
c) wrong step
maybe I missed other cases...

Yeah, we can rule out conditions b)/c) during implementation; however, what I'm
concerned about is that the implementation would be too tightly coupled to the
error handling of all the compression algorithms, as we're not always aware of
changes to their error handling.

> that's needed is the ability for compression to report errors at all.  What
> specifically prevents this approach from working?
> 
>>>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
>>>>  {
>>>> -	/*
>>>> -	 * We use different work queues for decryption and for verity because
>>>> -	 * verity may require reading metadata pages that need decryption, and
>>>> -	 * we shouldn't recurse to the same workqueue.
>>>> -	 */
>>>
>>> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
>>> decryption and for verity?  See the comment above which is being deleted.
>>
>> Could you explain more about how deadlock happen? or share me a link address if
>> you have described that case somewhere?
>>
> 
> The verity work can read pages from the file which require decryption.  I'm
> concerned that it could deadlock if the work is scheduled on the same workqueue.

I assume you've tried a single workqueue and hit a deadlock...

> Granted, I'm not an expert in Linux workqueues, so if you've investigated this
> and determined that it's safe, can you explain why?

I'm not that familiar with workqueues...  I guess it may not be safe if the work is
scheduled on the same CPU where verity is waiting for data? If the work is
scheduled on another CPU, it may be safe.

I can check that before splitting the workqueue for verity and decrypt/decompress.

Thanks,

> 
> - Eric
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-26  1:31     ` Chao Yu
@ 2019-10-30 16:09       ` Jaegeuk Kim
  2019-10-31  2:27         ` Chao Yu
  0 siblings, 1 reply; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-30 16:09 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On 10/26, Chao Yu wrote:
> On 2019/10/26 2:18, Jaegeuk Kim wrote:
> > On 10/24, Chao Yu wrote:
> >> Hi Jaegeuk,
> >>
> >> On 2019/10/23 1:16, Jaegeuk Kim wrote:
> >>> This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
> >>> by allocating fully valid 2MB segment.
> >>>
> >>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> >>> ---
> >>>  fs/f2fs/f2fs.h     |  4 +++-
> >>>  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
> >>>  fs/f2fs/recovery.c |  2 +-
> >>>  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
> >>>  fs/f2fs/segment.h  |  2 ++
> >>>  fs/f2fs/super.c    |  1 +
> >>>  fs/f2fs/sysfs.c    |  2 ++
> >>>  7 files changed, 63 insertions(+), 8 deletions(-)
> >>>
> >>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> >>> index ca342f4c7db1..c681f51e351b 100644
> >>> --- a/fs/f2fs/f2fs.h
> >>> +++ b/fs/f2fs/f2fs.h
> >>> @@ -890,6 +890,7 @@ enum {
> >>>  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
> >>>  	CURSEG_COLD_NODE,	/* indirect node blocks */
> >>>  	NO_CHECK_TYPE,
> >>> +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
> >>>  };
> >>>  
> >>>  struct flush_cmd {
> >>> @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
> >>>  
> >>>  	/* threshold for gc trials on pinned files */
> >>>  	u64 gc_pin_file_threshold;
> >>> +	struct rw_semaphore pin_sem;
> >>>  
> >>>  	/* maximum # of trials to find a victim segment for SSR and GC */
> >>>  	unsigned int max_victim_search;
> >>> @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
> >>>  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
> >>>  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
> >>>  					unsigned int start, unsigned int end);
> >>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
> >>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
> >>>  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
> >>>  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
> >>>  					struct cp_control *cpc);
> >>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> >>> index 29bc0a542759..f6c038e8a6a7 100644
> >>> --- a/fs/f2fs/file.c
> >>> +++ b/fs/f2fs/file.c
> >>> @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
> >>>  	if (off_end)
> >>>  		map.m_len++;
> >>>  
> >>> -	if (f2fs_is_pinned_file(inode))
> >>> -		map.m_seg_type = CURSEG_COLD_DATA;
> >>> +	if (!map.m_len)
> >>> +		return 0;
> >>> +
> >>> +	if (f2fs_is_pinned_file(inode)) {
> >>> +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
> >>> +					sbi->log_blocks_per_seg;
> >>> +		block_t done = 0;
> >>> +
> >>> +		if (map.m_len % sbi->blocks_per_seg)
> >>> +			len += sbi->blocks_per_seg;
> >>>  
> >>> -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
> >>> -						F2FS_GET_BLOCK_PRE_DIO :
> >>> -						F2FS_GET_BLOCK_PRE_AIO));
> >>> +		map.m_len = sbi->blocks_per_seg;
> >>> +next_alloc:
> >>> +		mutex_lock(&sbi->gc_mutex);
> >>> +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
> >>> +		if (err && err != -ENODATA && err != -EAGAIN)
> >>> +			goto out_err;
> >>
> >> To grab enough free space?
> >>
> >> Shouldn't we call
> >>
> >> 	if (has_not_enough_free_secs(sbi, 0, 0)) {
> >> 		mutex_lock(&sbi->gc_mutex);
> >> 		f2fs_gc(sbi, false, false, NULL_SEGNO);
> >> 	}
> > 
> > The above calls gc all the time. Do we need this?
> 
> Hmmm... my concern is why we need to run foreground GC even if there is enough
> free space..

In order to get the free segment easily?

> 
> > 
> >>
> >>> +
> >>> +		down_write(&sbi->pin_sem);
> >>> +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
> >>> +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
> >>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
> >>> +		up_write(&sbi->pin_sem);
> >>> +
> >>> +		done += map.m_len;
> >>> +		len -= map.m_len;
> >>> +		map.m_lblk += map.m_len;
> >>> +		if (!err && len)
> >>> +			goto next_alloc;
> >>> +
> >>> +		map.m_len = done;
> >>> +	} else {
> >>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
> >>> +	}
> >>> +out_err:
> >>>  	if (err) {
> >>>  		pgoff_t last_off;
> >>>  
> >>> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> >>> index 783773e4560d..76477f71d4ee 100644
> >>> --- a/fs/f2fs/recovery.c
> >>> +++ b/fs/f2fs/recovery.c
> >>> @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
> >>>  		f2fs_put_page(page, 1);
> >>>  	}
> >>>  	if (!err)
> >>> -		f2fs_allocate_new_segments(sbi);
> >>> +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
> >>>  	return err;
> >>>  }
> >>>  
> >>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> >>> index 25c750cd0272..253d72c2663c 100644
> >>> --- a/fs/f2fs/segment.c
> >>> +++ b/fs/f2fs/segment.c
> >>> @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
> >>>  	up_read(&SM_I(sbi)->curseg_lock);
> >>>  }
> >>>  
> >>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> >>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
> >>>  {
> >>>  	struct curseg_info *curseg;
> >>>  	unsigned int old_segno;
> >>> @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> >>>  	down_write(&SIT_I(sbi)->sentry_lock);
> >>>  
> >>>  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
> >>> +		if (type != NO_CHECK_TYPE && i != type)
> >>> +			continue;
> >>> +
> >>>  		curseg = CURSEG_I(sbi, i);
> >>>  		old_segno = curseg->segno;
> >>>  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
> >>> @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
> >>>  {
> >>>  	struct sit_info *sit_i = SIT_I(sbi);
> >>>  	struct curseg_info *curseg = CURSEG_I(sbi, type);
> >>> +	bool put_pin_sem = false;
> >>> +
> >>> +	if (type == CURSEG_COLD_DATA) {
> >>> +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
> >>> +		if (down_read_trylock(&sbi->pin_sem)) {
> >>> +			put_pin_sem = true;
> >>> +		} else {
> >>> +			type = CURSEG_WARM_DATA;
> >>> +			curseg = CURSEG_I(sbi, type);
> >>
> >> It will mix pending cold data into warm area... rather than recovering curseg to
> >> write pointer of last cold segment?
> >>
> >> I know maybe that fallocate aligned address could be corner case, but I guess
> >> there should be some better solutions can handle race case more effectively.
> >>
> >> One solution could be: allocating a virtual log header to select free segment as
> >> 2m-aligned space target.
> > 
> > I thought about that, but concluded to avoid too much changes.
> 
> We have an unupstreamed feature which is based on virtual log header, I can
> introduce that basic virtual log fwk, which can be used for aligned allocation
> and later new features, would you like to check that?
> 
> Thanks,
> 
> > 
> >>
> >> Thanks,
> >>
> >>> +		}
> >>> +	} else if (type == CURSEG_COLD_DATA_PINNED) {
> >>> +		type = CURSEG_COLD_DATA;
> >>> +	}
> >>>  
> >>>  	down_read(&SM_I(sbi)->curseg_lock);
> >>>  
> >>> @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
> >>>  	mutex_unlock(&curseg->curseg_mutex);
> >>>  
> >>>  	up_read(&SM_I(sbi)->curseg_lock);
> >>> +
> >>> +	if (put_pin_sem)
> >>> +		up_read(&sbi->pin_sem);
> >>>  }
> >>>  
> >>>  static void update_device_state(struct f2fs_io_info *fio)
> >>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> >>> index 325781a1ae4d..a95467b202ea 100644
> >>> --- a/fs/f2fs/segment.h
> >>> +++ b/fs/f2fs/segment.h
> >>> @@ -313,6 +313,8 @@ struct sit_entry_set {
> >>>   */
> >>>  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
> >>>  {
> >>> +	if (type == CURSEG_COLD_DATA_PINNED)
> >>> +		type = CURSEG_COLD_DATA;
> >>>  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
> >>>  }
> >>>  
> >>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> >>> index f320fd11db48..c02a47ce551b 100644
> >>> --- a/fs/f2fs/super.c
> >>> +++ b/fs/f2fs/super.c
> >>> @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
> >>>  	spin_lock_init(&sbi->dev_lock);
> >>>  
> >>>  	init_rwsem(&sbi->sb_lock);
> >>> +	init_rwsem(&sbi->pin_sem);
> >>>  }
> >>>  
> >>>  static int init_percpu_info(struct f2fs_sb_info *sbi)
> >>> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
> >>> index b558b64a4c9c..f164959e4224 100644
> >>> --- a/fs/f2fs/sysfs.c
> >>> +++ b/fs/f2fs/sysfs.c
> >>> @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
> >>>  	if (f2fs_sb_has_casefold(sbi))
> >>>  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> >>>  				len ? ", " : "", "casefold");
> >>> +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> >>> +				len ? ", " : "", "pin_file");
> >>>  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
> >>>  	return len;
> >>>  }
> >>>
> > .
> > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30  8:43         ` Chao Yu
@ 2019-10-30 16:50           ` Eric Biggers
  2019-10-30 17:22             ` Gao Xiang via Linux-f2fs-devel
                               ` (2 more replies)
  2019-10-30 17:02           ` Eric Biggers
  1 sibling, 3 replies; 32+ messages in thread
From: Eric Biggers @ 2019-10-30 16:50 UTC (permalink / raw)
  To: Chao Yu; +Cc: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
> On 2019/10/30 10:55, Eric Biggers wrote:
> > On Tue, Oct 29, 2019 at 04:33:36PM +0800, Chao Yu wrote:
> >> On 2019/10/28 6:50, Eric Biggers wrote:
> >>>> +bool f2fs_is_compressed_page(struct page *page)
> >>>> +{
> >>>> +	if (!page_private(page))
> >>>> +		return false;
> >>>> +	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
> >>>> +		return false;
> >>>> +	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
> >>>> +}
> >>>
> >>> This code implies that there can be multiple page private structures each of
> >>> which has a different magic number.  But I only see F2FS_COMPRESSED_PAGE_MAGIC.
> >>> Where in the code is the other one(s)?
> >>
> >> I'm not sure I understood you correctly, did you mean it needs to introduce
> >> f2fs_is_atomic_written_page() and f2fs_is_dummy_written_page() like
> >> f2fs_is_compressed_page()?
> >>
> > 
> > No, I'm asking what is the case where the line
> > 
> > 	*((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC
> > 
> > returns false?
> 
> Should be this?
> 
> if (!page_private(page))
> 	return false;
> f2fs_bug_on(*((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC)
> return true;

Yes, that makes more sense, unless there are other cases.

> 
> > 
> >>>
> >>>> +
> >>>> +static void f2fs_set_compressed_page(struct page *page,
> >>>> +		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
> >>>> +{
> >>>> +	SetPagePrivate(page);
> >>>> +	set_page_private(page, (unsigned long)data);
> >>>> +
> >>>> +	/* i_crypto_info and iv index */
> >>>> +	page->index = index;
> >>>> +	page->mapping = inode->i_mapping;
> >>>> +	if (r)
> >>>> +		refcount_inc(r);
> >>>> +}
> >>>
> >>> It isn't really appropriate to create fake pagecache pages like this.  Did you
> >>> consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?
> >>
> >> We need to store i_crypto_info and iv index somewhere, in order to pass them to
> >> fscrypt_decrypt_block_inplace(), where did you suggest to store them?
> >>
> > 
> > The same place where the pages are stored.
> 
> Still we need allocate space for those fields, any strong reason to do so?
> 

page->mapping set implies that the page is a pagecache page.  Faking it could
cause problems with code elsewhere.

> > 
> >>>> +
> >>>> +void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
> >>>> +{
> >>>> +	kvfree(cc->rpages);
> >>>> +}
> >>>
> >>> The memory is allocated with kzalloc(), so why is it freed with kvfree() and not
> >>> just kfree()?
> >>
> >> It was allocated by f2fs_*alloc() which will fallback to kvmalloc() once
> >> kmalloc() failed.
> > 
> > This seems to be a bug in f2fs_kmalloc() -- it inappropriately falls back to
> > kvmalloc().  As per its name, it should only use kmalloc().  f2fs_kvmalloc()
> > already exists, so it can be used when the fallback is wanted.
> 
> We can introduce f2fs_memalloc() to wrap f2fs_kmalloc() and f2fs_kvmalloc() as
> below:
> 
> f2fs_memalloc()
> {
> 	mem = f2fs_kmalloc();
> 	if (mem)
> 		return mem;
> 	return f2fs_kvmalloc();
> }
> 
> It can be used in specified place where we really need it, like the place
> descirbied in 5222595d093e ("f2fs: use kvmalloc, if kmalloc is failed") in where
> we introduced original logic.

No, just use kvmalloc().  The whole point of kvmalloc() is that it tries
kmalloc() and then falls back to vmalloc() if it fails.

- Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30  8:43         ` Chao Yu
  2019-10-30 16:50           ` Eric Biggers
@ 2019-10-30 17:02           ` Eric Biggers
  2019-10-31  2:21             ` Chao Yu
  2019-11-13 13:10             ` Chao Yu
  1 sibling, 2 replies; 32+ messages in thread
From: Eric Biggers @ 2019-10-30 17:02 UTC (permalink / raw)
  To: Chao Yu; +Cc: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
> >>>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
> >>>>  {
> >>>> -	/*
> >>>> -	 * We use different work queues for decryption and for verity because
> >>>> -	 * verity may require reading metadata pages that need decryption, and
> >>>> -	 * we shouldn't recurse to the same workqueue.
> >>>> -	 */
> >>>
> >>> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
> >>> decryption and for verity?  See the comment above which is being deleted.
> >>
> >> Could you explain more about how deadlock happen? or share me a link address if
> >> you have described that case somewhere?
> >>
> > 
> > The verity work can read pages from the file which require decryption.  I'm
> > concerned that it could deadlock if the work is scheduled on the same workqueue.
> 
> I assume you've tried one workqueue, and suffered deadlock..
> 
> > Granted, I'm not an expert in Linux workqueues, so if you've investigated this
> > and determined that it's safe, can you explain why?
> 
> I'm not familiar with workqueue...  I guess it may not safe that if the work is
> scheduled to the same cpu in where verity was waiting for data? if the work is
> scheduled to other cpu, it may be safe.
> 
> I can check that before splitting the workqueue for verity and decrypt/decompress.
> 

Yes this is a real problem, try 'kvm-xfstests -c f2fs/encrypt generic/579'.
The worker thread gets deadlocked in f2fs_read_merkle_tree_page() waiting for
the Merkle tree page to be decrypted.  This is with the v2 compression patch;
it works fine on current mainline.

INFO: task kworker/u5:0:61 blocked for more than 30 seconds.
      Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/u5:0    D    0    61      2 0x80004000
Workqueue: f2fs_post_read_wq f2fs_post_read_work
Call Trace:
 context_switch kernel/sched/core.c:3384 [inline]
 __schedule+0x299/0x6c0 kernel/sched/core.c:4069
 schedule+0x44/0xd0 kernel/sched/core.c:4136
 io_schedule+0x11/0x40 kernel/sched/core.c:5780
 wait_on_page_bit_common mm/filemap.c:1174 [inline]
 wait_on_page_bit mm/filemap.c:1223 [inline]
 wait_on_page_locked include/linux/pagemap.h:527 [inline]
 wait_on_page_locked include/linux/pagemap.h:524 [inline]
 wait_on_page_read mm/filemap.c:2767 [inline]
 do_read_cache_page+0x407/0x660 mm/filemap.c:2810
 read_cache_page+0xd/0x10 mm/filemap.c:2894
 f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
 verify_page+0x110/0x560 fs/verity/verify.c:120
 fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
 verity_work fs/f2fs/data.c:142 [inline]
 f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
 process_one_work+0x225/0x550 kernel/workqueue.c:2269
 worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
 kthread+0x125/0x140 kernel/kthread.c:255
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
INFO: task kworker/u5:1:1140 blocked for more than 30 seconds.
      Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/u5:1    D    0  1140      2 0x80004000
Workqueue: f2fs_post_read_wq f2fs_post_read_work
Call Trace:
 context_switch kernel/sched/core.c:3384 [inline]
 __schedule+0x299/0x6c0 kernel/sched/core.c:4069
 schedule+0x44/0xd0 kernel/sched/core.c:4136
 io_schedule+0x11/0x40 kernel/sched/core.c:5780
 wait_on_page_bit_common mm/filemap.c:1174 [inline]
 wait_on_page_bit mm/filemap.c:1223 [inline]
 wait_on_page_locked include/linux/pagemap.h:527 [inline]
 wait_on_page_locked include/linux/pagemap.h:524 [inline]
 wait_on_page_read mm/filemap.c:2767 [inline]
 do_read_cache_page+0x407/0x660 mm/filemap.c:2810
 read_cache_page+0xd/0x10 mm/filemap.c:2894
 f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
 verify_page+0x110/0x560 fs/verity/verify.c:120
 fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
 verity_work fs/f2fs/data.c:142 [inline]
 f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
 process_one_work+0x225/0x550 kernel/workqueue.c:2269
 worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
 kthread+0x125/0x140 kernel/kthread.c:255
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

Showing all locks held in the system:
1 lock held by khungtaskd/21:
 #0: ffffffff82250520 (rcu_read_lock){....}, at: rcu_lock_acquire.constprop.0+0x0/0x30 include/trace/events/lock.h:13
2 locks held by kworker/u5:0/61:
 #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
 #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
 #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
 #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
 #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
 #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
2 locks held by kworker/u5:1/1140:
 #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
 #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
 #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
 #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
 #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
 #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30 16:50           ` Eric Biggers
@ 2019-10-30 17:22             ` Gao Xiang via Linux-f2fs-devel
  2019-10-30 17:47             ` Jaegeuk Kim
  2019-10-31  2:16             ` Chao Yu
  2 siblings, 0 replies; 32+ messages in thread
From: Gao Xiang via Linux-f2fs-devel @ 2019-10-30 17:22 UTC (permalink / raw)
  To: Chao Yu, Jaegeuk Kim, linux-kernel, linux-f2fs-devel,
	Andrew Morton, Matthew Wilcox, Fengguang Wu

Hi Eric,

(add some mm folks...)

On Wed, Oct 30, 2019 at 09:50:56AM -0700, Eric Biggers wrote:

<snip>

> > >>>
> > >>> It isn't really appropriate to create fake pagecache pages like this.  Did you
> > >>> consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?
> > >>
> > >> We need to store i_crypto_info and iv index somewhere, in order to pass them to
> > >> fscrypt_decrypt_block_inplace(), where did you suggest to store them?
> > >>
> > > 
> > > The same place where the pages are stored.
> > 
> > Still we need allocate space for those fields, any strong reason to do so?
> > 
> 
> page->mapping set implies that the page is a pagecache page.  Faking it could
> cause problems with code elsewhere.

Not very related to this patch. Faking page->mapping was used in zsmalloc before
nonLRU migration (see material [1]) and is used in erofs now (page->mapping indicates
a nonLRU short-lifetime temporary page type, and page->private is used for per-page information);
as far as I know, a nonLRU page without PAGE_MAPPING_MOVABLE set is safe for most mm code.

On the other hands, I think NULL page->mapping will waste such field in precious
page structure... And we can not get such page type directly only by a NULL --
a truncated file page or just allocated page or some type internal temporary pages...

So my proposal is to use page->mapping to indicate a specific page type for
such nonLRU pages (by some common convention, e.g. some real structure, rather than
just zeroing it out and wasting 8 bytes; it's also natural to indicate a page type by
its `mapping' naming)... Since my English is not very good, I delayed it until now...

[1] https://elixir.bootlin.com/linux/v3.18.140/source/mm/zsmalloc.c#L379
    https://lore.kernel.org/linux-mm/1459321935-3655-7-git-send-email-minchan@kernel.org
    and some not very related topic: https://lwn.net/Articles/752564/

Thanks,
Gao Xiang



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30 16:50           ` Eric Biggers
  2019-10-30 17:22             ` Gao Xiang via Linux-f2fs-devel
@ 2019-10-30 17:47             ` Jaegeuk Kim
  2019-10-31  2:16             ` Chao Yu
  2 siblings, 0 replies; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-30 17:47 UTC (permalink / raw)
  To: Chao Yu, linux-kernel, linux-f2fs-devel

On 10/30, Eric Biggers wrote:
> On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
> > On 2019/10/30 10:55, Eric Biggers wrote:
> > > On Tue, Oct 29, 2019 at 04:33:36PM +0800, Chao Yu wrote:
> > >> On 2019/10/28 6:50, Eric Biggers wrote:
> > >>>> +bool f2fs_is_compressed_page(struct page *page)
> > >>>> +{
> > >>>> +	if (!page_private(page))
> > >>>> +		return false;
> > >>>> +	if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
> > >>>> +		return false;
> > >>>> +	return *((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC;
> > >>>> +}
> > >>>
> > >>> This code implies that there can be multiple page private structures each of
> > >>> which has a different magic number.  But I only see F2FS_COMPRESSED_PAGE_MAGIC.
> > >>> Where in the code is the other one(s)?
> > >>
> > >> I'm not sure I understood you correctly, did you mean it needs to introduce
> > >> f2fs_is_atomic_written_page() and f2fs_is_dummy_written_page() like
> > >> f2fs_is_compressed_page()?
> > >>
> > > 
> > > No, I'm asking what is the case where the line
> > > 
> > > 	*((u32 *)page_private(page)) == F2FS_COMPRESSED_PAGE_MAGIC
> > > 
> > > returns false?
> > 
> > Should be this?
> > 
> > if (!page_private(page))
> > 	return false;
> > f2fs_bug_on(*((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC)
> > return true;
> 
> Yes, that makes more sense, unless there are other cases.
> 
> > 
> > > 
> > >>>
> > >>>> +
> > >>>> +static void f2fs_set_compressed_page(struct page *page,
> > >>>> +		struct inode *inode, pgoff_t index, void *data, refcount_t *r)
> > >>>> +{
> > >>>> +	SetPagePrivate(page);
> > >>>> +	set_page_private(page, (unsigned long)data);
> > >>>> +
> > >>>> +	/* i_crypto_info and iv index */
> > >>>> +	page->index = index;
> > >>>> +	page->mapping = inode->i_mapping;
> > >>>> +	if (r)
> > >>>> +		refcount_inc(r);
> > >>>> +}
> > >>>
> > >>> It isn't really appropriate to create fake pagecache pages like this.  Did you
> > >>> consider changing f2fs to use fscrypt_decrypt_block_inplace() instead?
> > >>
> > >> We need to store i_crypto_info and iv index somewhere, in order to pass them to
> > >> fscrypt_decrypt_block_inplace(), where did you suggest to store them?
> > >>
> > > 
> > > The same place where the pages are stored.
> > 
> > Still we need allocate space for those fields, any strong reason to do so?
> > 
> 
> page->mapping set implies that the page is a pagecache page.  Faking it could
> cause problems with code elsewhere.

I've checked it with minchan, and it seems to be fine that filesystem uses
this page internally only, not in pagecache.

> 
> > > 
> > >>>> +
> > >>>> +void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
> > >>>> +{
> > >>>> +	kvfree(cc->rpages);
> > >>>> +}
> > >>>
> > >>> The memory is allocated with kzalloc(), so why is it freed with kvfree() and not
> > >>> just kfree()?
> > >>
> > >> It was allocated by f2fs_*alloc() which will fallback to kvmalloc() once
> > >> kmalloc() failed.
> > > 
> > > This seems to be a bug in f2fs_kmalloc() -- it inappropriately falls back to
> > > kvmalloc().  As per its name, it should only use kmalloc().  f2fs_kvmalloc()
> > > already exists, so it can be used when the fallback is wanted.
> > 
> > We can introduce f2fs_memalloc() to wrap f2fs_kmalloc() and f2fs_kvmalloc() as
> > below:
> > 
> > f2fs_memalloc()
> > {
> > 	mem = f2fs_kmalloc();
> > 	if (mem)
> > 		return mem;
> > 	return f2fs_kvmalloc();
> > }
> > 
> > It can be used in specified place where we really need it, like the place
> > described in 5222595d093e ("f2fs: use kvmalloc, if kmalloc is failed") in where
> > we introduced original logic.
> 
> No, just use kvmalloc().  The whole point of kvmalloc() is that it tries
> kmalloc() and then falls back to vmalloc() if it fails.
> 
> - Eric


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30 16:50           ` Eric Biggers
  2019-10-30 17:22             ` Gao Xiang via Linux-f2fs-devel
  2019-10-30 17:47             ` Jaegeuk Kim
@ 2019-10-31  2:16             ` Chao Yu
  2019-10-31 15:35               ` Jaegeuk Kim
  2 siblings, 1 reply; 32+ messages in thread
From: Chao Yu @ 2019-10-31  2:16 UTC (permalink / raw)
  To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2019/10/31 0:50, Eric Biggers wrote:
> No, just use kvmalloc().  The whole point of kvmalloc() is that it tries
> kmalloc() and then falls back to vmalloc() if it fails.

Okay, it's fine to me, let me fix this in another patch.

Thanks,

> 
> - Eric
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30 17:02           ` Eric Biggers
@ 2019-10-31  2:21             ` Chao Yu
  2019-11-13 13:10             ` Chao Yu
  1 sibling, 0 replies; 32+ messages in thread
From: Chao Yu @ 2019-10-31  2:21 UTC (permalink / raw)
  To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2019/10/31 1:02, Eric Biggers wrote:
> On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
>>>>>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
>>>>>>  {
>>>>>> -	/*
>>>>>> -	 * We use different work queues for decryption and for verity because
>>>>>> -	 * verity may require reading metadata pages that need decryption, and
>>>>>> -	 * we shouldn't recurse to the same workqueue.
>>>>>> -	 */
>>>>>
>>>>> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
>>>>> decryption and for verity?  See the comment above which is being deleted.
>>>>
>>>> Could you explain more about how deadlock happen? or share me a link address if
>>>> you have described that case somewhere?
>>>>
>>>
>>> The verity work can read pages from the file which require decryption.  I'm
>>> concerned that it could deadlock if the work is scheduled on the same workqueue.
>>
>> I assume you've tried one workqueue, and suffered deadlock..
>>
>>> Granted, I'm not an expert in Linux workqueues, so if you've investigated this
>>> and determined that it's safe, can you explain why?
>>
>> I'm not familiar with workqueue...  I guess it may not safe that if the work is
>> scheduled to the same cpu in where verity was waiting for data? if the work is
>> scheduled to other cpu, it may be safe.
>>
>> I can check that before splitting the workqueue for verity and decrypt/decompress.
>>
> 
> Yes this is a real problem, try 'kvm-xfstests -c f2fs/encrypt generic/579'.
> The worker thread gets deadlocked in f2fs_read_merkle_tree_page() waiting for
> the Merkle tree page to be decrypted.  This is with the v2 compression patch;
> it works fine on current mainline.

Oh, alright...

Let me split them, thanks very much for all the comments and test anyway.

Thanks,

> 
> INFO: task kworker/u5:0:61 blocked for more than 30 seconds.
>       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u5:0    D    0    61      2 0x80004000
> Workqueue: f2fs_post_read_wq f2fs_post_read_work
> Call Trace:
>  context_switch kernel/sched/core.c:3384 [inline]
>  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
>  schedule+0x44/0xd0 kernel/sched/core.c:4136
>  io_schedule+0x11/0x40 kernel/sched/core.c:5780
>  wait_on_page_bit_common mm/filemap.c:1174 [inline]
>  wait_on_page_bit mm/filemap.c:1223 [inline]
>  wait_on_page_locked include/linux/pagemap.h:527 [inline]
>  wait_on_page_locked include/linux/pagemap.h:524 [inline]
>  wait_on_page_read mm/filemap.c:2767 [inline]
>  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
>  read_cache_page+0xd/0x10 mm/filemap.c:2894
>  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
>  verify_page+0x110/0x560 fs/verity/verify.c:120
>  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
>  verity_work fs/f2fs/data.c:142 [inline]
>  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
>  process_one_work+0x225/0x550 kernel/workqueue.c:2269
>  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
>  kthread+0x125/0x140 kernel/kthread.c:255
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> INFO: task kworker/u5:1:1140 blocked for more than 30 seconds.
>       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u5:1    D    0  1140      2 0x80004000
> Workqueue: f2fs_post_read_wq f2fs_post_read_work
> Call Trace:
>  context_switch kernel/sched/core.c:3384 [inline]
>  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
>  schedule+0x44/0xd0 kernel/sched/core.c:4136
>  io_schedule+0x11/0x40 kernel/sched/core.c:5780
>  wait_on_page_bit_common mm/filemap.c:1174 [inline]
>  wait_on_page_bit mm/filemap.c:1223 [inline]
>  wait_on_page_locked include/linux/pagemap.h:527 [inline]
>  wait_on_page_locked include/linux/pagemap.h:524 [inline]
>  wait_on_page_read mm/filemap.c:2767 [inline]
>  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
>  read_cache_page+0xd/0x10 mm/filemap.c:2894
>  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
>  verify_page+0x110/0x560 fs/verity/verify.c:120
>  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
>  verity_work fs/f2fs/data.c:142 [inline]
>  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
>  process_one_work+0x225/0x550 kernel/workqueue.c:2269
>  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
>  kthread+0x125/0x140 kernel/kthread.c:255
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> 
> Showing all locks held in the system:
> 1 lock held by khungtaskd/21:
>  #0: ffffffff82250520 (rcu_read_lock){....}, at: rcu_lock_acquire.constprop.0+0x0/0x30 include/trace/events/lock.h:13
> 2 locks held by kworker/u5:0/61:
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
>  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> 2 locks held by kworker/u5:1/1140:
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
>  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-30 16:09       ` Jaegeuk Kim
@ 2019-10-31  2:27         ` Chao Yu
  2019-10-31 15:29           ` Jaegeuk Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Chao Yu @ 2019-10-31  2:27 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

On 2019/10/31 0:09, Jaegeuk Kim wrote:
> On 10/26, Chao Yu wrote:
>> On 2019/10/26 2:18, Jaegeuk Kim wrote:
>>> On 10/24, Chao Yu wrote:
>>>> Hi Jaegeuk,
>>>>
>>>> On 2019/10/23 1:16, Jaegeuk Kim wrote:
>>>>> This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
>>>>> by allocating fully valid 2MB segment.
>>>>>
>>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>>>>> ---
>>>>>  fs/f2fs/f2fs.h     |  4 +++-
>>>>>  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
>>>>>  fs/f2fs/recovery.c |  2 +-
>>>>>  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
>>>>>  fs/f2fs/segment.h  |  2 ++
>>>>>  fs/f2fs/super.c    |  1 +
>>>>>  fs/f2fs/sysfs.c    |  2 ++
>>>>>  7 files changed, 63 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>>> index ca342f4c7db1..c681f51e351b 100644
>>>>> --- a/fs/f2fs/f2fs.h
>>>>> +++ b/fs/f2fs/f2fs.h
>>>>> @@ -890,6 +890,7 @@ enum {
>>>>>  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
>>>>>  	CURSEG_COLD_NODE,	/* indirect node blocks */
>>>>>  	NO_CHECK_TYPE,
>>>>> +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
>>>>>  };
>>>>>  
>>>>>  struct flush_cmd {
>>>>> @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
>>>>>  
>>>>>  	/* threshold for gc trials on pinned files */
>>>>>  	u64 gc_pin_file_threshold;
>>>>> +	struct rw_semaphore pin_sem;
>>>>>  
>>>>>  	/* maximum # of trials to find a victim segment for SSR and GC */
>>>>>  	unsigned int max_victim_search;
>>>>> @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
>>>>>  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
>>>>>  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>>>>>  					unsigned int start, unsigned int end);
>>>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
>>>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
>>>>>  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
>>>>>  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
>>>>>  					struct cp_control *cpc);
>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>> index 29bc0a542759..f6c038e8a6a7 100644
>>>>> --- a/fs/f2fs/file.c
>>>>> +++ b/fs/f2fs/file.c
>>>>> @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
>>>>>  	if (off_end)
>>>>>  		map.m_len++;
>>>>>  
>>>>> -	if (f2fs_is_pinned_file(inode))
>>>>> -		map.m_seg_type = CURSEG_COLD_DATA;
>>>>> +	if (!map.m_len)
>>>>> +		return 0;
>>>>> +
>>>>> +	if (f2fs_is_pinned_file(inode)) {
>>>>> +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
>>>>> +					sbi->log_blocks_per_seg;
>>>>> +		block_t done = 0;
>>>>> +
>>>>> +		if (map.m_len % sbi->blocks_per_seg)
>>>>> +			len += sbi->blocks_per_seg;
>>>>>  
>>>>> -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
>>>>> -						F2FS_GET_BLOCK_PRE_DIO :
>>>>> -						F2FS_GET_BLOCK_PRE_AIO));
>>>>> +		map.m_len = sbi->blocks_per_seg;
>>>>> +next_alloc:
>>>>> +		mutex_lock(&sbi->gc_mutex);
>>>>> +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
>>>>> +		if (err && err != -ENODATA && err != -EAGAIN)
>>>>> +			goto out_err;
>>>>
>>>> To grab enough free space?
>>>>
>>>> Shouldn't we call
>>>>
>>>> 	if (has_not_enough_free_secs(sbi, 0, 0)) {
>>>> 		mutex_lock(&sbi->gc_mutex);
>>>> 		f2fs_gc(sbi, false, false, NULL_SEGNO);
>>>> 	}
>>>
>>> The above calls gc all the time. Do we need this?
>>
>> Hmmm... my concern is why we need to run foreground GC even if there is enough
>> free space..
> 
> In order to get the free segment easily?

However, I doubt arbitrary foreground GC with greedy algorithm will ruin
hot/cold data separation, actually, for sufficient free segment case, it's
unnecessary to call FGGC.

Thanks,

> 
>>
>>>
>>>>
>>>>> +
>>>>> +		down_write(&sbi->pin_sem);
>>>>> +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
>>>>> +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
>>>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
>>>>> +		up_write(&sbi->pin_sem);
>>>>> +
>>>>> +		done += map.m_len;
>>>>> +		len -= map.m_len;
>>>>> +		map.m_lblk += map.m_len;
>>>>> +		if (!err && len)
>>>>> +			goto next_alloc;
>>>>> +
>>>>> +		map.m_len = done;
>>>>> +	} else {
>>>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
>>>>> +	}
>>>>> +out_err:
>>>>>  	if (err) {
>>>>>  		pgoff_t last_off;
>>>>>  
>>>>> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
>>>>> index 783773e4560d..76477f71d4ee 100644
>>>>> --- a/fs/f2fs/recovery.c
>>>>> +++ b/fs/f2fs/recovery.c
>>>>> @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
>>>>>  		f2fs_put_page(page, 1);
>>>>>  	}
>>>>>  	if (!err)
>>>>> -		f2fs_allocate_new_segments(sbi);
>>>>> +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
>>>>>  	return err;
>>>>>  }
>>>>>  
>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>>> index 25c750cd0272..253d72c2663c 100644
>>>>> --- a/fs/f2fs/segment.c
>>>>> +++ b/fs/f2fs/segment.c
>>>>> @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>>>>>  	up_read(&SM_I(sbi)->curseg_lock);
>>>>>  }
>>>>>  
>>>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>>>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
>>>>>  {
>>>>>  	struct curseg_info *curseg;
>>>>>  	unsigned int old_segno;
>>>>> @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>>>>>  	down_write(&SIT_I(sbi)->sentry_lock);
>>>>>  
>>>>>  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
>>>>> +		if (type != NO_CHECK_TYPE && i != type)
>>>>> +			continue;
>>>>> +
>>>>>  		curseg = CURSEG_I(sbi, i);
>>>>>  		old_segno = curseg->segno;
>>>>>  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
>>>>> @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>>>>>  {
>>>>>  	struct sit_info *sit_i = SIT_I(sbi);
>>>>>  	struct curseg_info *curseg = CURSEG_I(sbi, type);
>>>>> +	bool put_pin_sem = false;
>>>>> +
>>>>> +	if (type == CURSEG_COLD_DATA) {
>>>>> +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
>>>>> +		if (down_read_trylock(&sbi->pin_sem)) {
>>>>> +			put_pin_sem = true;
>>>>> +		} else {
>>>>> +			type = CURSEG_WARM_DATA;
>>>>> +			curseg = CURSEG_I(sbi, type);
>>>>
>>>> It will mix pending cold data into warm area... rather than recovering curseg to
>>>> write pointer of last cold segment?
>>>>
>>>> I know maybe that fallocate aligned address could be corner case, but I guess
>>>> there should be some better solutions can handle race case more effectively.
>>>>
>>>> One solution could be: allocating a virtual log header to select free segment as
>>>> 2m-aligned space target.
>>>
>>> I thought about that, but concluded to avoid too much changes.
>>
>> We have an unupstreamed feature which is based on virtual log header, I can
>> introduce that basic virtual log fwk, which can be used for aligned allocation
>> and later new features, would you like to check that?
>>
>> Thanks,
>>
>>>
>>>>
>>>> Thanks,
>>>>
>>>>> +		}
>>>>> +	} else if (type == CURSEG_COLD_DATA_PINNED) {
>>>>> +		type = CURSEG_COLD_DATA;
>>>>> +	}
>>>>>  
>>>>>  	down_read(&SM_I(sbi)->curseg_lock);
>>>>>  
>>>>> @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>>>>>  	mutex_unlock(&curseg->curseg_mutex);
>>>>>  
>>>>>  	up_read(&SM_I(sbi)->curseg_lock);
>>>>> +
>>>>> +	if (put_pin_sem)
>>>>> +		up_read(&sbi->pin_sem);
>>>>>  }
>>>>>  
>>>>>  static void update_device_state(struct f2fs_io_info *fio)
>>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>>> index 325781a1ae4d..a95467b202ea 100644
>>>>> --- a/fs/f2fs/segment.h
>>>>> +++ b/fs/f2fs/segment.h
>>>>> @@ -313,6 +313,8 @@ struct sit_entry_set {
>>>>>   */
>>>>>  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
>>>>>  {
>>>>> +	if (type == CURSEG_COLD_DATA_PINNED)
>>>>> +		type = CURSEG_COLD_DATA;
>>>>>  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
>>>>>  }
>>>>>  
>>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>>> index f320fd11db48..c02a47ce551b 100644
>>>>> --- a/fs/f2fs/super.c
>>>>> +++ b/fs/f2fs/super.c
>>>>> @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
>>>>>  	spin_lock_init(&sbi->dev_lock);
>>>>>  
>>>>>  	init_rwsem(&sbi->sb_lock);
>>>>> +	init_rwsem(&sbi->pin_sem);
>>>>>  }
>>>>>  
>>>>>  static int init_percpu_info(struct f2fs_sb_info *sbi)
>>>>> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
>>>>> index b558b64a4c9c..f164959e4224 100644
>>>>> --- a/fs/f2fs/sysfs.c
>>>>> +++ b/fs/f2fs/sysfs.c
>>>>> @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
>>>>>  	if (f2fs_sb_has_casefold(sbi))
>>>>>  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>>>>>  				len ? ", " : "", "casefold");
>>>>> +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>>>>> +				len ? ", " : "", "pin_file");
>>>>>  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
>>>>>  	return len;
>>>>>  }
>>>>>
>>> .
>>>
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-31  2:27         ` Chao Yu
@ 2019-10-31 15:29           ` Jaegeuk Kim
  2019-11-05  3:39             ` Chao Yu
  0 siblings, 1 reply; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-31 15:29 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On 10/31, Chao Yu wrote:
> On 2019/10/31 0:09, Jaegeuk Kim wrote:
> > On 10/26, Chao Yu wrote:
> >> On 2019/10/26 2:18, Jaegeuk Kim wrote:
> >>> On 10/24, Chao Yu wrote:
> >>>> Hi Jaegeuk,
> >>>>
> >>>> On 2019/10/23 1:16, Jaegeuk Kim wrote:
> >>>>> This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
> >>>>> by allocating fully valid 2MB segment.
> >>>>>
> >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> >>>>> ---
> >>>>>  fs/f2fs/f2fs.h     |  4 +++-
> >>>>>  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
> >>>>>  fs/f2fs/recovery.c |  2 +-
> >>>>>  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
> >>>>>  fs/f2fs/segment.h  |  2 ++
> >>>>>  fs/f2fs/super.c    |  1 +
> >>>>>  fs/f2fs/sysfs.c    |  2 ++
> >>>>>  7 files changed, 63 insertions(+), 8 deletions(-)
> >>>>>
> >>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> >>>>> index ca342f4c7db1..c681f51e351b 100644
> >>>>> --- a/fs/f2fs/f2fs.h
> >>>>> +++ b/fs/f2fs/f2fs.h
> >>>>> @@ -890,6 +890,7 @@ enum {
> >>>>>  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
> >>>>>  	CURSEG_COLD_NODE,	/* indirect node blocks */
> >>>>>  	NO_CHECK_TYPE,
> >>>>> +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
> >>>>>  };
> >>>>>  
> >>>>>  struct flush_cmd {
> >>>>> @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
> >>>>>  
> >>>>>  	/* threshold for gc trials on pinned files */
> >>>>>  	u64 gc_pin_file_threshold;
> >>>>> +	struct rw_semaphore pin_sem;
> >>>>>  
> >>>>>  	/* maximum # of trials to find a victim segment for SSR and GC */
> >>>>>  	unsigned int max_victim_search;
> >>>>> @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
> >>>>>  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
> >>>>>  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
> >>>>>  					unsigned int start, unsigned int end);
> >>>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
> >>>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
> >>>>>  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
> >>>>>  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
> >>>>>  					struct cp_control *cpc);
> >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> >>>>> index 29bc0a542759..f6c038e8a6a7 100644
> >>>>> --- a/fs/f2fs/file.c
> >>>>> +++ b/fs/f2fs/file.c
> >>>>> @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
> >>>>>  	if (off_end)
> >>>>>  		map.m_len++;
> >>>>>  
> >>>>> -	if (f2fs_is_pinned_file(inode))
> >>>>> -		map.m_seg_type = CURSEG_COLD_DATA;
> >>>>> +	if (!map.m_len)
> >>>>> +		return 0;
> >>>>> +
> >>>>> +	if (f2fs_is_pinned_file(inode)) {
> >>>>> +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
> >>>>> +					sbi->log_blocks_per_seg;
> >>>>> +		block_t done = 0;
> >>>>> +
> >>>>> +		if (map.m_len % sbi->blocks_per_seg)
> >>>>> +			len += sbi->blocks_per_seg;
> >>>>>  
> >>>>> -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
> >>>>> -						F2FS_GET_BLOCK_PRE_DIO :
> >>>>> -						F2FS_GET_BLOCK_PRE_AIO));
> >>>>> +		map.m_len = sbi->blocks_per_seg;
> >>>>> +next_alloc:
> >>>>> +		mutex_lock(&sbi->gc_mutex);
> >>>>> +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
> >>>>> +		if (err && err != -ENODATA && err != -EAGAIN)
> >>>>> +			goto out_err;
> >>>>
> >>>> To grab enough free space?
> >>>>
> >>>> Shouldn't we call
> >>>>
> >>>> 	if (has_not_enough_free_secs(sbi, 0, 0)) {
> >>>> 		mutex_lock(&sbi->gc_mutex);
> >>>> 		f2fs_gc(sbi, false, false, NULL_SEGNO);
> >>>> 	}
> >>>
> >>> The above calls gc all the time. Do we need this?
> >>
> >> Hmmm... my concern is why we need to run foreground GC even if there is enough
> >> free space..
> > 
> > In order to get the free segment easily?
> 
> However, I doubt arbitrary foreground GC with greedy algorithm will ruin
> hot/cold data separation, actually, for sufficient free segment case, it's
> unnecessary to call FGGC.

Two things here; 1) I do worry much about when hitting boundary on
has_not_enough_free_secs() which calculates # of free segments based on # of
dirty pages. In this case, we just jump to allocate another free segment so
I think it increases the possibility of no free segment panic. 2) Even if we
do call FGGC a lot, I don't think it will *ruin* the hot/cold data separation
a lot. Putting hot/warm blocks together into cold log will make another hot
segment which was being used as cold log. IOWs, we don't need to keep hot data
in hot log at all, but should be fine to split hot and cold data in different
segments. So, I chose to go safer way since this is eating free segments
directly.

> 
> Thanks,
> 
> > 
> >>
> >>>
> >>>>
> >>>>> +
> >>>>> +		down_write(&sbi->pin_sem);
> >>>>> +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
> >>>>> +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
> >>>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
> >>>>> +		up_write(&sbi->pin_sem);
> >>>>> +
> >>>>> +		done += map.m_len;
> >>>>> +		len -= map.m_len;
> >>>>> +		map.m_lblk += map.m_len;
> >>>>> +		if (!err && len)
> >>>>> +			goto next_alloc;
> >>>>> +
> >>>>> +		map.m_len = done;
> >>>>> +	} else {
> >>>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
> >>>>> +	}
> >>>>> +out_err:
> >>>>>  	if (err) {
> >>>>>  		pgoff_t last_off;
> >>>>>  
> >>>>> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> >>>>> index 783773e4560d..76477f71d4ee 100644
> >>>>> --- a/fs/f2fs/recovery.c
> >>>>> +++ b/fs/f2fs/recovery.c
> >>>>> @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
> >>>>>  		f2fs_put_page(page, 1);
> >>>>>  	}
> >>>>>  	if (!err)
> >>>>> -		f2fs_allocate_new_segments(sbi);
> >>>>> +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
> >>>>>  	return err;
> >>>>>  }
> >>>>>  
> >>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> >>>>> index 25c750cd0272..253d72c2663c 100644
> >>>>> --- a/fs/f2fs/segment.c
> >>>>> +++ b/fs/f2fs/segment.c
> >>>>> @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
> >>>>>  	up_read(&SM_I(sbi)->curseg_lock);
> >>>>>  }
> >>>>>  
> >>>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> >>>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
> >>>>>  {
> >>>>>  	struct curseg_info *curseg;
> >>>>>  	unsigned int old_segno;
> >>>>> @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
> >>>>>  	down_write(&SIT_I(sbi)->sentry_lock);
> >>>>>  
> >>>>>  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
> >>>>> +		if (type != NO_CHECK_TYPE && i != type)
> >>>>> +			continue;
> >>>>> +
> >>>>>  		curseg = CURSEG_I(sbi, i);
> >>>>>  		old_segno = curseg->segno;
> >>>>>  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
> >>>>> @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
> >>>>>  {
> >>>>>  	struct sit_info *sit_i = SIT_I(sbi);
> >>>>>  	struct curseg_info *curseg = CURSEG_I(sbi, type);
> >>>>> +	bool put_pin_sem = false;
> >>>>> +
> >>>>> +	if (type == CURSEG_COLD_DATA) {
> >>>>> +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
> >>>>> +		if (down_read_trylock(&sbi->pin_sem)) {
> >>>>> +			put_pin_sem = true;
> >>>>> +		} else {
> >>>>> +			type = CURSEG_WARM_DATA;
> >>>>> +			curseg = CURSEG_I(sbi, type);
> >>>>
> >>>> It will mix pending cold data into warm area... rather than recovering curseg to
> >>>> write pointer of last cold segment?
> >>>>
> >>>> I know maybe that fallocate aligned address could be corner case, but I guess
> >>>> there should be some better solutions can handle race case more effectively.
> >>>>
> >>>> One solution could be: allocating a virtual log header to select free segment as
> >>>> 2m-aligned space target.
> >>>
> >>> I thought about that, but concluded to avoid too much changes.
> >>
> >> We have an unupstreamed feature which is based on virtual log header, I can
> >> introduce that basic virtual log fwk, which can be used for aligned allocation
> >> and later new features, would you like to check that?
> >>
> >> Thanks,
> >>
> >>>
> >>>>
> >>>> Thanks,
> >>>>
> >>>>> +		}
> >>>>> +	} else if (type == CURSEG_COLD_DATA_PINNED) {
> >>>>> +		type = CURSEG_COLD_DATA;
> >>>>> +	}
> >>>>>  
> >>>>>  	down_read(&SM_I(sbi)->curseg_lock);
> >>>>>  
> >>>>> @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
> >>>>>  	mutex_unlock(&curseg->curseg_mutex);
> >>>>>  
> >>>>>  	up_read(&SM_I(sbi)->curseg_lock);
> >>>>> +
> >>>>> +	if (put_pin_sem)
> >>>>> +		up_read(&sbi->pin_sem);
> >>>>>  }
> >>>>>  
> >>>>>  static void update_device_state(struct f2fs_io_info *fio)
> >>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> >>>>> index 325781a1ae4d..a95467b202ea 100644
> >>>>> --- a/fs/f2fs/segment.h
> >>>>> +++ b/fs/f2fs/segment.h
> >>>>> @@ -313,6 +313,8 @@ struct sit_entry_set {
> >>>>>   */
> >>>>>  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
> >>>>>  {
> >>>>> +	if (type == CURSEG_COLD_DATA_PINNED)
> >>>>> +		type = CURSEG_COLD_DATA;
> >>>>>  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
> >>>>>  }
> >>>>>  
> >>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> >>>>> index f320fd11db48..c02a47ce551b 100644
> >>>>> --- a/fs/f2fs/super.c
> >>>>> +++ b/fs/f2fs/super.c
> >>>>> @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
> >>>>>  	spin_lock_init(&sbi->dev_lock);
> >>>>>  
> >>>>>  	init_rwsem(&sbi->sb_lock);
> >>>>> +	init_rwsem(&sbi->pin_sem);
> >>>>>  }
> >>>>>  
> >>>>>  static int init_percpu_info(struct f2fs_sb_info *sbi)
> >>>>> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
> >>>>> index b558b64a4c9c..f164959e4224 100644
> >>>>> --- a/fs/f2fs/sysfs.c
> >>>>> +++ b/fs/f2fs/sysfs.c
> >>>>> @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
> >>>>>  	if (f2fs_sb_has_casefold(sbi))
> >>>>>  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> >>>>>  				len ? ", " : "", "casefold");
> >>>>> +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
> >>>>> +				len ? ", " : "", "pin_file");
> >>>>>  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
> >>>>>  	return len;
> >>>>>  }
> >>>>>
> >>> .
> >>>
> > .
> > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-31  2:16             ` Chao Yu
@ 2019-10-31 15:35               ` Jaegeuk Kim
  2019-11-01 10:02                 ` Chao Yu
  0 siblings, 1 reply; 32+ messages in thread
From: Jaegeuk Kim @ 2019-10-31 15:35 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On 10/31, Chao Yu wrote:
> On 2019/10/31 0:50, Eric Biggers wrote:
> > No, just use kvmalloc().  The whole point of kvmalloc() is that it tries
> > kmalloc() and then falls back to vmalloc() if it fails.
> 
> Okay, it's fine to me, let me fix this in another patch.

I've fixed some bugs (e.g., mmap). Please apply this in your next patch, so that
I can continue to test the new version as early as possible.

With this patch, I could boot up a device and install some apps successfully
with "compress_extension=*".

---
 fs/f2fs/compress.c | 229 +++++++++++++++++++++++----------------------
 fs/f2fs/data.c     | 109 +++++++++++++--------
 fs/f2fs/f2fs.h     |  22 +++--
 fs/f2fs/file.c     |  71 +++++++++-----
 fs/f2fs/namei.c    |  20 +++-
 5 files changed, 264 insertions(+), 187 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index f276d82a67aa..e03d57396ea2 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -77,8 +77,9 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
 
-	if (cc->rpages)
+	if (cc->nr_rpages)
 		return 0;
+
 	cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) * cc->cluster_size,
 								GFP_KERNEL);
 	if (!cc->rpages)
@@ -88,7 +89,9 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
 
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
 {
-	kvfree(cc->rpages);
+	f2fs_reset_compress_ctx(cc);
+	WARN_ON(cc->nr_rpages);
+	kfree(cc->rpages);
 }
 
 int f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
@@ -224,16 +227,6 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
 	.decompress_pages	= lz4_decompress_pages,
 };
 
-static void f2fs_release_cluster_pages(struct compress_ctx *cc)
-{
-	int i;
-
-	for (i = 0; i < cc->nr_rpages; i++) {
-		inode_dec_dirty_pages(cc->inode);
-		unlock_page(cc->rpages[i]);
-	}
-}
-
 static struct page *f2fs_grab_page(void)
 {
 	struct page *page;
@@ -321,6 +314,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
 	trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
 							cc->clen, ret);
 	return 0;
+
 out_vunmap_cbuf:
 	vunmap(cc->cbuf);
 out_vunmap_rbuf:
@@ -393,10 +387,9 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 	vunmap(dic->rbuf);
 out_free_dic:
 	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
-	f2fs_free_dic(dic);
-
 	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
 							dic->clen, ret);
+	f2fs_free_dic(dic);
 }
 
 static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -443,51 +436,25 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
 			return false;
 		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 			return false;
-		if (f2fs_is_drop_cache(cc->inode))
-			return false;
-		if (f2fs_is_volatile_file(cc->inode))
-			return false;
 
 		offset = i_size & (PAGE_SIZE - 1);
 		if ((page->index > end_index) ||
 			(page->index == end_index && !offset))
 			return false;
+		if (page->index != start_idx_of_cluster(cc) + i)
+			return false;
 	}
 	return true;
 }
 
-int f2fs_is_cluster_existed(struct compress_ctx *cc)
-{
-	struct dnode_of_data dn;
-	unsigned int start_idx = start_idx_of_cluster(cc);
-	int ret;
-	int i;
-
-	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
-	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
-		block_t blkaddr = datablock_addr(dn.inode, dn.node_page,
-							dn.ofs_in_node);
-		if (blkaddr == COMPRESS_ADDR) {
-			ret = 1;
-			break;
-		}
-		if (__is_valid_data_blkaddr(blkaddr)) {
-			ret = 2;
-			break;
-		}
-	}
-	f2fs_put_dnode(&dn);
-	return ret;
-}
-
 static bool cluster_may_compress(struct compress_ctx *cc)
 {
 	if (!f2fs_compressed_file(cc->inode))
 		return false;
+	if (f2fs_is_atomic_file(cc->inode))
+		return false;
+	if (f2fs_is_mmap_file(cc->inode))
+		return false;
 	if (!f2fs_cluster_is_full(cc))
 		return false;
 	return __cluster_may_compress(cc);
@@ -495,19 +462,59 @@ static bool cluster_may_compress(struct compress_ctx *cc)
 
 void f2fs_reset_compress_ctx(struct compress_ctx *cc)
 {
-	if (cc->rpages)
-		memset(cc->rpages, 0, sizeof(struct page *) * cc->cluster_size);
 	cc->nr_rpages = 0;
 	cc->nr_cpages = 0;
 	cc->cluster_idx = NULL_CLUSTER;
 }
 
+int is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
+{
+	struct dnode_of_data dn;
+	unsigned int start_idx = cluster_idx(cc, index) * cc->cluster_size;
+	int ret, i;
+
+	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+	if (ret) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto fail;
+	}
+	if (dn.data_blkaddr == COMPRESS_ADDR) {
+		ret = CLUSTER_IS_FULL;
+		for (i = 1; i < cc->cluster_size; i++) {
+			block_t blkaddr;
+
+			blkaddr = datablock_addr(dn.inode,
+					dn.node_page, dn.ofs_in_node + i);
+			if (blkaddr == NULL_ADDR) {
+				ret = CLUSTER_HAS_SPACE;
+				break;
+			}
+		}
+	}
+fail:
+	f2fs_put_dnode(&dn);
+	return ret;
+}
+
+int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
+{
+	struct compress_ctx cc = {
+		.inode = inode,
+		.cluster_size = F2FS_I(inode)->i_cluster_size,
+	};
+
+	return is_compressed_cluster(&cc, index);
+}
+
 static void set_cluster_writeback(struct compress_ctx *cc)
 {
 	int i;
 
 	for (i = 0; i < cc->cluster_size; i++)
-		set_page_writeback(cc->rpages[i]);
+		if (cc->rpages[i])
+			set_page_writeback(cc->rpages[i]);
 }
 
 static void set_cluster_dirty(struct compress_ctx *cc)
@@ -515,17 +522,17 @@ static void set_cluster_dirty(struct compress_ctx *cc)
 	int i;
 
 	for (i = 0; i < cc->cluster_size; i++)
-		set_page_dirty(cc->rpages[i]);
+		if (cc->rpages[i])
+			set_page_dirty(cc->rpages[i]);
 }
 
-int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
-					struct page **pagep, pgoff_t index,
-					void **fsdata, bool prealloc)
+static int prepare_compress_overwrite(struct compress_ctx *cc,
+		struct page **pagep, pgoff_t index, void **fsdata,
+		bool prealloc)
 {
-	struct inode *inode = cc->inode;
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
 	struct bio *bio = NULL;
-	struct address_space *mapping = inode->i_mapping;
+	struct address_space *mapping = cc->inode->i_mapping;
 	struct page *page;
 	struct dnode_of_data dn;
 	sector_t last_block_in_bio;
@@ -586,13 +593,12 @@ int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
 			}
 			goto retry;
 		}
-
 	}
 
 	if (prealloc) {
 		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
 
-		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
 
 		for (i = cc->cluster_size - 1; i > 0; i--) {
 			ret = f2fs_get_block(&dn, start_idx + i);
@@ -609,7 +615,8 @@ int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
 
 	*fsdata = cc->rpages;
 	*pagep = cc->rpages[offset_in_cluster(cc, index)];
-	return 0;
+	return CLUSTER_IS_FULL;
+
 unlock_pages:
 	for (idx = 0; idx < i; idx++) {
 		if (cc->rpages[idx])
@@ -626,13 +633,34 @@ int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
 	return ret;
 }
 
-void f2fs_compress_write_end(struct inode *inode, void *fsdata,
-							bool written)
+int f2fs_prepare_compress_overwrite(struct inode *inode,
+		struct page **pagep, pgoff_t index, void **fsdata)
+{
+	struct compress_ctx cc = {
+		.inode = inode,
+		.cluster_size = F2FS_I(inode)->i_cluster_size,
+		.cluster_idx = NULL_CLUSTER,
+		.rpages = NULL,
+		.nr_rpages = 0,
+	};
+	int ret = is_compressed_cluster(&cc, index);
+
+	if (ret <= 0)
+		return ret;
+
+	/* compressed case */
+	return prepare_compress_overwrite(&cc, pagep, index,
+			fsdata, ret == CLUSTER_HAS_SPACE);
+}
+
+bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
+		pgoff_t index, bool written)
 {
 	struct compress_ctx cc = {
 		.cluster_size = F2FS_I(inode)->i_cluster_size,
 		.rpages = fsdata,
 	};
+	bool first_index = (index == cc.rpages[0]->index);
 	int i;
 
 	if (written)
@@ -640,6 +668,11 @@ void f2fs_compress_write_end(struct inode *inode, void *fsdata,
 
 	for (i = 0; i < cc.cluster_size; i++)
 		f2fs_put_page(cc.rpages[i], 1);
+
+	f2fs_destroy_compress_ctx(&cc);
+
+	return first_index;
+
 }
 
 static int f2fs_write_compressed_pages(struct compress_ctx *cc,
@@ -723,6 +756,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 
 		blkaddr = datablock_addr(dn.inode, dn.node_page,
 							dn.ofs_in_node);
+		fio.page = cc->rpages[i];
+		fio.old_blkaddr = blkaddr;
 
 		/* cluster header */
 		if (i == 0) {
@@ -731,7 +766,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 			if (__is_valid_data_blkaddr(blkaddr))
 				f2fs_invalidate_blocks(sbi, blkaddr);
 			f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR);
-			continue;
+			goto unlock_continue;
 		}
 
 		if (pre_compressed_blocks && __is_valid_data_blkaddr(blkaddr))
@@ -742,13 +777,11 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 				f2fs_invalidate_blocks(sbi, blkaddr);
 				f2fs_update_data_blkaddr(&dn, NEW_ADDR);
 			}
-			continue;
+			goto unlock_continue;
 		}
 
 		f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR);
 
-		fio.page = cc->rpages[i];
-		fio.old_blkaddr = blkaddr;
 
 		if (fio.encrypted)
 			fio.encrypted_page = cc->cpages[i - 1];
@@ -759,6 +792,9 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 		cc->cpages[i - 1] = NULL;
 		f2fs_outplace_write_data(&dn, &fio);
 		(*submitted)++;
+unlock_continue:
+		inode_dec_dirty_pages(cc->inode);
+		unlock_page(fio.page);
 	}
 
 	if (pre_compressed_blocks) {
@@ -778,10 +814,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 	f2fs_put_dnode(&dn);
 	f2fs_unlock_op(sbi);
 
-	f2fs_release_cluster_pages(cc);
-
-	cc->rpages = NULL;
-
 	if (err) {
 		file_set_keep_isize(inode);
 	} else {
@@ -791,6 +823,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 		up_write(&fi->i_sem);
 	}
 	return 0;
+
 out_destroy_crypt:
 	for (i -= 1; i >= 0; i--)
 		fscrypt_finalize_bounce_page(&cc->cpages[i]);
@@ -824,12 +857,13 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 		return;
 
 	for (i = 0; i < cic->nr_rpages; i++) {
+		WARN_ON(!cic->rpages[i]);
 		clear_cold_data(cic->rpages[i]);
 		end_page_writeback(cic->rpages[i]);
 	}
 
-	kvfree(cic->rpages);
-	kvfree(cic);
+	kfree(cic->rpages);
+	kfree(cic);
 }
 
 static int f2fs_write_raw_pages(struct compress_ctx *cc,
@@ -843,6 +877,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 	for (i = 0; i < cc->cluster_size; i++) {
 		if (!cc->rpages[i])
 			continue;
+		BUG_ON(!PageLocked(cc->rpages[i]));
 		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
 						NULL, NULL, wbc, io_type);
 		if (ret) {
@@ -855,9 +890,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 		*submitted += _submitted;
 	}
 	return 0;
+
 out_fail:
 	/* TODO: revoke partially updated block addresses */
-	for (i += 1; i < cc->cluster_size; i++) {
+	for (++i; i < cc->cluster_size; i++) {
 		if (!cc->rpages[i])
 			continue;
 		redirty_page_for_writepage(wbc, cc->rpages[i]);
@@ -890,9 +926,14 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
 	}
 write:
 	if (err == -EAGAIN) {
+		bool compressed = false;
+
 		f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted);
+		if (is_compressed_cluster(cc, start_idx_of_cluster(cc)))
+			compressed = true;
+
 		err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
-		if (f2fs_is_cluster_existed(cc) == 1) {
+		if (compressed) {
 			stat_sub_compr_blocks(cc->inode, *submitted);
 			F2FS_I(cc->inode)->i_compressed_blocks -= *submitted;
 			f2fs_mark_inode_dirty_sync(cc->inode, true);
@@ -902,37 +943,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
 	return err;
 }
 
-int f2fs_is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
-{
-	struct dnode_of_data dn;
-	unsigned int start_idx = cluster_idx(cc, index) * cc->cluster_size;
-	int ret, i;
-
-	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
-	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
-	if (ret) {
-		if (ret == -ENOENT)
-			ret = 0;
-		goto fail;
-	}
-	if (dn.data_blkaddr == COMPRESS_ADDR) {
-		ret = CLUSTER_IS_FULL;
-		for (i = 1; i < cc->cluster_size; i++) {
-			block_t blkaddr;
-
-			blkaddr = datablock_addr(dn.inode,
-					dn.node_page, dn.ofs_in_node + i);
-			if (blkaddr == NULL_ADDR) {
-				ret = CLUSTER_HAS_SPACE;
-				break;
-			}
-		}
-	}
-fail:
-	f2fs_put_dnode(&dn);
-	return ret;
-}
-
 struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
@@ -991,9 +1001,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
 
 	dic->rpages = cc->rpages;
 	dic->nr_rpages = cc->cluster_size;
-
-	cc->rpages = NULL;
 	return dic;
+
 out_free:
 	f2fs_free_dic(dic);
 out:
@@ -1011,7 +1020,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
 			unlock_page(dic->tpages[i]);
 			put_page(dic->tpages[i]);
 		}
-		kvfree(dic->tpages);
+		kfree(dic->tpages);
 	}
 
 	if (dic->cpages) {
@@ -1020,11 +1029,11 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
 				continue;
 			f2fs_put_compressed_page(dic->cpages[i]);
 		}
-		kvfree(dic->cpages);
+		kfree(dic->cpages);
 	}
 
-	kvfree(dic->rpages);
-	kvfree(dic);
+	kfree(dic->rpages);
+	kfree(dic);
 }
 
 void f2fs_set_cluster_uptodate(struct page **rpages,
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index bac96c3a8bc9..b8e0431747b1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1925,18 +1925,18 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
 
 	/* get rid of pages beyond EOF */
-	for (i = cc->nr_rpages - 1; i >= 0; i--) {
+	for (i = 0; i < cc->cluster_size; i++) {
 		struct page *page = cc->rpages[i];
 
 		if (!page)
 			continue;
-		if ((sector_t)page->index < last_block_in_file)
-			break;
-
-		zero_user_segment(page, 0, PAGE_SIZE);
-		if (!PageUptodate(page))
-			SetPageUptodate(page);
-
+		if ((sector_t)page->index >= last_block_in_file) {
+			zero_user_segment(page, 0, PAGE_SIZE);
+			if (!PageUptodate(page))
+				SetPageUptodate(page);
+		} else if (!PageUptodate(page)) {
+			continue;
+		}
 		unlock_page(page);
 		cc->rpages[i] = NULL;
 		cc->nr_rpages--;
@@ -2031,6 +2031,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 	f2fs_reset_compress_ctx(cc);
 	*bio_ret = bio;
 	return 0;
+
 out_put_dnode:
 	f2fs_put_dnode(&dn);
 out:
@@ -2100,7 +2101,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
 				if (ret)
 					goto set_error_page;
 			}
-			ret = f2fs_is_compressed_cluster(&cc, page->index);
+			ret = f2fs_is_compressed_cluster(inode, page->index);
 			if (ret < 0)
 				goto set_error_page;
 			else if (!ret)
@@ -2457,7 +2458,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 
-	if (page->index < end_index || f2fs_verity_in_progress(inode))
+	if (f2fs_compressed_file(inode) ||
+		page->index < end_index || f2fs_verity_in_progress(inode))
 		goto write;
 
 	/*
@@ -2533,7 +2535,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 		f2fs_remove_dirty_inode(inode);
 		submitted = NULL;
 	}
-
 	unlock_page(page);
 	if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
 					!F2FS_I(inode)->cp_task)
@@ -2567,6 +2568,15 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 static int f2fs_write_data_page(struct page *page,
 					struct writeback_control *wbc)
 {
+	struct inode *inode = page->mapping->host;
+
+	if (f2fs_compressed_file(inode)) {
+		if (f2fs_is_compressed_cluster(inode, page->index)) {
+			redirty_page_for_writepage(wbc, page);
+			return AOP_WRITEPAGE_ACTIVATE;
+		}
+	}
+
 	return f2fs_write_single_data_page(page, NULL, NULL, NULL,
 						wbc, FS_DATA_IO);
 }
@@ -2581,7 +2591,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 					enum iostat_type io_type)
 {
 	int ret = 0;
-	int done = 0;
+	int done = 0, retry = 0;
 	struct pagevec pvec;
 	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
 	struct bio *bio = NULL;
@@ -2639,10 +2649,11 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
+	retry = 0;
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
-	while (!done && (index <= end)) {
+	while (!done && !retry && (index <= end)) {
 		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
 				tag);
 		if (nr_pages == 0)
@@ -2650,25 +2661,42 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			bool need_readd = false;
-
+			bool need_readd;
 readd:
+			need_readd = false;
 			if (f2fs_compressed_file(inode)) {
+				void *fsdata = NULL;
+				struct page *pagep;
+				int ret2;
+
 				ret = f2fs_init_compress_ctx(&cc);
 				if (ret) {
 					done = 1;
 					break;
 				}
 
-				if (!f2fs_cluster_can_merge_page(&cc,
-							page->index)) {
-					need_readd = true;
+				if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
 					ret = f2fs_write_multi_pages(&cc,
-						&submitted, wbc, io_type);
+							&submitted, wbc, io_type);
+					if (!ret)
+						need_readd = true;
 					goto result;
 				}
+				if (f2fs_cluster_is_empty(&cc)) {
+					ret2 = f2fs_prepare_compress_overwrite(inode,
+							&pagep, page->index, &fsdata);
+					if (ret2 < 0) {
+						ret = ret2;
+						done = 1;
+						break;
+					} else if (ret2 &&
+							!f2fs_compress_write_end(inode, fsdata,
+								page->index, true)) {
+						retry = 1;
+						break;
+					}
+				}
 			}
-
 			/* give a priority to WB_SYNC threads */
 			if (atomic_read(&sbi->wb_sync_req[DATA]) &&
 					wbc->sync_mode == WB_SYNC_NONE) {
@@ -2702,7 +2730,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
-			if (f2fs_compressed_file(mapping->host)) {
+			if (f2fs_compressed_file(inode)) {
 				ret = f2fs_compress_ctx_add_page(&cc, page);
 				f2fs_bug_on(sbi, ret);
 				continue;
@@ -2754,7 +2782,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 		/* TODO: error handling */
 	}
 
-	if (!cycled && !done) {
+	if ((!cycled && !done) || retry) {
 		cycled = 1;
 		index = 0;
 		end = writeback_index - 1;
@@ -2770,8 +2798,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 	if (bio)
 		f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
 
-	f2fs_destroy_compress_ctx(&cc);
-
 	return ret;
 }
 
@@ -3017,26 +3043,18 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 	if (f2fs_compressed_file(inode)) {
-		struct compress_ctx cc = {
-			.inode = inode,
-			.cluster_size = F2FS_I(inode)->i_cluster_size,
-			.cluster_idx = NULL_CLUSTER,
-			.rpages = NULL,
-			.nr_rpages = 0,
-		};
+		int ret;
 
 		*fsdata = NULL;
 
-		err = f2fs_is_compressed_cluster(&cc, index);
-		if (err < 0)
+		ret = f2fs_prepare_compress_overwrite(inode, pagep,
+				index, fsdata);
+		if (ret < 0) {
+			err = ret;
 			goto fail;
-		if (!err)
-			goto repeat;
-
-		err = f2fs_prepare_compress_overwrite(&cc, pagep, index, fsdata,
-						err == CLUSTER_HAS_SPACE);
-		/* need to goto fail? */
-		return err;
+		} else if (ret) {
+			return 0;
+		}
 	}
 
 repeat:
@@ -3139,7 +3157,7 @@ static int f2fs_write_end(struct file *file,
 
 	/* overwrite compressed file */
 	if (f2fs_compressed_file(inode) && fsdata) {
-		f2fs_compress_write_end(inode, fsdata, copied);
+		f2fs_compress_write_end(inode, fsdata, page->index, copied);
 		goto update_time;
 	}
 
@@ -3534,6 +3552,15 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	if (ret)
 		return ret;
 
+	if (f2fs_compressed_file(inode)) {
+		if (F2FS_I(inode)->i_compressed_blocks)
+			return -EINVAL;
+
+		F2FS_I(inode)->i_flags &= ~FS_COMPR_FL;
+		clear_inode_flag(inode, FI_COMPRESSED_FILE);
+		stat_dec_compr_inode(inode);
+	}
+
 	ret = check_swap_activate(file, sis->max);
 	if (ret)
 		return ret;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d22a4e2bb8b8..9c3399fdd6c1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2541,6 +2541,7 @@ enum {
 	FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
 	FI_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 	FI_COMPRESSED_FILE,	/* indicate file's data can be compressed */
+	FI_MMAP_FILE,		/* indicate file was mmapped */
 };
 
 static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -2766,6 +2767,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode)
 	return is_inode_flag_set(inode, FI_INLINE_DOTS);
 }
 
+static inline int f2fs_is_mmap_file(struct inode *inode)
+{
+	return is_inode_flag_set(inode, FI_MMAP_FILE);
+}
+
 static inline bool f2fs_is_pinned_file(struct inode *inode)
 {
 	return is_inode_flag_set(inode, FI_PIN_FILE);
@@ -3609,7 +3615,7 @@ void f2fs_destroy_root_stats(void);
 #define stat_inc_atomic_write(inode)			do { } while (0)
 #define stat_dec_atomic_write(inode)			do { } while (0)
 #define stat_inc_compr_blocks(inode)			do { } while (0)
-#define stat_dec_compr_blocks(inode)			do { } while (0)
+#define stat_sub_compr_blocks(inode)			do { } while (0)
 #define stat_update_max_atomic_write(inode)		do { } while (0)
 #define stat_inc_volatile_write(inode)			do { } while (0)
 #define stat_dec_volatile_write(inode)			do { } while (0)
@@ -3755,13 +3761,13 @@ static inline bool f2fs_post_read_required(struct inode *inode)
  * compress.c
  */
 bool f2fs_is_compressed_page(struct page *page);
+int is_compressed_cluster(struct compress_ctx *cc, pgoff_t index);
 struct page *f2fs_compress_control_page(struct page *page);
 void f2fs_reset_compress_ctx(struct compress_ctx *cc);
-int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
-					struct page **page_ret, pgoff_t index,
-					void **fsdata, bool prealloc);
-void f2fs_compress_write_end(struct inode *inode, void *fsdata,
-							bool written);
+int f2fs_prepare_compress_overwrite(struct inode *inode,
+		struct page **pagep, pgoff_t index, void **fsdata);
+bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
+		pgoff_t index, bool written);
 void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
 void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
 bool f2fs_cluster_is_empty(struct compress_ctx *cc);
@@ -3771,7 +3777,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
 						int *submitted,
 						struct writeback_control *wbc,
 						enum iostat_type io_type);
-int f2fs_is_compressed_cluster(struct compress_ctx *cc, pgoff_t index);
+int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
 int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 				unsigned nr_pages, sector_t *last_block_in_bio,
 				bool is_readahead);
@@ -3923,6 +3929,8 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
 		return true;
 	if (f2fs_is_multi_device(sbi))
 		return true;
+	if (f2fs_compressed_file(inode))
+		return true;
 	/*
 	 * for blkzoned device, fallback direct IO to buffered IO, so
 	 * all IOs can be serialized by log-structured write.
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8a92e8fd648c..99380c419b87 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -51,7 +51,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dnode_of_data dn = { .node_changed = false };
-	int err;
+	bool need_alloc = true;
+	int err = 0;
 
 	if (unlikely(f2fs_cp_error(sbi))) {
 		err = -EIO;
@@ -63,6 +64,18 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 		goto err;
 	}
 
+	if (f2fs_compressed_file(inode)) {
+		int ret = f2fs_is_compressed_cluster(inode, page->index);
+
+		if (ret < 0) {
+			err = ret;
+			goto err;
+		} else if (ret) {
+			f2fs_bug_on(sbi, ret == CLUSTER_HAS_SPACE);
+			need_alloc = false;
+		}
+	}
+
 	sb_start_pagefault(inode->i_sb);
 
 	f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -78,15 +91,17 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 		goto out_sem;
 	}
 
-	/* block allocation */
-	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = f2fs_get_block(&dn, page->index);
-	f2fs_put_dnode(&dn);
-	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
-	if (err) {
-		unlock_page(page);
-		goto out_sem;
+	if (need_alloc) {
+		/* block allocation */
+		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		err = f2fs_get_block(&dn, page->index);
+		f2fs_put_dnode(&dn);
+		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+		if (err) {
+			unlock_page(page);
+			goto out_sem;
+		}
 	}
 
 	/* fill the page */
@@ -492,6 +507,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	file_accessed(file);
 	vma->vm_ops = &f2fs_file_vm_ops;
+	set_inode_flag(inode, FI_MMAP_FILE);
 	return 0;
 }
 
@@ -1781,8 +1797,18 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 			return -EINVAL;
 		if (iflags & FS_NOCOMP_FL)
 			return -EINVAL;
-		if (S_ISREG(inode->i_mode))
-			clear_inode_flag(inode, FI_INLINE_DATA);
+		if (fi->i_flags & FS_COMPR_FL) {
+			int err = f2fs_convert_inline_inode(inode);
+
+			if (err)
+				return err;
+
+			if (!f2fs_may_compress(inode))
+				return -EINVAL;
+
+			set_inode_flag(inode, FI_COMPRESSED_FILE);
+			stat_inc_compr_inode(inode);
+		}
 	}
 	if ((iflags ^ fi->i_flags) & FS_NOCOMP_FL) {
 		if (fi->i_flags & FS_COMPR_FL)
@@ -1793,19 +1819,6 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 	f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & FS_COMPR_FL) &&
 					(fi->i_flags & FS_NOCOMP_FL));
 
-	if (fi->i_flags & FS_COMPR_FL) {
-		int err = f2fs_convert_inline_inode(inode);
-
-		if (err)
-			return err;
-
-		if (!f2fs_may_compress(inode))
-			return -EINVAL;
-
-		set_inode_flag(inode, FI_COMPRESSED_FILE);
-		stat_inc_compr_inode(inode);
-	}
-
 	if (fi->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
 	else
@@ -1988,6 +2001,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 
 	inode_lock(inode);
 
+	if (f2fs_compressed_file(inode) && !fi->i_compressed_blocks) {
+		fi->i_flags &= ~FS_COMPR_FL;
+		clear_inode_flag(inode, FI_COMPRESSED_FILE);
+		stat_dec_compr_inode(inode);
+	}
+
 	if (f2fs_is_atomic_file(inode)) {
 		if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
 			ret = -EINVAL;
@@ -3190,7 +3209,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 	}
 
 	if (f2fs_compressed_file(inode)) {
-		if (F2FS_HAS_BLOCKS(inode) || i_size_read(inode)) {
+		if (F2FS_I(inode)->i_compressed_blocks) {
 			ret = -EOPNOTSUPP;
 			goto out;
 		}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 9f37e95c4a4b..ac0c51cefca2 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -128,9 +128,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 				1 << F2FS_I(inode)->i_log_cluster_size;
 
 		/* Inherit the compression flag in directory */
-		if ((F2FS_I(inode)->i_flags & FS_COMPR_FL) &&
-					f2fs_may_compress(inode))
+		if ((F2FS_I(dir)->i_flags & FS_COMPR_FL) &&
+					f2fs_may_compress(inode)) {
+			F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
 			set_inode_flag(inode, FI_COMPRESSED_FILE);
+		}
 	}
 
 	f2fs_set_inode_flags(inode);
@@ -282,6 +284,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
 static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
 						const unsigned char *name)
 {
+	__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
 	unsigned char (*ext)[F2FS_EXTENSION_LEN];
 	unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
 	int i, cold_count, hot_count;
@@ -292,13 +295,24 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
 			!f2fs_may_compress(inode))
 		return;
 
+	down_read(&sbi->sb_lock);
+
 	ext = F2FS_OPTION(sbi).extensions;
 
 	cold_count = le32_to_cpu(sbi->raw_super->extension_count);
 	hot_count = sbi->raw_super->hot_ext_count;
 
+	for (i = cold_count; i < cold_count + hot_count; i++) {
+		if (is_extension_exist(name, extlist[i])) {
+			up_read(&sbi->sb_lock);
+			return;
+		}
+	}
+
+	up_read(&sbi->sb_lock);
+
 	for (i = 0; i < ext_cnt; i++) {
-		if (is_extension_exist(name, ext[i]) && !file_is_hot(inode)) {
+		if (is_extension_exist(name, ext[i])) {
 			F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
 			set_inode_flag(inode, FI_COMPRESSED_FILE);
 			return;
-- 
2.19.0.605.g01d371f741-goog



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-31 15:35               ` Jaegeuk Kim
@ 2019-11-01 10:02                 ` Chao Yu
  0 siblings, 0 replies; 32+ messages in thread
From: Chao Yu @ 2019-11-01 10:02 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

On 2019/10/31 23:35, Jaegeuk Kim wrote:
> Hi Chao,
> 
> On 10/31, Chao Yu wrote:
>> On 2019/10/31 0:50, Eric Biggers wrote:
>>> No, just use kvmalloc().  The whole point of kvmalloc() is that it tries
>>> kmalloc() and then falls back to vmalloc() if it fails.
>>
>> Okay, it's fine to me, let me fix this in another patch.
> 
> I've fixed some bugs (e.g., mmap). Please apply this in your next patch, so that
> I can continue to test the new version as early as possible.

Applied with some fixes, as per the comments below.

> 
> With this patch, I could boot up a device and install some apps successfully
> with "compress_extension=*".

Ah, '*' can put heavy pressure on the compression paths.

> 
> ---
>  fs/f2fs/compress.c | 229 +++++++++++++++++++++++----------------------
>  fs/f2fs/data.c     | 109 +++++++++++++--------
>  fs/f2fs/f2fs.h     |  22 +++--
>  fs/f2fs/file.c     |  71 +++++++++-----
>  fs/f2fs/namei.c    |  20 +++-
>  5 files changed, 264 insertions(+), 187 deletions(-)
> 
> diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
> index f276d82a67aa..e03d57396ea2 100644
> --- a/fs/f2fs/compress.c
> +++ b/fs/f2fs/compress.c
> @@ -77,8 +77,9 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
>  {
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
>  
> -	if (cc->rpages)
> +	if (cc->nr_rpages)
>  		return 0;
> +
>  	cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) * cc->cluster_size,
>  								GFP_KERNEL);
>  	if (!cc->rpages)
> @@ -88,7 +89,9 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
>  
>  void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
>  {
> -	kvfree(cc->rpages);
> +	f2fs_reset_compress_ctx(cc);
> +	WARN_ON(cc->nr_rpages);

f2fs_reset_compress_ctx() will reset cc->nr_rpages to zero, so I removed it for now.

> +	kfree(cc->rpages);
>  }
>  
>  int f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
> @@ -224,16 +227,6 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
>  	.decompress_pages	= lz4_decompress_pages,
>  };
>  
> -static void f2fs_release_cluster_pages(struct compress_ctx *cc)
> -{
> -	int i;
> -
> -	for (i = 0; i < cc->nr_rpages; i++) {
> -		inode_dec_dirty_pages(cc->inode);
> -		unlock_page(cc->rpages[i]);
> -	}
> -}
> -
>  static struct page *f2fs_grab_page(void)
>  {
>  	struct page *page;
> @@ -321,6 +314,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
>  	trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
>  							cc->clen, ret);
>  	return 0;
> +
>  out_vunmap_cbuf:
>  	vunmap(cc->cbuf);
>  out_vunmap_rbuf:
> @@ -393,10 +387,9 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
>  	vunmap(dic->rbuf);
>  out_free_dic:
>  	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
> -	f2fs_free_dic(dic);
> -
>  	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
>  							dic->clen, ret);
> +	f2fs_free_dic(dic);
>  }
>  
>  static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
> @@ -443,51 +436,25 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
>  			return false;
>  		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
>  			return false;
> -		if (f2fs_is_drop_cache(cc->inode))
> -			return false;
> -		if (f2fs_is_volatile_file(cc->inode))
> -			return false;
>  
>  		offset = i_size & (PAGE_SIZE - 1);
>  		if ((page->index > end_index) ||
>  			(page->index == end_index && !offset))
>  			return false;
> +		if (page->index != start_idx_of_cluster(cc) + i)
> +			return false;

Should this be a bug?

>  	}
>  	return true;
>  }
>  
> -int f2fs_is_cluster_existed(struct compress_ctx *cc)
> -{
> -	struct dnode_of_data dn;
> -	unsigned int start_idx = start_idx_of_cluster(cc);
> -	int ret;
> -	int i;
> -
> -	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
> -	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
> -	if (ret)
> -		return ret;
> -
> -	for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
> -		block_t blkaddr = datablock_addr(dn.inode, dn.node_page,
> -							dn.ofs_in_node);
> -		if (blkaddr == COMPRESS_ADDR) {
> -			ret = 1;
> -			break;
> -		}
> -		if (__is_valid_data_blkaddr(blkaddr)) {
> -			ret = 2;
> -			break;
> -		}
> -	}
> -	f2fs_put_dnode(&dn);
> -	return ret;
> -}
> -
>  static bool cluster_may_compress(struct compress_ctx *cc)
>  {
>  	if (!f2fs_compressed_file(cc->inode))
>  		return false;
> +	if (f2fs_is_atomic_file(cc->inode))
> +		return false;
> +	if (f2fs_is_mmap_file(cc->inode))
> +		return false;
>  	if (!f2fs_cluster_is_full(cc))
>  		return false;
>  	return __cluster_may_compress(cc);
> @@ -495,19 +462,59 @@ static bool cluster_may_compress(struct compress_ctx *cc)
>  
>  void f2fs_reset_compress_ctx(struct compress_ctx *cc)
>  {
> -	if (cc->rpages)
> -		memset(cc->rpages, 0, sizeof(struct page *) * cc->cluster_size);
>  	cc->nr_rpages = 0;
>  	cc->nr_cpages = 0;
>  	cc->cluster_idx = NULL_CLUSTER;
>  }
>  
> +int is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
> +{
> +	struct dnode_of_data dn;
> +	unsigned int start_idx = cluster_idx(cc, index) * cc->cluster_size;
> +	int ret, i;
> +
> +	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
> +	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
> +	if (ret) {
> +		if (ret == -ENOENT)
> +			ret = 0;
> +		goto fail;
> +	}
> +	if (dn.data_blkaddr == COMPRESS_ADDR) {
> +		ret = CLUSTER_IS_FULL;
> +		for (i = 1; i < cc->cluster_size; i++) {
> +			block_t blkaddr;
> +
> +			blkaddr = datablock_addr(dn.inode,
> +					dn.node_page, dn.ofs_in_node + i);
> +			if (blkaddr == NULL_ADDR) {
> +				ret = CLUSTER_HAS_SPACE;
> +				break;
> +			}
> +		}
> +	}
> +fail:
> +	f2fs_put_dnode(&dn);
> +	return ret;
> +}
> +
> +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
> +{
> +	struct compress_ctx cc = {
> +		.inode = inode,
> +		.cluster_size = F2FS_I(inode)->i_cluster_size,
> +	};
> +
> +	return is_compressed_cluster(&cc, index);
> +}
> +
>  static void set_cluster_writeback(struct compress_ctx *cc)
>  {
>  	int i;
>  
>  	for (i = 0; i < cc->cluster_size; i++)
> -		set_page_writeback(cc->rpages[i]);
> +		if (cc->rpages[i])
> +			set_page_writeback(cc->rpages[i]);
>  }
>  
>  static void set_cluster_dirty(struct compress_ctx *cc)
> @@ -515,17 +522,17 @@ static void set_cluster_dirty(struct compress_ctx *cc)
>  	int i;
>  
>  	for (i = 0; i < cc->cluster_size; i++)
> -		set_page_dirty(cc->rpages[i]);
> +		if (cc->rpages[i])
> +			set_page_dirty(cc->rpages[i]);
>  }
>  
> -int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
> -					struct page **pagep, pgoff_t index,
> -					void **fsdata, bool prealloc)
> +static int prepare_compress_overwrite(struct compress_ctx *cc,
> +		struct page **pagep, pgoff_t index, void **fsdata,
> +		bool prealloc)
>  {
> -	struct inode *inode = cc->inode;
> -	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
>  	struct bio *bio = NULL;
> -	struct address_space *mapping = inode->i_mapping;
> +	struct address_space *mapping = cc->inode->i_mapping;
>  	struct page *page;
>  	struct dnode_of_data dn;
>  	sector_t last_block_in_bio;
> @@ -586,13 +593,12 @@ int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
>  			}
>  			goto retry;
>  		}
> -
>  	}
>  
>  	if (prealloc) {
>  		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
>  
> -		set_new_dnode(&dn, inode, NULL, NULL, 0);
> +		set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
>  
>  		for (i = cc->cluster_size - 1; i > 0; i--) {
>  			ret = f2fs_get_block(&dn, start_idx + i);
> @@ -609,7 +615,8 @@ int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
>  
>  	*fsdata = cc->rpages;
>  	*pagep = cc->rpages[offset_in_cluster(cc, index)];
> -	return 0;
> +	return CLUSTER_IS_FULL;
> +
>  unlock_pages:
>  	for (idx = 0; idx < i; idx++) {
>  		if (cc->rpages[idx])
> @@ -626,13 +633,34 @@ int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
>  	return ret;
>  }
>  
> -void f2fs_compress_write_end(struct inode *inode, void *fsdata,
> -							bool written)
> +int f2fs_prepare_compress_overwrite(struct inode *inode,
> +		struct page **pagep, pgoff_t index, void **fsdata)
> +{
> +	struct compress_ctx cc = {
> +		.inode = inode,
> +		.cluster_size = F2FS_I(inode)->i_cluster_size,
> +		.cluster_idx = NULL_CLUSTER,
> +		.rpages = NULL,
> +		.nr_rpages = 0,
> +	};
> +	int ret = is_compressed_cluster(&cc, index);
> +
> +	if (ret <= 0)
> +		return ret;
> +
> +	/* compressed case */
> +	return prepare_compress_overwrite(&cc, pagep, index,
> +			fsdata, ret == CLUSTER_HAS_SPACE);
> +}
> +
> +bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
> +		pgoff_t index, bool written)
>  {
>  	struct compress_ctx cc = {
>  		.cluster_size = F2FS_I(inode)->i_cluster_size,
>  		.rpages = fsdata,
>  	};
> +	bool first_index = (index == cc.rpages[0]->index);
>  	int i;
>  
>  	if (written)
> @@ -640,6 +668,11 @@ void f2fs_compress_write_end(struct inode *inode, void *fsdata,
>  
>  	for (i = 0; i < cc.cluster_size; i++)
>  		f2fs_put_page(cc.rpages[i], 1);
> +
> +	f2fs_destroy_compress_ctx(&cc);
> +
> +	return first_index;
> +
>  }
>  
>  static int f2fs_write_compressed_pages(struct compress_ctx *cc,
> @@ -723,6 +756,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
>  
>  		blkaddr = datablock_addr(dn.inode, dn.node_page,
>  							dn.ofs_in_node);
> +		fio.page = cc->rpages[i];
> +		fio.old_blkaddr = blkaddr;
>  
>  		/* cluster header */
>  		if (i == 0) {
> @@ -731,7 +766,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
>  			if (__is_valid_data_blkaddr(blkaddr))
>  				f2fs_invalidate_blocks(sbi, blkaddr);
>  			f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR);
> -			continue;
> +			goto unlock_continue;
>  		}
>  
>  		if (pre_compressed_blocks && __is_valid_data_blkaddr(blkaddr))
> @@ -742,13 +777,11 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
>  				f2fs_invalidate_blocks(sbi, blkaddr);
>  				f2fs_update_data_blkaddr(&dn, NEW_ADDR);
>  			}
> -			continue;
> +			goto unlock_continue;
>  		}
>  
>  		f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR);
>  
> -		fio.page = cc->rpages[i];
> -		fio.old_blkaddr = blkaddr;
>  
>  		if (fio.encrypted)
>  			fio.encrypted_page = cc->cpages[i - 1];
> @@ -759,6 +792,9 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
>  		cc->cpages[i - 1] = NULL;
>  		f2fs_outplace_write_data(&dn, &fio);
>  		(*submitted)++;
> +unlock_continue:
> +		inode_dec_dirty_pages(cc->inode);
> +		unlock_page(fio.page);
>  	}
>  
>  	if (pre_compressed_blocks) {
> @@ -778,10 +814,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
>  	f2fs_put_dnode(&dn);
>  	f2fs_unlock_op(sbi);
>  
> -	f2fs_release_cluster_pages(cc);
> -
> -	cc->rpages = NULL;
> -
>  	if (err) {
>  		file_set_keep_isize(inode);
>  	} else {
> @@ -791,6 +823,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
>  		up_write(&fi->i_sem);
>  	}
>  	return 0;
> +
>  out_destroy_crypt:
>  	for (i -= 1; i >= 0; i--)
>  		fscrypt_finalize_bounce_page(&cc->cpages[i]);
> @@ -824,12 +857,13 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
>  		return;
>  
>  	for (i = 0; i < cic->nr_rpages; i++) {
> +		WARN_ON(!cic->rpages[i]);
>  		clear_cold_data(cic->rpages[i]);
>  		end_page_writeback(cic->rpages[i]);
>  	}
>  
> -	kvfree(cic->rpages);
> -	kvfree(cic);
> +	kfree(cic->rpages);
> +	kfree(cic);
>  }
>  
>  static int f2fs_write_raw_pages(struct compress_ctx *cc,
> @@ -843,6 +877,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
>  	for (i = 0; i < cc->cluster_size; i++) {
>  		if (!cc->rpages[i])
>  			continue;
> +		BUG_ON(!PageLocked(cc->rpages[i]));
>  		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
>  						NULL, NULL, wbc, io_type);
>  		if (ret) {
> @@ -855,9 +890,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
>  		*submitted += _submitted;
>  	}
>  	return 0;
> +
>  out_fail:
>  	/* TODO: revoke partially updated block addresses */
> -	for (i += 1; i < cc->cluster_size; i++) {
> +	for (++i; i < cc->cluster_size; i++) {
>  		if (!cc->rpages[i])
>  			continue;
>  		redirty_page_for_writepage(wbc, cc->rpages[i]);
> @@ -890,9 +926,14 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
>  	}
>  write:
>  	if (err == -EAGAIN) {
> +		bool compressed = false;
> +
>  		f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted);
> +		if (is_compressed_cluster(cc, start_idx_of_cluster(cc)))
> +			compressed = true;
> +
>  		err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
> -		if (f2fs_is_cluster_existed(cc) == 1) {
> +		if (compressed) {
>  			stat_sub_compr_blocks(cc->inode, *submitted);
>  			F2FS_I(cc->inode)->i_compressed_blocks -= *submitted;
>  			f2fs_mark_inode_dirty_sync(cc->inode, true);
> @@ -902,37 +943,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
>  	return err;
>  }
>  
> -int f2fs_is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
> -{
> -	struct dnode_of_data dn;
> -	unsigned int start_idx = cluster_idx(cc, index) * cc->cluster_size;
> -	int ret, i;
> -
> -	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
> -	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
> -	if (ret) {
> -		if (ret == -ENOENT)
> -			ret = 0;
> -		goto fail;
> -	}
> -	if (dn.data_blkaddr == COMPRESS_ADDR) {
> -		ret = CLUSTER_IS_FULL;
> -		for (i = 1; i < cc->cluster_size; i++) {
> -			block_t blkaddr;
> -
> -			blkaddr = datablock_addr(dn.inode,
> -					dn.node_page, dn.ofs_in_node + i);
> -			if (blkaddr == NULL_ADDR) {
> -				ret = CLUSTER_HAS_SPACE;
> -				break;
> -			}
> -		}
> -	}
> -fail:
> -	f2fs_put_dnode(&dn);
> -	return ret;
> -}
> -
>  struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
>  {
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
> @@ -991,9 +1001,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
>  
>  	dic->rpages = cc->rpages;
>  	dic->nr_rpages = cc->cluster_size;
> -
> -	cc->rpages = NULL;
>  	return dic;
> +
>  out_free:
>  	f2fs_free_dic(dic);
>  out:
> @@ -1011,7 +1020,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
>  			unlock_page(dic->tpages[i]);
>  			put_page(dic->tpages[i]);
>  		}
> -		kvfree(dic->tpages);
> +		kfree(dic->tpages);
>  	}
>  
>  	if (dic->cpages) {
> @@ -1020,11 +1029,11 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
>  				continue;
>  			f2fs_put_compressed_page(dic->cpages[i]);
>  		}
> -		kvfree(dic->cpages);
> +		kfree(dic->cpages);
>  	}
>  
> -	kvfree(dic->rpages);
> -	kvfree(dic);
> +	kfree(dic->rpages);
> +	kfree(dic);
>  }
>  
>  void f2fs_set_cluster_uptodate(struct page **rpages,
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index bac96c3a8bc9..b8e0431747b1 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -1925,18 +1925,18 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>  	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
>  
>  	/* get rid of pages beyond EOF */
> -	for (i = cc->nr_rpages - 1; i >= 0; i--) {
> +	for (i = 0; i < cc->cluster_size; i++) {
>  		struct page *page = cc->rpages[i];
>  
>  		if (!page)
>  			continue;
> -		if ((sector_t)page->index < last_block_in_file)
> -			break;
> -
> -		zero_user_segment(page, 0, PAGE_SIZE);
> -		if (!PageUptodate(page))
> -			SetPageUptodate(page);
> -
> +		if ((sector_t)page->index >= last_block_in_file) {
> +			zero_user_segment(page, 0, PAGE_SIZE);
> +			if (!PageUptodate(page))
> +				SetPageUptodate(page);
> +		} else if (!PageUptodate(page)) {
> +			continue;
> +		}
>  		unlock_page(page);
>  		cc->rpages[i] = NULL;
>  		cc->nr_rpages--;
> @@ -2031,6 +2031,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>  	f2fs_reset_compress_ctx(cc);
>  	*bio_ret = bio;
>  	return 0;
> +
>  out_put_dnode:
>  	f2fs_put_dnode(&dn);
>  out:
> @@ -2100,7 +2101,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
>  				if (ret)
>  					goto set_error_page;
>  			}
> -			ret = f2fs_is_compressed_cluster(&cc, page->index);
> +			ret = f2fs_is_compressed_cluster(inode, page->index);
>  			if (ret < 0)
>  				goto set_error_page;
>  			else if (!ret)
> @@ -2457,7 +2458,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
>  	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
>  		goto redirty_out;
>  
> -	if (page->index < end_index || f2fs_verity_in_progress(inode))
> +	if (f2fs_compressed_file(inode) ||
> +		page->index < end_index || f2fs_verity_in_progress(inode))
>  		goto write;
>  
>  	/*
> @@ -2533,7 +2535,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
>  		f2fs_remove_dirty_inode(inode);
>  		submitted = NULL;
>  	}
> -
>  	unlock_page(page);
>  	if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
>  					!F2FS_I(inode)->cp_task)
> @@ -2567,6 +2568,15 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
>  static int f2fs_write_data_page(struct page *page,
>  					struct writeback_control *wbc)
>  {
> +	struct inode *inode = page->mapping->host;
> +
> +	if (f2fs_compressed_file(inode)) {
> +		if (f2fs_is_compressed_cluster(inode, page->index)) {
> +			redirty_page_for_writepage(wbc, page);
> +			return AOP_WRITEPAGE_ACTIVATE;
> +		}
> +	}
> +
>  	return f2fs_write_single_data_page(page, NULL, NULL, NULL,
>  						wbc, FS_DATA_IO);
>  }
> @@ -2581,7 +2591,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
>  					enum iostat_type io_type)
>  {
>  	int ret = 0;
> -	int done = 0;
> +	int done = 0, retry = 0;
>  	struct pagevec pvec;
>  	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
>  	struct bio *bio = NULL;
> @@ -2639,10 +2649,11 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
>  	else
>  		tag = PAGECACHE_TAG_DIRTY;
>  retry:
> +	retry = 0;
>  	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>  		tag_pages_for_writeback(mapping, index, end);
>  	done_index = index;
> -	while (!done && (index <= end)) {
> +	while (!done && !retry && (index <= end)) {
>  		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
>  				tag);
>  		if (nr_pages == 0)
> @@ -2650,25 +2661,42 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
>  
>  		for (i = 0; i < nr_pages; i++) {
>  			struct page *page = pvec.pages[i];
> -			bool need_readd = false;
> -
> +			bool need_readd;
>  readd:
> +			need_readd = false;
>  			if (f2fs_compressed_file(inode)) {
> +				void *fsdata = NULL;
> +				struct page *pagep;
> +				int ret2;
> +
>  				ret = f2fs_init_compress_ctx(&cc);
>  				if (ret) {
>  					done = 1;
>  					break;
>  				}
>  
> -				if (!f2fs_cluster_can_merge_page(&cc,
> -							page->index)) {
> -					need_readd = true;
> +				if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
>  					ret = f2fs_write_multi_pages(&cc,
> -						&submitted, wbc, io_type);
> +							&submitted, wbc, io_type);
> +					if (!ret)
> +						need_readd = true;
>  					goto result;
>  				}
> +				if (f2fs_cluster_is_empty(&cc)) {
> +					ret2 = f2fs_prepare_compress_overwrite(inode,
> +							&pagep, page->index, &fsdata);
> +					if (ret2 < 0) {
> +						ret = ret2;
> +						done = 1;
> +						break;
> +					} else if (ret2 &&
> +							!f2fs_compress_write_end(inode, fsdata,
> +								page->index, true)) {
> +						retry = 1;
> +						break;
> +					}
> +				}
>  			}
> -
>  			/* give a priority to WB_SYNC threads */
>  			if (atomic_read(&sbi->wb_sync_req[DATA]) &&
>  					wbc->sync_mode == WB_SYNC_NONE) {
> @@ -2702,7 +2730,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
>  			if (!clear_page_dirty_for_io(page))
>  				goto continue_unlock;
>  
> -			if (f2fs_compressed_file(mapping->host)) {
> +			if (f2fs_compressed_file(inode)) {
>  				ret = f2fs_compress_ctx_add_page(&cc, page);
>  				f2fs_bug_on(sbi, ret);
>  				continue;
> @@ -2754,7 +2782,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
>  		/* TODO: error handling */
>  	}
>  
> -	if (!cycled && !done) {
> +	if ((!cycled && !done) || retry) {
>  		cycled = 1;
>  		index = 0;
>  		end = writeback_index - 1;
> @@ -2770,8 +2798,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
>  	if (bio)
>  		f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
>  
> -	f2fs_destroy_compress_ctx(&cc);
> -
>  	return ret;
>  }
>  
> @@ -3017,26 +3043,18 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
>  	}
>  
>  	if (f2fs_compressed_file(inode)) {
> -		struct compress_ctx cc = {
> -			.inode = inode,
> -			.cluster_size = F2FS_I(inode)->i_cluster_size,
> -			.cluster_idx = NULL_CLUSTER,
> -			.rpages = NULL,
> -			.nr_rpages = 0,
> -		};
> +		int ret;
>  
>  		*fsdata = NULL;
>  
> -		err = f2fs_is_compressed_cluster(&cc, index);
> -		if (err < 0)
> +		ret = f2fs_prepare_compress_overwrite(inode, pagep,
> +				index, fsdata);
> +		if (ret < 0) {
> +			err = ret;
>  			goto fail;
> -		if (!err)
> -			goto repeat;
> -
> -		err = f2fs_prepare_compress_overwrite(&cc, pagep, index, fsdata,
> -						err == CLUSTER_HAS_SPACE);
> -		/* need to goto fail? */
> -		return err;
> +		} else if (ret) {
> +			return 0;
> +		}
>  	}
>  
>  repeat:
> @@ -3139,7 +3157,7 @@ static int f2fs_write_end(struct file *file,
>  
>  	/* overwrite compressed file */
>  	if (f2fs_compressed_file(inode) && fsdata) {
> -		f2fs_compress_write_end(inode, fsdata, copied);
> +		f2fs_compress_write_end(inode, fsdata, page->index, copied);
>  		goto update_time;
>  	}
>  
> @@ -3534,6 +3552,15 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
>  	if (ret)
>  		return ret;
>  
> +	if (f2fs_compressed_file(inode)) {
> +		if (F2FS_I(inode)->i_compressed_blocks)
> +			return -EINVAL;
> +
> +		F2FS_I(inode)->i_flags &= ~FS_COMPR_FL;
> +		clear_inode_flag(inode, FI_COMPRESSED_FILE);
> +		stat_dec_compr_inode(inode);
> +	}
> +
>  	ret = check_swap_activate(file, sis->max);
>  	if (ret)
>  		return ret;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index d22a4e2bb8b8..9c3399fdd6c1 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -2541,6 +2541,7 @@ enum {
>  	FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
>  	FI_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
>  	FI_COMPRESSED_FILE,	/* indicate file's data can be compressed */
> +	FI_MMAP_FILE,		/* indicate file was mmapped */
>  };
>  
>  static inline void __mark_inode_dirty_flag(struct inode *inode,
> @@ -2766,6 +2767,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode)
>  	return is_inode_flag_set(inode, FI_INLINE_DOTS);
>  }
>  
> +static inline int f2fs_is_mmap_file(struct inode *inode)
> +{
> +	return is_inode_flag_set(inode, FI_MMAP_FILE);
> +}
> +
>  static inline bool f2fs_is_pinned_file(struct inode *inode)
>  {
>  	return is_inode_flag_set(inode, FI_PIN_FILE);
> @@ -3609,7 +3615,7 @@ void f2fs_destroy_root_stats(void);
>  #define stat_inc_atomic_write(inode)			do { } while (0)
>  #define stat_dec_atomic_write(inode)			do { } while (0)
>  #define stat_inc_compr_blocks(inode)			do { } while (0)
> -#define stat_dec_compr_blocks(inode)			do { } while (0)
> +#define stat_sub_compr_blocks(inode)			do { } while (0)
>  #define stat_update_max_atomic_write(inode)		do { } while (0)
>  #define stat_inc_volatile_write(inode)			do { } while (0)
>  #define stat_dec_volatile_write(inode)			do { } while (0)
> @@ -3755,13 +3761,13 @@ static inline bool f2fs_post_read_required(struct inode *inode)
>   * compress.c
>   */
>  bool f2fs_is_compressed_page(struct page *page);
> +int is_compressed_cluster(struct compress_ctx *cc, pgoff_t index);
>  struct page *f2fs_compress_control_page(struct page *page);
>  void f2fs_reset_compress_ctx(struct compress_ctx *cc);
> -int f2fs_prepare_compress_overwrite(struct compress_ctx *cc,
> -					struct page **page_ret, pgoff_t index,
> -					void **fsdata, bool prealloc);
> -void f2fs_compress_write_end(struct inode *inode, void *fsdata,
> -							bool written);
> +int f2fs_prepare_compress_overwrite(struct inode *inode,
> +		struct page **pagep, pgoff_t index, void **fsdata);
> +bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
> +		pgoff_t index, bool written);
>  void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
>  void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
>  bool f2fs_cluster_is_empty(struct compress_ctx *cc);
> @@ -3771,7 +3777,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
>  						int *submitted,
>  						struct writeback_control *wbc,
>  						enum iostat_type io_type);
> -int f2fs_is_compressed_cluster(struct compress_ctx *cc, pgoff_t index);
> +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
>  int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>  				unsigned nr_pages, sector_t *last_block_in_bio,
>  				bool is_readahead);
> @@ -3923,6 +3929,8 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
>  		return true;
>  	if (f2fs_is_multi_device(sbi))
>  		return true;
> +	if (f2fs_compressed_file(inode))
> +		return true;
>  	/*
>  	 * for blkzoned device, fallback direct IO to buffered IO, so
>  	 * all IOs can be serialized by log-structured write.
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 8a92e8fd648c..99380c419b87 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -51,7 +51,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>  	struct dnode_of_data dn = { .node_changed = false };
> -	int err;
> +	bool need_alloc = true;
> +	int err = 0;
>  
>  	if (unlikely(f2fs_cp_error(sbi))) {
>  		err = -EIO;
> @@ -63,6 +64,18 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  		goto err;
>  	}
>  
> +	if (f2fs_compressed_file(inode)) {
> +		int ret = f2fs_is_compressed_cluster(inode, page->index);
> +
> +		if (ret < 0) {
> +			err = ret;
> +			goto err;
> +		} else if (ret) {
> +			f2fs_bug_on(sbi, ret == CLUSTER_HAS_SPACE);
> +			need_alloc = false;
> +		}
> +	}
> +
>  	sb_start_pagefault(inode->i_sb);
>  
>  	f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
> @@ -78,15 +91,17 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>  		goto out_sem;
>  	}
>  
> -	/* block allocation */
> -	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> -	set_new_dnode(&dn, inode, NULL, NULL, 0);
> -	err = f2fs_get_block(&dn, page->index);
> -	f2fs_put_dnode(&dn);
> -	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
> -	if (err) {
> -		unlock_page(page);
> -		goto out_sem;
> +	if (need_alloc) {
> +		/* block allocation */
> +		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
> +		set_new_dnode(&dn, inode, NULL, NULL, 0);
> +		err = f2fs_get_block(&dn, page->index);
> +		f2fs_put_dnode(&dn);
> +		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
> +		if (err) {
> +			unlock_page(page);
> +			goto out_sem;
> +		}
>  	}
>  
>  	/* fill the page */
> @@ -492,6 +507,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
>  
>  	file_accessed(file);
>  	vma->vm_ops = &f2fs_file_vm_ops;
> +	set_inode_flag(inode, FI_MMAP_FILE);
>  	return 0;
>  }
>  
> @@ -1781,8 +1797,18 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
>  			return -EINVAL;
>  		if (iflags & FS_NOCOMP_FL)
>  			return -EINVAL;
> -		if (S_ISREG(inode->i_mode))
> -			clear_inode_flag(inode, FI_INLINE_DATA);
> +		if (fi->i_flags & FS_COMPR_FL) {

i_flags & F2FS_COMPR_FL

Need to change all FS_{COMPR, NOCOMP}_FL to F2FS_{COMPR, NOCOMP}_FL

> +			int err = f2fs_convert_inline_inode(inode);
> +
> +			if (err)
> +				return err;
> +
> +			if (!f2fs_may_compress(inode))
> +				return -EINVAL;
> +
> +			set_inode_flag(inode, FI_COMPRESSED_FILE);
> +			stat_inc_compr_inode(inode);
> +		}
>  	}
>  	if ((iflags ^ fi->i_flags) & FS_NOCOMP_FL) {
>  		if (fi->i_flags & FS_COMPR_FL)
> @@ -1793,19 +1819,6 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
>  	f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & FS_COMPR_FL) &&
>  					(fi->i_flags & FS_NOCOMP_FL));
>  
> -	if (fi->i_flags & FS_COMPR_FL) {
> -		int err = f2fs_convert_inline_inode(inode);
> -
> -		if (err)
> -			return err;
> -
> -		if (!f2fs_may_compress(inode))
> -			return -EINVAL;
> -
> -		set_inode_flag(inode, FI_COMPRESSED_FILE);
> -		stat_inc_compr_inode(inode);
> -	}
> -
>  	if (fi->i_flags & F2FS_PROJINHERIT_FL)
>  		set_inode_flag(inode, FI_PROJ_INHERIT);
>  	else
> @@ -1988,6 +2001,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
>  
>  	inode_lock(inode);
>  
> +	if (f2fs_compressed_file(inode) && !fi->i_compressed_blocks) {
> +		fi->i_flags &= ~FS_COMPR_FL;
> +		clear_inode_flag(inode, FI_COMPRESSED_FILE);
> +		stat_dec_compr_inode(inode);
> +	}
> +
>  	if (f2fs_is_atomic_file(inode)) {
>  		if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
>  			ret = -EINVAL;
> @@ -3190,7 +3209,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
>  	}
>  
>  	if (f2fs_compressed_file(inode)) {
> -		if (F2FS_HAS_BLOCKS(inode) || i_size_read(inode)) {
> +		if (F2FS_I(inode)->i_compressed_blocks) {
>  			ret = -EOPNOTSUPP;
>  			goto out;
>  		}
> diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
> index 9f37e95c4a4b..ac0c51cefca2 100644
> --- a/fs/f2fs/namei.c
> +++ b/fs/f2fs/namei.c
> @@ -128,9 +128,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
>  				1 << F2FS_I(inode)->i_log_cluster_size;
>  
>  		/* Inherit the compression flag in directory */
> -		if ((F2FS_I(inode)->i_flags & FS_COMPR_FL) &&
> -					f2fs_may_compress(inode))
> +		if ((F2FS_I(dir)->i_flags & FS_COMPR_FL) &&
> +					f2fs_may_compress(inode)) {
> +			F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
>  			set_inode_flag(inode, FI_COMPRESSED_FILE);
> +		}
>  	}
>  
>  	f2fs_set_inode_flags(inode);
> @@ -282,6 +284,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
>  static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
>  						const unsigned char *name)
>  {
> +	__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
>  	unsigned char (*ext)[F2FS_EXTENSION_LEN];
>  	unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
>  	int i, cold_count, hot_count;
> @@ -292,13 +295,24 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
>  			!f2fs_may_compress(inode))
>  		return;
>  
> +	down_read(&sbi->sb_lock);
> +
>  	ext = F2FS_OPTION(sbi).extensions;
>  
>  	cold_count = le32_to_cpu(sbi->raw_super->extension_count);
>  	hot_count = sbi->raw_super->hot_ext_count;
>  
> +	for (i = cold_count; i < cold_count + hot_count; i++) {
> +		if (is_extension_exist(name, extlist[i])) {
> +			up_read(&sbi->sb_lock);
> +			return;
> +		}
> +	}
> +
> +	up_read(&sbi->sb_lock);
> +
>  	for (i = 0; i < ext_cnt; i++) {
> -		if (is_extension_exist(name, ext[i]) && !file_is_hot(inode)) {
> +		if (is_extension_exist(name, ext[i])) {
>  			F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
>  			set_inode_flag(inode, FI_COMPRESSED_FILE);
>  			return;
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file
  2019-10-31 15:29           ` Jaegeuk Kim
@ 2019-11-05  3:39             ` Chao Yu
  0 siblings, 0 replies; 32+ messages in thread
From: Chao Yu @ 2019-11-05  3:39 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

On 2019/10/31 23:29, Jaegeuk Kim wrote:
> On 10/31, Chao Yu wrote:
>> On 2019/10/31 0:09, Jaegeuk Kim wrote:
>>> On 10/26, Chao Yu wrote:
>>>> On 2019/10/26 2:18, Jaegeuk Kim wrote:
>>>>> On 10/24, Chao Yu wrote:
>>>>>> Hi Jaegeuk,
>>>>>>
>>>>>> On 2019/10/23 1:16, Jaegeuk Kim wrote:
>>>>>>> This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
>>>>>>> by allocating fully valid 2MB segment.
>>>>>>>
>>>>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>>>>>>> ---
>>>>>>>  fs/f2fs/f2fs.h     |  4 +++-
>>>>>>>  fs/f2fs/file.c     | 39 ++++++++++++++++++++++++++++++++++-----
>>>>>>>  fs/f2fs/recovery.c |  2 +-
>>>>>>>  fs/f2fs/segment.c  | 21 ++++++++++++++++++++-
>>>>>>>  fs/f2fs/segment.h  |  2 ++
>>>>>>>  fs/f2fs/super.c    |  1 +
>>>>>>>  fs/f2fs/sysfs.c    |  2 ++
>>>>>>>  7 files changed, 63 insertions(+), 8 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>>>>> index ca342f4c7db1..c681f51e351b 100644
>>>>>>> --- a/fs/f2fs/f2fs.h
>>>>>>> +++ b/fs/f2fs/f2fs.h
>>>>>>> @@ -890,6 +890,7 @@ enum {
>>>>>>>  	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
>>>>>>>  	CURSEG_COLD_NODE,	/* indirect node blocks */
>>>>>>>  	NO_CHECK_TYPE,
>>>>>>> +	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
>>>>>>>  };
>>>>>>>  
>>>>>>>  struct flush_cmd {
>>>>>>> @@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
>>>>>>>  
>>>>>>>  	/* threshold for gc trials on pinned files */
>>>>>>>  	u64 gc_pin_file_threshold;
>>>>>>> +	struct rw_semaphore pin_sem;
>>>>>>>  
>>>>>>>  	/* maximum # of trials to find a victim segment for SSR and GC */
>>>>>>>  	unsigned int max_victim_search;
>>>>>>> @@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
>>>>>>>  int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
>>>>>>>  void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>>>>>>>  					unsigned int start, unsigned int end);
>>>>>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
>>>>>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
>>>>>>>  int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
>>>>>>>  bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
>>>>>>>  					struct cp_control *cpc);
>>>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>>>> index 29bc0a542759..f6c038e8a6a7 100644
>>>>>>> --- a/fs/f2fs/file.c
>>>>>>> +++ b/fs/f2fs/file.c
>>>>>>> @@ -1545,12 +1545,41 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
>>>>>>>  	if (off_end)
>>>>>>>  		map.m_len++;
>>>>>>>  
>>>>>>> -	if (f2fs_is_pinned_file(inode))
>>>>>>> -		map.m_seg_type = CURSEG_COLD_DATA;
>>>>>>> +	if (!map.m_len)
>>>>>>> +		return 0;
>>>>>>> +
>>>>>>> +	if (f2fs_is_pinned_file(inode)) {
>>>>>>> +		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
>>>>>>> +					sbi->log_blocks_per_seg;
>>>>>>> +		block_t done = 0;
>>>>>>> +
>>>>>>> +		if (map.m_len % sbi->blocks_per_seg)
>>>>>>> +			len += sbi->blocks_per_seg;
>>>>>>>  
>>>>>>> -	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
>>>>>>> -						F2FS_GET_BLOCK_PRE_DIO :
>>>>>>> -						F2FS_GET_BLOCK_PRE_AIO));
>>>>>>> +		map.m_len = sbi->blocks_per_seg;
>>>>>>> +next_alloc:
>>>>>>> +		mutex_lock(&sbi->gc_mutex);
>>>>>>> +		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
>>>>>>> +		if (err && err != -ENODATA && err != -EAGAIN)
>>>>>>> +			goto out_err;
>>>>>>
>>>>>> To grab enough free space?
>>>>>>
>>>>>> Shouldn't we call
>>>>>>
>>>>>> 	if (has_not_enough_free_secs(sbi, 0, 0)) {
>>>>>> 		mutex_lock(&sbi->gc_mutex);
>>>>>> 		f2fs_gc(sbi, false, false, NULL_SEGNO);
>>>>>> 	}
>>>>>
>>>>> The above calls gc all the time. Do we need this?
>>>>
>>>> Hmmm... my concern is why we need to run foreground GC even if there is enough
>>>> free space..
>>>
>>> In order to get the free segment easily?
>>
>> However, I doubt arbitrary foreground GC with greedy algorithm will ruin
>> hot/cold data separation, actually, for sufficient free segment case, it's
>> unnecessary to call FGGC.
> 
> Two things here; 1) I do worry much about when hitting boundary on
> has_not_enough_free_secs() which calculates # of free segments based on # of
> dirty pages. In this case, we just jump to allocate another free segment so
> I think it increases the possibility of no free segment panic. 2) Even if we

Yup, I guess for other places, if there are thousands of threads allocating space
concurrently, we may have a small probability of hitting an out-of-free-space issue;
the probability can increase as the partition size decreases.

So I think the right way to fix all out-of-free-space issues is to use
reservation mechanism in our interface,  e.g.

f2fs_create()
{
	int reserved_block = 1 + 1;
	/*
	 * reserved blocks we may dirty/allocate in create flow:
	 * 1 block: new inode block;
	 * 1 block: parent's dent block;
	 */

	f2fs_get_budget(reserved_block); //sbi->reserved_block += 2;

	f2fs_balance_fs(sbi); //has_not_enough_free_secs(): reserved_secs += get_secs(,
sbi->reserved_block);

	f2fs_add_link();
	/* inode meta is dirty; dent block is dirty */

	f2fs_release_budget(reserved_block); //sbi->reserved_block -= 2;
}

expand_inode_data() can switch to use this mechanism to avoid that issue.

> do call FGGC a lot, I don't think it will *ruin* the hot/cold data separation
> a lot. Putting hot/warm blocks together into cold log will make another hot
> segment which was being used as cold log. IOWs, we don't need to keep hot data
> in hot log at all, but should be fine to split hot and cold data in different

With below codes, we are trying to mix cold data into warm data log with block
granularity, rather than segment granularity.

	if (type == CURSEG_COLD_DATA) {
		/* GC during CURSEG_COLD_DATA_PINNED allocation */
		if (down_read_trylock(&sbi->pin_sem)) {
			put_pin_sem = true;
		} else {
			type = CURSEG_WARM_DATA;
			curseg = CURSEG_I(sbi, type);
		}

This could cause more GC cycles on such mixed segments, or am I missing something?

Thanks,

> segments. So, I chose to go safer way since this is eating free segments
> directly.
> 
>>
>> Thanks,
>>
>>>
>>>>
>>>>>
>>>>>>
>>>>>>> +
>>>>>>> +		down_write(&sbi->pin_sem);
>>>>>>> +		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
>>>>>>> +		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
>>>>>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
>>>>>>> +		up_write(&sbi->pin_sem);
>>>>>>> +
>>>>>>> +		done += map.m_len;
>>>>>>> +		len -= map.m_len;
>>>>>>> +		map.m_lblk += map.m_len;
>>>>>>> +		if (!err && len)
>>>>>>> +			goto next_alloc;
>>>>>>> +
>>>>>>> +		map.m_len = done;
>>>>>>> +	} else {
>>>>>>> +		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
>>>>>>> +	}
>>>>>>> +out_err:
>>>>>>>  	if (err) {
>>>>>>>  		pgoff_t last_off;
>>>>>>>  
>>>>>>> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
>>>>>>> index 783773e4560d..76477f71d4ee 100644
>>>>>>> --- a/fs/f2fs/recovery.c
>>>>>>> +++ b/fs/f2fs/recovery.c
>>>>>>> @@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
>>>>>>>  		f2fs_put_page(page, 1);
>>>>>>>  	}
>>>>>>>  	if (!err)
>>>>>>> -		f2fs_allocate_new_segments(sbi);
>>>>>>> +		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
>>>>>>>  	return err;
>>>>>>>  }
>>>>>>>  
>>>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>>>>>> index 25c750cd0272..253d72c2663c 100644
>>>>>>> --- a/fs/f2fs/segment.c
>>>>>>> +++ b/fs/f2fs/segment.c
>>>>>>> @@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
>>>>>>>  	up_read(&SM_I(sbi)->curseg_lock);
>>>>>>>  }
>>>>>>>  
>>>>>>> -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>>>>>>> +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
>>>>>>>  {
>>>>>>>  	struct curseg_info *curseg;
>>>>>>>  	unsigned int old_segno;
>>>>>>> @@ -2699,6 +2699,9 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
>>>>>>>  	down_write(&SIT_I(sbi)->sentry_lock);
>>>>>>>  
>>>>>>>  	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
>>>>>>> +		if (type != NO_CHECK_TYPE && i != type)
>>>>>>> +			continue;
>>>>>>> +
>>>>>>>  		curseg = CURSEG_I(sbi, i);
>>>>>>>  		old_segno = curseg->segno;
>>>>>>>  		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
>>>>>>> @@ -3068,6 +3071,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>>>>>>>  {
>>>>>>>  	struct sit_info *sit_i = SIT_I(sbi);
>>>>>>>  	struct curseg_info *curseg = CURSEG_I(sbi, type);
>>>>>>> +	bool put_pin_sem = false;
>>>>>>> +
>>>>>>> +	if (type == CURSEG_COLD_DATA) {
>>>>>>> +		/* GC during CURSEG_COLD_DATA_PINNED allocation */
>>>>>>> +		if (down_read_trylock(&sbi->pin_sem)) {
>>>>>>> +			put_pin_sem = true;
>>>>>>> +		} else {
>>>>>>> +			type = CURSEG_WARM_DATA;
>>>>>>> +			curseg = CURSEG_I(sbi, type);
>>>>>>
>>>>>> It will mix pending cold data into warm area... rather than recovering curseg to
>>>>>> write pointer of last cold segment?
>>>>>>
>>>>>> I know maybe that fallocate aligned address could be corner case, but I guess
>>>>>> there should be some better solutions can handle race case more effectively.
>>>>>>
>>>>>> One solution could be: allocating a virtual log header to select free segment as
>>>>>> 2m-aligned space target.
>>>>>
>>>>> I thought about that, but concluded to avoid too much changes.
>>>>
>>>> We have an unupstreamed feature which is based on virtual log header, I can
>>>> introduce that basic virtual log fwk, which can be used for aligned allocation
>>>> and later new features, would you like to check that?
>>>>
>>>> Thanks,
>>>>
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>>> +		}
>>>>>>> +	} else if (type == CURSEG_COLD_DATA_PINNED) {
>>>>>>> +		type = CURSEG_COLD_DATA;
>>>>>>> +	}
>>>>>>>  
>>>>>>>  	down_read(&SM_I(sbi)->curseg_lock);
>>>>>>>  
>>>>>>> @@ -3133,6 +3149,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
>>>>>>>  	mutex_unlock(&curseg->curseg_mutex);
>>>>>>>  
>>>>>>>  	up_read(&SM_I(sbi)->curseg_lock);
>>>>>>> +
>>>>>>> +	if (put_pin_sem)
>>>>>>> +		up_read(&sbi->pin_sem);
>>>>>>>  }
>>>>>>>  
>>>>>>>  static void update_device_state(struct f2fs_io_info *fio)
>>>>>>> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
>>>>>>> index 325781a1ae4d..a95467b202ea 100644
>>>>>>> --- a/fs/f2fs/segment.h
>>>>>>> +++ b/fs/f2fs/segment.h
>>>>>>> @@ -313,6 +313,8 @@ struct sit_entry_set {
>>>>>>>   */
>>>>>>>  static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
>>>>>>>  {
>>>>>>> +	if (type == CURSEG_COLD_DATA_PINNED)
>>>>>>> +		type = CURSEG_COLD_DATA;
>>>>>>>  	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
>>>>>>>  }
>>>>>>>  
>>>>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>>>>> index f320fd11db48..c02a47ce551b 100644
>>>>>>> --- a/fs/f2fs/super.c
>>>>>>> +++ b/fs/f2fs/super.c
>>>>>>> @@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
>>>>>>>  	spin_lock_init(&sbi->dev_lock);
>>>>>>>  
>>>>>>>  	init_rwsem(&sbi->sb_lock);
>>>>>>> +	init_rwsem(&sbi->pin_sem);
>>>>>>>  }
>>>>>>>  
>>>>>>>  static int init_percpu_info(struct f2fs_sb_info *sbi)
>>>>>>> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
>>>>>>> index b558b64a4c9c..f164959e4224 100644
>>>>>>> --- a/fs/f2fs/sysfs.c
>>>>>>> +++ b/fs/f2fs/sysfs.c
>>>>>>> @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
>>>>>>>  	if (f2fs_sb_has_casefold(sbi))
>>>>>>>  		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>>>>>>>  				len ? ", " : "", "casefold");
>>>>>>> +	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
>>>>>>> +				len ? ", " : "", "pin_file");
>>>>>>>  	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
>>>>>>>  	return len;
>>>>>>>  }
>>>>>>>
>>>>> .
>>>>>
>>> .
>>>
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 1/2 v2] f2fs: support aligned pinned file
  2019-10-22 17:16 [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Jaegeuk Kim
  2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
  2019-10-24  8:21 ` [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Chao Yu
@ 2019-11-07 19:14 ` " Jaegeuk Kim
  2 siblings, 0 replies; 32+ messages in thread
From: Jaegeuk Kim @ 2019-11-07 19:14 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel

This patch supports 2MB-aligned pinned file, which can guarantee no GC at all
by allocating fully valid 2MB segment.

Check free segments by has_not_enough_free_secs() with large budget.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
v2 from v1:
 - avoid to allocate gc'ed segment and pinned segment consecutively
 - avoid to allocate SSR segment

 fs/f2fs/f2fs.h     |  4 +++-
 fs/f2fs/file.c     | 42 +++++++++++++++++++++++++++++++++++++-----
 fs/f2fs/recovery.c |  2 +-
 fs/f2fs/segment.c  | 31 +++++++++++++++++++++++++++----
 fs/f2fs/segment.h  |  2 ++
 fs/f2fs/super.c    |  1 +
 fs/f2fs/sysfs.c    |  2 ++
 7 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ca342f4c7db1..c681f51e351b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -890,6 +890,7 @@ enum {
 	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
 	CURSEG_COLD_NODE,	/* indirect node blocks */
 	NO_CHECK_TYPE,
+	CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
 };
 
 struct flush_cmd {
@@ -1301,6 +1302,7 @@ struct f2fs_sb_info {
 
 	/* threshold for gc trials on pinned files */
 	u64 gc_pin_file_threshold;
+	struct rw_semaphore pin_sem;
 
 	/* maximum # of trials to find a victim segment for SSR and GC */
 	unsigned int max_victim_search;
@@ -3116,7 +3118,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
 void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 					unsigned int start, unsigned int end);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
 					struct cp_control *cpc);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 29bc0a542759..c31a5bbc8090 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1545,12 +1545,44 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 	if (off_end)
 		map.m_len++;
 
-	if (f2fs_is_pinned_file(inode))
-		map.m_seg_type = CURSEG_COLD_DATA;
+	if (!map.m_len)
+		return 0;
+
+	if (f2fs_is_pinned_file(inode)) {
+		block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
+					sbi->log_blocks_per_seg;
+		block_t done = 0;
+
+		if (map.m_len % sbi->blocks_per_seg)
+			len += sbi->blocks_per_seg;
+
+		map.m_len = sbi->blocks_per_seg;
+next_alloc:
+		if (has_not_enough_free_secs(sbi, 0,
+			GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
+			mutex_lock(&sbi->gc_mutex);
+			err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+			if (err && err != -ENODATA && err != -EAGAIN)
+				goto out_err;
+		}
 
-	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
-						F2FS_GET_BLOCK_PRE_DIO :
-						F2FS_GET_BLOCK_PRE_AIO));
+		down_write(&sbi->pin_sem);
+		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
+		f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
+		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+		up_write(&sbi->pin_sem);
+
+		done += map.m_len;
+		len -= map.m_len;
+		map.m_lblk += map.m_len;
+		if (!err && len)
+			goto next_alloc;
+
+		map.m_len = done;
+	} else {
+		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+	}
+out_err:
 	if (err) {
 		pgoff_t last_off;
 
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 783773e4560d..76477f71d4ee 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -711,7 +711,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
 		f2fs_put_page(page, 1);
 	}
 	if (!err)
-		f2fs_allocate_new_segments(sbi);
+		f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
 	return err;
 }
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 25c750cd0272..8bb37f8a1845 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2690,7 +2690,7 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 	up_read(&SM_I(sbi)->curseg_lock);
 }
 
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
 {
 	struct curseg_info *curseg;
 	unsigned int old_segno;
@@ -2699,10 +2699,17 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
 	down_write(&SIT_I(sbi)->sentry_lock);
 
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+		if (type != NO_CHECK_TYPE && i != type)
+			continue;
+
 		curseg = CURSEG_I(sbi, i);
-		old_segno = curseg->segno;
-		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
-		locate_dirty_segment(sbi, old_segno);
+		if (type == NO_CHECK_TYPE || curseg->next_blkoff ||
+				get_valid_blocks(sbi, curseg->segno, false) ||
+				get_ckpt_valid_blocks(sbi, curseg->segno)) {
+			old_segno = curseg->segno;
+			SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+			locate_dirty_segment(sbi, old_segno);
+		}
 	}
 
 	up_write(&SIT_I(sbi)->sentry_lock);
@@ -3068,6 +3075,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	bool put_pin_sem = false;
+
+	if (type == CURSEG_COLD_DATA) {
+		/* GC during CURSEG_COLD_DATA_PINNED allocation */
+		if (down_read_trylock(&sbi->pin_sem)) {
+			put_pin_sem = true;
+		} else {
+			type = CURSEG_WARM_DATA;
+			curseg = CURSEG_I(sbi, type);
+		}
+	} else if (type == CURSEG_COLD_DATA_PINNED) {
+		type = CURSEG_COLD_DATA;
+	}
 
 	down_read(&SM_I(sbi)->curseg_lock);
 
@@ -3133,6 +3153,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	mutex_unlock(&curseg->curseg_mutex);
 
 	up_read(&SM_I(sbi)->curseg_lock);
+
+	if (put_pin_sem)
+		up_read(&sbi->pin_sem);
 }
 
 static void update_device_state(struct f2fs_io_info *fio)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 325781a1ae4d..a95467b202ea 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -313,6 +313,8 @@ struct sit_entry_set {
  */
 static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
 {
+	if (type == CURSEG_COLD_DATA_PINNED)
+		type = CURSEG_COLD_DATA;
 	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
 }
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f320fd11db48..c02a47ce551b 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2853,6 +2853,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	spin_lock_init(&sbi->dev_lock);
 
 	init_rwsem(&sbi->sb_lock);
+	init_rwsem(&sbi->pin_sem);
 }
 
 static int init_percpu_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index b558b64a4c9c..f164959e4224 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
 	if (f2fs_sb_has_casefold(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "casefold");
+	len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "pin_file");
 	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
 	return len;
 }
-- 
2.19.0.605.g01d371f741-goog



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-10-30 17:02           ` Eric Biggers
  2019-10-31  2:21             ` Chao Yu
@ 2019-11-13 13:10             ` Chao Yu
  2019-11-18 16:11               ` Jaegeuk Kim
  1 sibling, 1 reply; 32+ messages in thread
From: Chao Yu @ 2019-11-13 13:10 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: Eric Biggers, linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

I've split the workqueue for fsverity; please test compression based on the last patch.

I disabled the F2FS_FS_COMPRESSION config; it looks like all the verity testcases can pass. I will
do more testing of the compress/encrypt/fsverity combination later.

The diff is as below, code base is last g-dev-test branch:

From 5b51682bc3013b8de6dee4906865181c3ded435f Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 12 Nov 2019 10:03:21 +0800
Subject: [PATCH INCREMENT] f2fs: compress: split workqueue for fsverity

Signed-off-by: Chao Yu <yuchao0@huawei.com>
---
 fs/f2fs/compress.c | 16 +++++---
 fs/f2fs/data.c     | 94 +++++++++++++++++++++++++++++++++++-----------
 fs/f2fs/f2fs.h     |  2 +-
 3 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index f4ce825f12b4..254275325890 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -377,7 +377,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)

 	dec_page_count(sbi, F2FS_RD_DATA);

-	if (bio->bi_status)
+	if (bio->bi_status || PageError(page))
 		dic->failed = true;

 	if (refcount_dec_not_one(&dic->ref))
@@ -419,10 +419,14 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 out_vunmap_rbuf:
 	vunmap(dic->rbuf);
 out_free_dic:
-	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
+	if (!verity)
+		f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
+								ret, false);
+
 	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
-								dic->clen, ret);
-	f2fs_free_dic(dic);
+							dic->clen, ret);
+	if (!verity)
+		f2fs_free_dic(dic);
 }

 static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -1086,7 +1090,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
 	kfree(dic);
 }

-void f2fs_set_cluster_uptodate(struct page **rpages,
+void f2fs_decompress_end_io(struct page **rpages,
 			unsigned int cluster_size, bool err, bool verity)
 {
 	int i;
@@ -1108,4 +1112,4 @@ void f2fs_set_cluster_uptodate(struct page **rpages,
 		}
 		unlock_page(rpage);
 	}
-}
+}
\ No newline at end of file
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c9362a53f8a1..2d64c6ffee84 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -98,7 +98,7 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
 		page = bv->bv_page;

 #ifdef CONFIG_F2FS_FS_COMPRESSION
-		if (compr && PagePrivate(page)) {
+		if (compr && f2fs_is_compressed_page(page)) {
 			f2fs_decompress_pages(bio, page, verity);
 			continue;
 		}
@@ -115,9 +115,14 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
 		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
 		unlock_page(page);
 	}
-	if (bio->bi_private)
-		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
-	bio_put(bio);
+}
+
+static void f2fs_release_read_bio(struct bio *bio);
+static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
+{
+	if (!compr)
+		__read_end_io(bio, false, verity);
+	f2fs_release_read_bio(bio);
 }

 static void f2fs_decompress_bio(struct bio *bio, bool verity)
@@ -127,19 +132,50 @@ static void f2fs_decompress_bio(struct bio *bio, bool verity)

 static void bio_post_read_processing(struct bio_post_read_ctx *ctx);

-static void decrypt_work(struct bio_post_read_ctx *ctx)
+static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
 {
 	fscrypt_decrypt_bio(ctx->bio);
 }

-static void decompress_work(struct bio_post_read_ctx *ctx, bool verity)
+static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
+{
+	f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
+}
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
 {
-	f2fs_decompress_bio(ctx->bio, verity);
+	f2fs_decompress_end_io(rpages, cluster_size, false, true);
 }

-static void verity_work(struct bio_post_read_ctx *ctx)
+static void f2fs_verify_bio(struct bio *bio)
 {
+	struct page *page = bio_first_page_all(bio);
+	struct decompress_io_ctx *dic =
+			(struct decompress_io_ctx *)page_private(page);
+
+	f2fs_verify_pages(dic->rpages, dic->cluster_size);
+	f2fs_free_dic(dic);
+}
+#endif
+
+static void f2fs_verity_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+	/* previous step is decompression */
+	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
+
+		f2fs_verify_bio(ctx->bio);
+		f2fs_release_read_bio(ctx->bio);
+		return;
+	}
+#endif
+
 	fsverity_verify_bio(ctx->bio);
+	__f2fs_read_end_io(ctx->bio, false, false);
 }

 static void f2fs_post_read_work(struct work_struct *work)
@@ -148,18 +184,19 @@ static void f2fs_post_read_work(struct work_struct *work)
 		container_of(work, struct bio_post_read_ctx, work);

 	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
-		decrypt_work(ctx);
+		f2fs_decrypt_work(ctx);

-	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
-		decompress_work(ctx,
-			ctx->enabled_steps & (1 << STEP_VERITY));
+	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
+		f2fs_decompress_work(ctx);
+
+	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+		INIT_WORK(&ctx->work, f2fs_verity_work);
+		fsverity_enqueue_verify_work(&ctx->work);
 		return;
 	}

-	if (ctx->enabled_steps & (1 << STEP_VERITY))
-		verity_work(ctx);
-
-	__read_end_io(ctx->bio, false, false);
+	__f2fs_read_end_io(ctx->bio,
+		ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
 }

 static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
@@ -176,12 +213,20 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
 	 * we shouldn't recurse to the same workqueue.
 	 */

-	if (ctx->enabled_steps) {
+	if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
+		ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
 		INIT_WORK(&ctx->work, f2fs_post_read_work);
 		f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
 		return;
 	}
-	__read_end_io(ctx->bio, false, false);
+
+	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+		INIT_WORK(&ctx->work, f2fs_verity_work);
+		fsverity_enqueue_verify_work(&ctx->work);
+		return;
+	}
+
+	__f2fs_read_end_io(ctx->bio, false, false);
 }

 static bool f2fs_bio_post_read_required(struct bio *bio)
@@ -205,7 +250,7 @@ static void f2fs_read_end_io(struct bio *bio)
 		return;
 	}

-	__read_end_io(bio, false, false);
+	__f2fs_read_end_io(bio, false, false);
 }

 static void f2fs_write_end_io(struct bio *bio)
@@ -864,6 +909,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	return bio;
 }

+static void f2fs_release_read_bio(struct bio *bio)
+{
+	if (bio->bi_private)
+		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+	bio_put(bio);
+}
+
 /* This can handle encryption stuffs */
 static int f2fs_submit_page_read(struct inode *inode, struct page *page,
 							block_t blkaddr)
@@ -2023,7 +2075,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 				dic->failed = true;
 				if (refcount_sub_and_test(dic->nr_cpages - i,
 							&dic->ref))
-					f2fs_set_cluster_uptodate(dic->rpages,
+					f2fs_decompress_end_io(dic->rpages,
 							cc->cluster_size, true,
 							false);
 				f2fs_free_dic(dic);
@@ -2053,7 +2105,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 out_put_dnode:
 	f2fs_put_dnode(&dn);
 out:
-	f2fs_set_cluster_uptodate(cc->rpages, cc->cluster_size, true, false);
+	f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
 	f2fs_destroy_compress_ctx(cc);
 	*bio_ret = bio;
 	return ret;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8a3a35b42a37..20067fa3b035 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3795,7 +3795,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 				bool is_readahead);
 struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
 void f2fs_free_dic(struct decompress_io_ctx *dic);
-void f2fs_set_cluster_uptodate(struct page **rpages,
+void f2fs_decompress_end_io(struct page **rpages,
 			unsigned int cluster_size, bool err, bool verity);
 int f2fs_init_compress_ctx(struct compress_ctx *cc);
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
-- 
2.18.0.rc1



On 2019/10/31 1:02, Eric Biggers wrote:
> On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
>>>>>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
>>>>>>  {
>>>>>> -	/*
>>>>>> -	 * We use different work queues for decryption and for verity because
>>>>>> -	 * verity may require reading metadata pages that need decryption, and
>>>>>> -	 * we shouldn't recurse to the same workqueue.
>>>>>> -	 */
>>>>>
>>>>> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
>>>>> decryption and for verity?  See the comment above which is being deleted.
>>>>
>>>> Could you explain more about how deadlock happen? or share me a link address if
>>>> you have described that case somewhere?
>>>>
>>>
>>> The verity work can read pages from the file which require decryption.  I'm
>>> concerned that it could deadlock if the work is scheduled on the same workqueue.
>>
>> I assume you've tried one workqueue, and suffered deadlock..
>>
>>> Granted, I'm not an expert in Linux workqueues, so if you've investigated this
>>> and determined that it's safe, can you explain why?
>>
>> I'm not familiar with workqueue...  I guess it may not safe that if the work is
>> scheduled to the same cpu in where verity was waiting for data? if the work is
>> scheduled to other cpu, it may be safe.
>>
>> I can check that before splitting the workqueue for verity and decrypt/decompress.
>>
> 
> Yes this is a real problem, try 'kvm-xfstests -c f2fs/encrypt generic/579'.
> The worker thread gets deadlocked in f2fs_read_merkle_tree_page() waiting for
> the Merkle tree page to be decrypted.  This is with the v2 compression patch;
> it works fine on current mainline.
> 
> INFO: task kworker/u5:0:61 blocked for more than 30 seconds.
>       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u5:0    D    0    61      2 0x80004000
> Workqueue: f2fs_post_read_wq f2fs_post_read_work
> Call Trace:
>  context_switch kernel/sched/core.c:3384 [inline]
>  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
>  schedule+0x44/0xd0 kernel/sched/core.c:4136
>  io_schedule+0x11/0x40 kernel/sched/core.c:5780
>  wait_on_page_bit_common mm/filemap.c:1174 [inline]
>  wait_on_page_bit mm/filemap.c:1223 [inline]
>  wait_on_page_locked include/linux/pagemap.h:527 [inline]
>  wait_on_page_locked include/linux/pagemap.h:524 [inline]
>  wait_on_page_read mm/filemap.c:2767 [inline]
>  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
>  read_cache_page+0xd/0x10 mm/filemap.c:2894
>  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
>  verify_page+0x110/0x560 fs/verity/verify.c:120
>  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
>  verity_work fs/f2fs/data.c:142 [inline]
>  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
>  process_one_work+0x225/0x550 kernel/workqueue.c:2269
>  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
>  kthread+0x125/0x140 kernel/kthread.c:255
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> INFO: task kworker/u5:1:1140 blocked for more than 30 seconds.
>       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u5:1    D    0  1140      2 0x80004000
> Workqueue: f2fs_post_read_wq f2fs_post_read_work
> Call Trace:
>  context_switch kernel/sched/core.c:3384 [inline]
>  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
>  schedule+0x44/0xd0 kernel/sched/core.c:4136
>  io_schedule+0x11/0x40 kernel/sched/core.c:5780
>  wait_on_page_bit_common mm/filemap.c:1174 [inline]
>  wait_on_page_bit mm/filemap.c:1223 [inline]
>  wait_on_page_locked include/linux/pagemap.h:527 [inline]
>  wait_on_page_locked include/linux/pagemap.h:524 [inline]
>  wait_on_page_read mm/filemap.c:2767 [inline]
>  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
>  read_cache_page+0xd/0x10 mm/filemap.c:2894
>  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
>  verify_page+0x110/0x560 fs/verity/verify.c:120
>  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
>  verity_work fs/f2fs/data.c:142 [inline]
>  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
>  process_one_work+0x225/0x550 kernel/workqueue.c:2269
>  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
>  kthread+0x125/0x140 kernel/kthread.c:255
>  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> 
> Showing all locks held in the system:
> 1 lock held by khungtaskd/21:
>  #0: ffffffff82250520 (rcu_read_lock){....}, at: rcu_lock_acquire.constprop.0+0x0/0x30 include/trace/events/lock.h:13
> 2 locks held by kworker/u5:0/61:
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
>  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> 2 locks held by kworker/u5:1/1140:
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
>  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
>  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
>  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> .
> 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-11-13 13:10             ` Chao Yu
@ 2019-11-18 16:11               ` Jaegeuk Kim
  2019-11-18 20:58                 ` Jaegeuk Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Jaegeuk Kim @ 2019-11-18 16:11 UTC (permalink / raw)
  To: Chao Yu; +Cc: Eric Biggers, linux-kernel, linux-f2fs-devel

On 11/13, Chao Yu wrote:
> Hi Jaegeuk,
> 
> I've split workqueue for fsverity, please test compression based on last patch.
> 
> I shutdown F2FS_FS_COMPRESSION config, it looks all verity testcases can pass, will
> do more test for compress/encrypt/fsverity combination later.

Thanks, I applied it and started some tests.

> 
> The diff is as below, code base is last g-dev-test branch:
> 
> >From 5b51682bc3013b8de6dee4906865181c3ded435f Mon Sep 17 00:00:00 2001
> From: Chao Yu <yuchao0@huawei.com>
> Date: Tue, 12 Nov 2019 10:03:21 +0800
> Subject: [PATCH INCREMENT] f2fs: compress: split workqueue for fsverity
> 
> Signed-off-by: Chao Yu <yuchao0@huawei.com>
> ---
>  fs/f2fs/compress.c | 16 +++++---
>  fs/f2fs/data.c     | 94 +++++++++++++++++++++++++++++++++++-----------
>  fs/f2fs/f2fs.h     |  2 +-
>  3 files changed, 84 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
> index f4ce825f12b4..254275325890 100644
> --- a/fs/f2fs/compress.c
> +++ b/fs/f2fs/compress.c
> @@ -377,7 +377,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
> 
>  	dec_page_count(sbi, F2FS_RD_DATA);
> 
> -	if (bio->bi_status)
> +	if (bio->bi_status || PageError(page))
>  		dic->failed = true;
> 
>  	if (refcount_dec_not_one(&dic->ref))
> @@ -419,10 +419,14 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
>  out_vunmap_rbuf:
>  	vunmap(dic->rbuf);
>  out_free_dic:
> -	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
> +	if (!verity)
> +		f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
> +								ret, false);
> +
>  	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
> -								dic->clen, ret);
> -	f2fs_free_dic(dic);
> +							dic->clen, ret);
> +	if (!verity)
> +		f2fs_free_dic(dic);
>  }
> 
>  static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
> @@ -1086,7 +1090,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
>  	kfree(dic);
>  }
> 
> -void f2fs_set_cluster_uptodate(struct page **rpages,
> +void f2fs_decompress_end_io(struct page **rpages,
>  			unsigned int cluster_size, bool err, bool verity)
>  {
>  	int i;
> @@ -1108,4 +1112,4 @@ void f2fs_set_cluster_uptodate(struct page **rpages,
>  		}
>  		unlock_page(rpage);
>  	}
> -}
> +}
> \ No newline at end of file
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index c9362a53f8a1..2d64c6ffee84 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -98,7 +98,7 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
>  		page = bv->bv_page;
> 
>  #ifdef CONFIG_F2FS_FS_COMPRESSION
> -		if (compr && PagePrivate(page)) {
> +		if (compr && f2fs_is_compressed_page(page)) {
>  			f2fs_decompress_pages(bio, page, verity);
>  			continue;
>  		}
> @@ -115,9 +115,14 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
>  		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
>  		unlock_page(page);
>  	}
> -	if (bio->bi_private)
> -		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
> -	bio_put(bio);
> +}
> +
> +static void f2fs_release_read_bio(struct bio *bio);
> +static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
> +{
> +	if (!compr)
> +		__read_end_io(bio, false, verity);
> +	f2fs_release_read_bio(bio);
>  }
> 
>  static void f2fs_decompress_bio(struct bio *bio, bool verity)
> @@ -127,19 +132,50 @@ static void f2fs_decompress_bio(struct bio *bio, bool verity)
> 
>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
> 
> -static void decrypt_work(struct bio_post_read_ctx *ctx)
> +static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
>  {
>  	fscrypt_decrypt_bio(ctx->bio);
>  }
> 
> -static void decompress_work(struct bio_post_read_ctx *ctx, bool verity)
> +static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
> +{
> +	f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
> +}
> +
> +#ifdef CONFIG_F2FS_FS_COMPRESSION
> +void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
>  {
> -	f2fs_decompress_bio(ctx->bio, verity);
> +	f2fs_decompress_end_io(rpages, cluster_size, false, true);
>  }
> 
> -static void verity_work(struct bio_post_read_ctx *ctx)
> +static void f2fs_verify_bio(struct bio *bio)
>  {
> +	struct page *page = bio_first_page_all(bio);
> +	struct decompress_io_ctx *dic =
> +			(struct decompress_io_ctx *)page_private(page);
> +
> +	f2fs_verify_pages(dic->rpages, dic->cluster_size);
> +	f2fs_free_dic(dic);
> +}
> +#endif
> +
> +static void f2fs_verity_work(struct work_struct *work)
> +{
> +	struct bio_post_read_ctx *ctx =
> +		container_of(work, struct bio_post_read_ctx, work);
> +
> +#ifdef CONFIG_F2FS_FS_COMPRESSION
> +	/* previous step is decompression */
> +	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
> +
> +		f2fs_verify_bio(ctx->bio);
> +		f2fs_release_read_bio(ctx->bio);
> +		return;
> +	}
> +#endif
> +
>  	fsverity_verify_bio(ctx->bio);
> +	__f2fs_read_end_io(ctx->bio, false, false);
>  }
> 
>  static void f2fs_post_read_work(struct work_struct *work)
> @@ -148,18 +184,19 @@ static void f2fs_post_read_work(struct work_struct *work)
>  		container_of(work, struct bio_post_read_ctx, work);
> 
>  	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
> -		decrypt_work(ctx);
> +		f2fs_decrypt_work(ctx);
> 
> -	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
> -		decompress_work(ctx,
> -			ctx->enabled_steps & (1 << STEP_VERITY));
> +	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
> +		f2fs_decompress_work(ctx);
> +
> +	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
> +		INIT_WORK(&ctx->work, f2fs_verity_work);
> +		fsverity_enqueue_verify_work(&ctx->work);
>  		return;
>  	}
> 
> -	if (ctx->enabled_steps & (1 << STEP_VERITY))
> -		verity_work(ctx);
> -
> -	__read_end_io(ctx->bio, false, false);
> +	__f2fs_read_end_io(ctx->bio,
> +		ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
>  }
> 
>  static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
> @@ -176,12 +213,20 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
>  	 * we shouldn't recurse to the same workqueue.
>  	 */
> 
> -	if (ctx->enabled_steps) {
> +	if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
> +		ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
>  		INIT_WORK(&ctx->work, f2fs_post_read_work);
>  		f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
>  		return;
>  	}
> -	__read_end_io(ctx->bio, false, false);
> +
> +	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
> +		INIT_WORK(&ctx->work, f2fs_verity_work);
> +		fsverity_enqueue_verify_work(&ctx->work);
> +		return;
> +	}
> +
> +	__f2fs_read_end_io(ctx->bio, false, false);
>  }
> 
>  static bool f2fs_bio_post_read_required(struct bio *bio)
> @@ -205,7 +250,7 @@ static void f2fs_read_end_io(struct bio *bio)
>  		return;
>  	}
> 
> -	__read_end_io(bio, false, false);
> +	__f2fs_read_end_io(bio, false, false);
>  }
> 
>  static void f2fs_write_end_io(struct bio *bio)
> @@ -864,6 +909,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
>  	return bio;
>  }
> 
> +static void f2fs_release_read_bio(struct bio *bio)
> +{
> +	if (bio->bi_private)
> +		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
> +	bio_put(bio);
> +}
> +
>  /* This can handle encryption stuffs */
>  static int f2fs_submit_page_read(struct inode *inode, struct page *page,
>  							block_t blkaddr)
> @@ -2023,7 +2075,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>  				dic->failed = true;
>  				if (refcount_sub_and_test(dic->nr_cpages - i,
>  							&dic->ref))
> -					f2fs_set_cluster_uptodate(dic->rpages,
> +					f2fs_decompress_end_io(dic->rpages,
>  							cc->cluster_size, true,
>  							false);
>  				f2fs_free_dic(dic);
> @@ -2053,7 +2105,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>  out_put_dnode:
>  	f2fs_put_dnode(&dn);
>  out:
> -	f2fs_set_cluster_uptodate(cc->rpages, cc->cluster_size, true, false);
> +	f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
>  	f2fs_destroy_compress_ctx(cc);
>  	*bio_ret = bio;
>  	return ret;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 8a3a35b42a37..20067fa3b035 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -3795,7 +3795,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>  				bool is_readahead);
>  struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
>  void f2fs_free_dic(struct decompress_io_ctx *dic);
> -void f2fs_set_cluster_uptodate(struct page **rpages,
> +void f2fs_decompress_end_io(struct page **rpages,
>  			unsigned int cluster_size, bool err, bool verity);
>  int f2fs_init_compress_ctx(struct compress_ctx *cc);
>  void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
> -- 
> 2.18.0.rc1
> 
> 
> 
> On 2019/10/31 1:02, Eric Biggers wrote:
> > On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
> >>>>>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
> >>>>>>  {
> >>>>>> -	/*
> >>>>>> -	 * We use different work queues for decryption and for verity because
> >>>>>> -	 * verity may require reading metadata pages that need decryption, and
> >>>>>> -	 * we shouldn't recurse to the same workqueue.
> >>>>>> -	 */
> >>>>>
> >>>>> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
> >>>>> decryption and for verity?  See the comment above which is being deleted.
> >>>>
> >>>> Could you explain more about how deadlock happen? or share me a link address if
> >>>> you have described that case somewhere?
> >>>>
> >>>
> >>> The verity work can read pages from the file which require decryption.  I'm
> >>> concerned that it could deadlock if the work is scheduled on the same workqueue.
> >>
> >> I assume you've tried one workqueue, and suffered deadlock..
> >>
> >>> Granted, I'm not an expert in Linux workqueues, so if you've investigated this
> >>> and determined that it's safe, can you explain why?
> >>
> >> I'm not familiar with workqueue...  I guess it may not safe that if the work is
> >> scheduled to the same cpu in where verity was waiting for data? if the work is
> >> scheduled to other cpu, it may be safe.
> >>
> >> I can check that before splitting the workqueue for verity and decrypt/decompress.
> >>
> > 
> > Yes this is a real problem, try 'kvm-xfstests -c f2fs/encrypt generic/579'.
> > The worker thread gets deadlocked in f2fs_read_merkle_tree_page() waiting for
> > the Merkle tree page to be decrypted.  This is with the v2 compression patch;
> > it works fine on current mainline.
> > 
> > INFO: task kworker/u5:0:61 blocked for more than 30 seconds.
> >       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > kworker/u5:0    D    0    61      2 0x80004000
> > Workqueue: f2fs_post_read_wq f2fs_post_read_work
> > Call Trace:
> >  context_switch kernel/sched/core.c:3384 [inline]
> >  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
> >  schedule+0x44/0xd0 kernel/sched/core.c:4136
> >  io_schedule+0x11/0x40 kernel/sched/core.c:5780
> >  wait_on_page_bit_common mm/filemap.c:1174 [inline]
> >  wait_on_page_bit mm/filemap.c:1223 [inline]
> >  wait_on_page_locked include/linux/pagemap.h:527 [inline]
> >  wait_on_page_locked include/linux/pagemap.h:524 [inline]
> >  wait_on_page_read mm/filemap.c:2767 [inline]
> >  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
> >  read_cache_page+0xd/0x10 mm/filemap.c:2894
> >  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
> >  verify_page+0x110/0x560 fs/verity/verify.c:120
> >  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
> >  verity_work fs/f2fs/data.c:142 [inline]
> >  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
> >  process_one_work+0x225/0x550 kernel/workqueue.c:2269
> >  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
> >  kthread+0x125/0x140 kernel/kthread.c:255
> >  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> > INFO: task kworker/u5:1:1140 blocked for more than 30 seconds.
> >       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > kworker/u5:1    D    0  1140      2 0x80004000
> > Workqueue: f2fs_post_read_wq f2fs_post_read_work
> > Call Trace:
> >  context_switch kernel/sched/core.c:3384 [inline]
> >  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
> >  schedule+0x44/0xd0 kernel/sched/core.c:4136
> >  io_schedule+0x11/0x40 kernel/sched/core.c:5780
> >  wait_on_page_bit_common mm/filemap.c:1174 [inline]
> >  wait_on_page_bit mm/filemap.c:1223 [inline]
> >  wait_on_page_locked include/linux/pagemap.h:527 [inline]
> >  wait_on_page_locked include/linux/pagemap.h:524 [inline]
> >  wait_on_page_read mm/filemap.c:2767 [inline]
> >  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
> >  read_cache_page+0xd/0x10 mm/filemap.c:2894
> >  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
> >  verify_page+0x110/0x560 fs/verity/verify.c:120
> >  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
> >  verity_work fs/f2fs/data.c:142 [inline]
> >  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
> >  process_one_work+0x225/0x550 kernel/workqueue.c:2269
> >  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
> >  kthread+0x125/0x140 kernel/kthread.c:255
> >  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> > 
> > Showing all locks held in the system:
> > 1 lock held by khungtaskd/21:
> >  #0: ffffffff82250520 (rcu_read_lock){....}, at: rcu_lock_acquire.constprop.0+0x0/0x30 include/trace/events/lock.h:13
> > 2 locks held by kworker/u5:0/61:
> >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> >  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> >  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> >  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> > 2 locks held by kworker/u5:1/1140:
> >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> >  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> >  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> >  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> > .
> > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-11-18 16:11               ` Jaegeuk Kim
@ 2019-11-18 20:58                 ` Jaegeuk Kim
  2019-11-25 17:42                   ` Jaegeuk Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Jaegeuk Kim @ 2019-11-18 20:58 UTC (permalink / raw)
  To: Chao Yu; +Cc: Eric Biggers, linux-kernel, linux-f2fs-devel

On 11/18, Jaegeuk Kim wrote:
> On 11/13, Chao Yu wrote:
> > Hi Jaegeuk,
> > 
> > I've split workqueue for fsverity, please test compression based on last patch.
> > 
> > I shutdown F2FS_FS_COMPRESSION config, it looks all verity testcases can pass, will
> > do more test for compress/encrypt/fsverity combination later.
> 
> Thanks, I applied and start some tests.

I modified the code below to fix a wrong compression check in the read path.

--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1007,6 +1007,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
        if (!dic)
                return ERR_PTR(-ENOMEM);
 
+       dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
        dic->inode = cc->inode;
        refcount_set(&dic->ref, 1);
        dic->cluster_idx = cc->cluster_idx;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 02a2e7261b457..399ba883632a0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1255,6 +1255,7 @@ struct compress_io_ctx {
 
 /* decompress io context for read IO path */
 struct decompress_io_ctx {
+       u32 magic;                      /* magic number to indicate page is compressed */
        struct inode *inode;            /* inode the context belong to */
        unsigned int cluster_idx;       /* cluster index number */
        unsigned int cluster_size;      /* page count in cluster */

> 
> > 
> > The diff is as below, code base is last g-dev-test branch:
> > 
> > >From 5b51682bc3013b8de6dee4906865181c3ded435f Mon Sep 17 00:00:00 2001
> > From: Chao Yu <yuchao0@huawei.com>
> > Date: Tue, 12 Nov 2019 10:03:21 +0800
> > Subject: [PATCH INCREMENT] f2fs: compress: split workqueue for fsverity
> > 
> > Signed-off-by: Chao Yu <yuchao0@huawei.com>
> > ---
> >  fs/f2fs/compress.c | 16 +++++---
> >  fs/f2fs/data.c     | 94 +++++++++++++++++++++++++++++++++++-----------
> >  fs/f2fs/f2fs.h     |  2 +-
> >  3 files changed, 84 insertions(+), 28 deletions(-)
> > 
> > diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
> > index f4ce825f12b4..254275325890 100644
> > --- a/fs/f2fs/compress.c
> > +++ b/fs/f2fs/compress.c
> > @@ -377,7 +377,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
> > 
> >  	dec_page_count(sbi, F2FS_RD_DATA);
> > 
> > -	if (bio->bi_status)
> > +	if (bio->bi_status || PageError(page))
> >  		dic->failed = true;
> > 
> >  	if (refcount_dec_not_one(&dic->ref))
> > @@ -419,10 +419,14 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
> >  out_vunmap_rbuf:
> >  	vunmap(dic->rbuf);
> >  out_free_dic:
> > -	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
> > +	if (!verity)
> > +		f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
> > +								ret, false);
> > +
> >  	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
> > -								dic->clen, ret);
> > -	f2fs_free_dic(dic);
> > +							dic->clen, ret);
> > +	if (!verity)
> > +		f2fs_free_dic(dic);
> >  }
> > 
> >  static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
> > @@ -1086,7 +1090,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
> >  	kfree(dic);
> >  }
> > 
> > -void f2fs_set_cluster_uptodate(struct page **rpages,
> > +void f2fs_decompress_end_io(struct page **rpages,
> >  			unsigned int cluster_size, bool err, bool verity)
> >  {
> >  	int i;
> > @@ -1108,4 +1112,4 @@ void f2fs_set_cluster_uptodate(struct page **rpages,
> >  		}
> >  		unlock_page(rpage);
> >  	}
> > -}
> > +}
> > \ No newline at end of file
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index c9362a53f8a1..2d64c6ffee84 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -98,7 +98,7 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
> >  		page = bv->bv_page;
> > 
> >  #ifdef CONFIG_F2FS_FS_COMPRESSION
> > -		if (compr && PagePrivate(page)) {
> > +		if (compr && f2fs_is_compressed_page(page)) {
> >  			f2fs_decompress_pages(bio, page, verity);
> >  			continue;
> >  		}
> > @@ -115,9 +115,14 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
> >  		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
> >  		unlock_page(page);
> >  	}
> > -	if (bio->bi_private)
> > -		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
> > -	bio_put(bio);
> > +}
> > +
> > +static void f2fs_release_read_bio(struct bio *bio);
> > +static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
> > +{
> > +	if (!compr)
> > +		__read_end_io(bio, false, verity);
> > +	f2fs_release_read_bio(bio);
> >  }
> > 
> >  static void f2fs_decompress_bio(struct bio *bio, bool verity)
> > @@ -127,19 +132,50 @@ static void f2fs_decompress_bio(struct bio *bio, bool verity)
> > 
> >  static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
> > 
> > -static void decrypt_work(struct bio_post_read_ctx *ctx)
> > +static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
> >  {
> >  	fscrypt_decrypt_bio(ctx->bio);
> >  }
> > 
> > -static void decompress_work(struct bio_post_read_ctx *ctx, bool verity)
> > +static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
> > +{
> > +	f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
> > +}
> > +
> > +#ifdef CONFIG_F2FS_FS_COMPRESSION
> > +void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
> >  {
> > -	f2fs_decompress_bio(ctx->bio, verity);
> > +	f2fs_decompress_end_io(rpages, cluster_size, false, true);
> >  }
> > 
> > -static void verity_work(struct bio_post_read_ctx *ctx)
> > +static void f2fs_verify_bio(struct bio *bio)
> >  {
> > +	struct page *page = bio_first_page_all(bio);
> > +	struct decompress_io_ctx *dic =
> > +			(struct decompress_io_ctx *)page_private(page);
> > +
> > +	f2fs_verify_pages(dic->rpages, dic->cluster_size);
> > +	f2fs_free_dic(dic);
> > +}
> > +#endif
> > +
> > +static void f2fs_verity_work(struct work_struct *work)
> > +{
> > +	struct bio_post_read_ctx *ctx =
> > +		container_of(work, struct bio_post_read_ctx, work);
> > +
> > +#ifdef CONFIG_F2FS_FS_COMPRESSION
> > +	/* previous step is decompression */
> > +	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
> > +
> > +		f2fs_verify_bio(ctx->bio);
> > +		f2fs_release_read_bio(ctx->bio);
> > +		return;
> > +	}
> > +#endif
> > +
> >  	fsverity_verify_bio(ctx->bio);
> > +	__f2fs_read_end_io(ctx->bio, false, false);
> >  }
> > 
> >  static void f2fs_post_read_work(struct work_struct *work)
> > @@ -148,18 +184,19 @@ static void f2fs_post_read_work(struct work_struct *work)
> >  		container_of(work, struct bio_post_read_ctx, work);
> > 
> >  	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
> > -		decrypt_work(ctx);
> > +		f2fs_decrypt_work(ctx);
> > 
> > -	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
> > -		decompress_work(ctx,
> > -			ctx->enabled_steps & (1 << STEP_VERITY));
> > +	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
> > +		f2fs_decompress_work(ctx);
> > +
> > +	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
> > +		INIT_WORK(&ctx->work, f2fs_verity_work);
> > +		fsverity_enqueue_verify_work(&ctx->work);
> >  		return;
> >  	}
> > 
> > -	if (ctx->enabled_steps & (1 << STEP_VERITY))
> > -		verity_work(ctx);
> > -
> > -	__read_end_io(ctx->bio, false, false);
> > +	__f2fs_read_end_io(ctx->bio,
> > +		ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
> >  }
> > 
> >  static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
> > @@ -176,12 +213,20 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
> >  	 * we shouldn't recurse to the same workqueue.
> >  	 */
> > 
> > -	if (ctx->enabled_steps) {
> > +	if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
> > +		ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
> >  		INIT_WORK(&ctx->work, f2fs_post_read_work);
> >  		f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
> >  		return;
> >  	}
> > -	__read_end_io(ctx->bio, false, false);
> > +
> > +	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
> > +		INIT_WORK(&ctx->work, f2fs_verity_work);
> > +		fsverity_enqueue_verify_work(&ctx->work);
> > +		return;
> > +	}
> > +
> > +	__f2fs_read_end_io(ctx->bio, false, false);
> >  }
> > 
> >  static bool f2fs_bio_post_read_required(struct bio *bio)
> > @@ -205,7 +250,7 @@ static void f2fs_read_end_io(struct bio *bio)
> >  		return;
> >  	}
> > 
> > -	__read_end_io(bio, false, false);
> > +	__f2fs_read_end_io(bio, false, false);
> >  }
> > 
> >  static void f2fs_write_end_io(struct bio *bio)
> > @@ -864,6 +909,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
> >  	return bio;
> >  }
> > 
> > +static void f2fs_release_read_bio(struct bio *bio)
> > +{
> > +	if (bio->bi_private)
> > +		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
> > +	bio_put(bio);
> > +}
> > +
> >  /* This can handle encryption stuffs */
> >  static int f2fs_submit_page_read(struct inode *inode, struct page *page,
> >  							block_t blkaddr)
> > @@ -2023,7 +2075,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> >  				dic->failed = true;
> >  				if (refcount_sub_and_test(dic->nr_cpages - i,
> >  							&dic->ref))
> > -					f2fs_set_cluster_uptodate(dic->rpages,
> > +					f2fs_decompress_end_io(dic->rpages,
> >  							cc->cluster_size, true,
> >  							false);
> >  				f2fs_free_dic(dic);
> > @@ -2053,7 +2105,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> >  out_put_dnode:
> >  	f2fs_put_dnode(&dn);
> >  out:
> > -	f2fs_set_cluster_uptodate(cc->rpages, cc->cluster_size, true, false);
> > +	f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
> >  	f2fs_destroy_compress_ctx(cc);
> >  	*bio_ret = bio;
> >  	return ret;
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 8a3a35b42a37..20067fa3b035 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -3795,7 +3795,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> >  				bool is_readahead);
> >  struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
> >  void f2fs_free_dic(struct decompress_io_ctx *dic);
> > -void f2fs_set_cluster_uptodate(struct page **rpages,
> > +void f2fs_decompress_end_io(struct page **rpages,
> >  			unsigned int cluster_size, bool err, bool verity);
> >  int f2fs_init_compress_ctx(struct compress_ctx *cc);
> >  void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
> > -- 
> > 2.18.0.rc1
> > 
> > 
> > 
> > On 2019/10/31 1:02, Eric Biggers wrote:
> > > On Wed, Oct 30, 2019 at 04:43:52PM +0800, Chao Yu wrote:
> > >>>>>>  static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
> > >>>>>>  {
> > >>>>>> -	/*
> > >>>>>> -	 * We use different work queues for decryption and for verity because
> > >>>>>> -	 * verity may require reading metadata pages that need decryption, and
> > >>>>>> -	 * we shouldn't recurse to the same workqueue.
> > >>>>>> -	 */
> > >>>>>
> > >>>>> Why is it okay (i.e., no deadlocks) to no longer use different work queues for
> > >>>>> decryption and for verity?  See the comment above which is being deleted.
> > >>>>
> > >>>> Could you explain more about how deadlock happen? or share me a link address if
> > >>>> you have described that case somewhere?
> > >>>>
> > >>>
> > >>> The verity work can read pages from the file which require decryption.  I'm
> > >>> concerned that it could deadlock if the work is scheduled on the same workqueue.
> > >>
> > >> I assume you've tried one workqueue, and suffered deadlock..
> > >>
> > >>> Granted, I'm not an expert in Linux workqueues, so if you've investigated this
> > >>> and determined that it's safe, can you explain why?
> > >>
> > >> I'm not familiar with workqueue...  I guess it may not safe that if the work is
> > >> scheduled to the same cpu in where verity was waiting for data? if the work is
> > >> scheduled to other cpu, it may be safe.
> > >>
> > >> I can check that before splitting the workqueue for verity and decrypt/decompress.
> > >>
> > > 
> > > Yes this is a real problem, try 'kvm-xfstests -c f2fs/encrypt generic/579'.
> > > The worker thread gets deadlocked in f2fs_read_merkle_tree_page() waiting for
> > > the Merkle tree page to be decrypted.  This is with the v2 compression patch;
> > > it works fine on current mainline.
> > > 
> > > INFO: task kworker/u5:0:61 blocked for more than 30 seconds.
> > >       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> > > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > kworker/u5:0    D    0    61      2 0x80004000
> > > Workqueue: f2fs_post_read_wq f2fs_post_read_work
> > > Call Trace:
> > >  context_switch kernel/sched/core.c:3384 [inline]
> > >  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
> > >  schedule+0x44/0xd0 kernel/sched/core.c:4136
> > >  io_schedule+0x11/0x40 kernel/sched/core.c:5780
> > >  wait_on_page_bit_common mm/filemap.c:1174 [inline]
> > >  wait_on_page_bit mm/filemap.c:1223 [inline]
> > >  wait_on_page_locked include/linux/pagemap.h:527 [inline]
> > >  wait_on_page_locked include/linux/pagemap.h:524 [inline]
> > >  wait_on_page_read mm/filemap.c:2767 [inline]
> > >  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
> > >  read_cache_page+0xd/0x10 mm/filemap.c:2894
> > >  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
> > >  verify_page+0x110/0x560 fs/verity/verify.c:120
> > >  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
> > >  verity_work fs/f2fs/data.c:142 [inline]
> > >  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
> > >  process_one_work+0x225/0x550 kernel/workqueue.c:2269
> > >  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
> > >  kthread+0x125/0x140 kernel/kthread.c:255
> > >  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> > > INFO: task kworker/u5:1:1140 blocked for more than 30 seconds.
> > >       Not tainted 5.4.0-rc1-00119-g464e31ba60d0 #13
> > > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > kworker/u5:1    D    0  1140      2 0x80004000
> > > Workqueue: f2fs_post_read_wq f2fs_post_read_work
> > > Call Trace:
> > >  context_switch kernel/sched/core.c:3384 [inline]
> > >  __schedule+0x299/0x6c0 kernel/sched/core.c:4069
> > >  schedule+0x44/0xd0 kernel/sched/core.c:4136
> > >  io_schedule+0x11/0x40 kernel/sched/core.c:5780
> > >  wait_on_page_bit_common mm/filemap.c:1174 [inline]
> > >  wait_on_page_bit mm/filemap.c:1223 [inline]
> > >  wait_on_page_locked include/linux/pagemap.h:527 [inline]
> > >  wait_on_page_locked include/linux/pagemap.h:524 [inline]
> > >  wait_on_page_read mm/filemap.c:2767 [inline]
> > >  do_read_cache_page+0x407/0x660 mm/filemap.c:2810
> > >  read_cache_page+0xd/0x10 mm/filemap.c:2894
> > >  f2fs_read_merkle_tree_page+0x2e/0x30 include/linux/pagemap.h:396
> > >  verify_page+0x110/0x560 fs/verity/verify.c:120
> > >  fsverity_verify_bio+0xe6/0x1a0 fs/verity/verify.c:239
> > >  verity_work fs/f2fs/data.c:142 [inline]
> > >  f2fs_post_read_work+0x36/0x50 fs/f2fs/data.c:160
> > >  process_one_work+0x225/0x550 kernel/workqueue.c:2269
> > >  worker_thread+0x4b/0x3c0 kernel/workqueue.c:2415
> > >  kthread+0x125/0x140 kernel/kthread.c:255
> > >  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> > > 
> > > Showing all locks held in the system:
> > > 1 lock held by khungtaskd/21:
> > >  #0: ffffffff82250520 (rcu_read_lock){....}, at: rcu_lock_acquire.constprop.0+0x0/0x30 include/trace/events/lock.h:13
> > > 2 locks held by kworker/u5:0/61:
> > >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> > >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> > >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> > >  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> > >  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> > >  #1: ffffc90000253e50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> > > 2 locks held by kworker/u5:1/1140:
> > >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> > >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> > >  #0: ffff88807b78eb28 ((wq_completion)f2fs_post_read_wq){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> > >  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_data kernel/workqueue.c:619 [inline]
> > >  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline]
> > >  #1: ffffc9000174be50 ((work_completion)(&ctx->work)){+.+.}, at: process_one_work+0x1ad/0x550 kernel/workqueue.c:2240
> > > .
> > > 


_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support data compression
  2019-11-18 20:58                 ` Jaegeuk Kim
@ 2019-11-25 17:42                   ` Jaegeuk Kim
  0 siblings, 0 replies; 32+ messages in thread
From: Jaegeuk Kim @ 2019-11-25 17:42 UTC (permalink / raw)
  To: Chao Yu; +Cc: Eric Biggers, linux-kernel, linux-f2fs-devel

Here is the fix, incorporating my additional changes:

---
 fs/f2fs/compress.c | 114 ++++++++++++++++++--------------
 fs/f2fs/data.c     | 158 ++++++++++++++++++++++++++++++---------------
 fs/f2fs/f2fs.h     |  29 +++++++--
 fs/f2fs/file.c     |  25 +++----
 fs/f2fs/inode.c    |   7 +-
 fs/f2fs/namei.c    |   7 +-
 6 files changed, 208 insertions(+), 132 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index e9f633c30942..7ebd2bc018bd 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
 #include <linux/lzo.h>
 #include <linux/lz4.h>
 
@@ -86,15 +87,13 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
 
 	cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
 					cc->log_cluster_size, GFP_NOFS);
-	if (!cc->rpages)
-		return -ENOMEM;
-	return 0;
+	return cc->rpages ? 0 : -ENOMEM;
 }
 
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
 {
-	f2fs_reset_compress_ctx(cc);
 	kfree(cc->rpages);
+	f2fs_reset_compress_ctx(cc);
 }
 
 void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
@@ -378,7 +377,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 
 	dec_page_count(sbi, F2FS_RD_DATA);
 
-	if (bio->bi_status)
+	if (bio->bi_status || PageError(page))
 		dic->failed = true;
 
 	if (refcount_dec_not_one(&dic->ref))
@@ -420,10 +419,14 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 out_vunmap_rbuf:
 	vunmap(dic->rbuf);
 out_free_dic:
-	f2fs_set_cluster_uptodate(dic->rpages, dic->cluster_size, ret, verity);
+	if (!verity)
+		f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
+								ret, false);
+
 	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
-								dic->clen, ret);
-	f2fs_free_dic(dic);
+							dic->clen, ret);
+	if (!verity)
+		f2fs_free_dic(dic);
 }
 
 static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -470,22 +473,18 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
 		/* beyond EOF */
 		if (page->index >= nr_pages)
 			return false;
-		if (page->index != start_idx_of_cluster(cc) + i)
-			return false;
 	}
 	return true;
 }
 
-int is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
+static int is_compressed_cluster(struct compress_ctx *cc)
 {
 	struct dnode_of_data dn;
-	unsigned int start_idx = cluster_idx(cc, index) <<
-					cc->log_cluster_size;
 	int ret;
-	int i;
 
 	set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
-	ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+	ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc),
+							LOOKUP_NODE);
 	if (ret) {
 		if (ret == -ENOENT)
 			ret = 0;
@@ -493,6 +492,8 @@ int is_compressed_cluster(struct compress_ctx *cc, pgoff_t index)
 	}
 
 	if (dn.data_blkaddr == COMPRESS_ADDR) {
+		int i;
+
 		ret = CLUSTER_IS_FULL;
 		for (i = 1; i < cc->cluster_size; i++) {
 			block_t blkaddr;
@@ -516,9 +517,10 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
 		.inode = inode,
 		.log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
 		.cluster_size = F2FS_I(inode)->i_cluster_size,
+		.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
 	};
 
-	return is_compressed_cluster(&cc, index);
+	return is_compressed_cluster(&cc);
 }
 
 static bool cluster_may_compress(struct compress_ctx *cc)
@@ -536,6 +538,7 @@ static bool cluster_may_compress(struct compress_ctx *cc)
 
 void f2fs_reset_compress_ctx(struct compress_ctx *cc)
 {
+	cc->rpages = NULL;
 	cc->nr_rpages = 0;
 	cc->nr_cpages = 0;
 	cc->cluster_idx = NULL_CLUSTER;
@@ -565,19 +568,18 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
 		bool prealloc)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
-	struct bio *bio = NULL;
 	struct address_space *mapping = cc->inode->i_mapping;
 	struct page *page;
 	struct dnode_of_data dn;
 	sector_t last_block_in_bio;
 	unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
-	unsigned int start_idx = cluster_idx(cc, index) << cc->log_cluster_size;
+	unsigned int start_idx = start_idx_of_cluster(cc);
 	int i, idx;
 	int ret;
 
 	ret = f2fs_init_compress_ctx(cc);
 	if (ret)
-		goto out;
+		return ret;
 retry:
 	/* keep page reference to avoid page reclaim */
 	for (i = 0; i < cc->cluster_size; i++) {
@@ -588,26 +590,25 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
 			goto unlock_pages;
 		}
 
-		if (PageUptodate(page)) {
+		if (PageUptodate(page))
 			unlock_page(page);
-			continue;
-		}
-
-		f2fs_compress_ctx_add_page(cc, page);
+		else
+			f2fs_compress_ctx_add_page(cc, page);
 	}
 
 	if (!f2fs_cluster_is_empty(cc)) {
+		struct bio *bio = NULL;
+
 		ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
 						&last_block_in_bio, false);
 		if (ret)
-			goto out;
-
+			return ret;
 		if (bio)
 			f2fs_submit_bio(sbi, bio, DATA);
 
 		ret = f2fs_init_compress_ctx(cc);
 		if (ret)
-			goto out;
+			return ret;
 	}
 
 	for (i = 0; i < cc->cluster_size; i++) {
@@ -620,10 +621,12 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
 		f2fs_put_page(page, 0);
 
 		if (!PageUptodate(page)) {
-			for (idx = i; idx >= 0; idx--) {
-				f2fs_put_page(cc->rpages[idx], 0);
-				f2fs_put_page(cc->rpages[idx], 1);
+			for (idx = 0; idx < cc->cluster_size; idx++) {
+				f2fs_put_page(cc->rpages[idx],
+						(idx <= i) ? 1 : 0);
+				cc->rpages[idx] = NULL;
 			}
+			cc->nr_rpages = 0;
 			goto retry;
 		}
 	}
@@ -658,11 +661,10 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
 release_pages:
 	for (idx = 0; idx < i; idx++) {
 		page = find_lock_page(mapping, start_idx + idx);
-		f2fs_put_page(page, 0);
 		f2fs_put_page(page, 1);
+		f2fs_put_page(page, 0);
 	}
 	f2fs_destroy_compress_ctx(cc);
-out:
 	return ret;
 }
 
@@ -671,12 +673,13 @@ int f2fs_prepare_compress_overwrite(struct inode *inode,
 {
 	struct compress_ctx cc = {
 		.inode = inode,
+		.log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
 		.cluster_size = F2FS_I(inode)->i_cluster_size,
-		.cluster_idx = NULL_CLUSTER,
+		.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
 		.rpages = NULL,
 		.nr_rpages = 0,
 	};
-	int ret = is_compressed_cluster(&cc, index);
+	int ret = is_compressed_cluster(&cc);
 
 	if (ret <= 0)
 		return ret;
@@ -687,7 +690,7 @@ int f2fs_prepare_compress_overwrite(struct inode *inode,
 }
 
 bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
-					pgoff_t index, bool written)
+					pgoff_t index, unsigned copied)
 
 {
 	struct compress_ctx cc = {
@@ -698,7 +701,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
 	bool first_index = (index == cc.rpages[0]->index);
 	int i;
 
-	if (written)
+	if (copied)
 		set_cluster_dirty(&cc);
 
 	for (i = 0; i < cc.cluster_size; i++)
@@ -707,7 +710,6 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
 	f2fs_destroy_compress_ctx(&cc);
 
 	return first_index;
-
 }
 
 static int f2fs_write_compressed_pages(struct compress_ctx *cc,
@@ -857,6 +859,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 			fi->last_disk_size = psize;
 		up_write(&fi->i_sem);
 	}
+	f2fs_reset_compress_ctx(cc);
 	return 0;
 
 out_destroy_crypt:
@@ -904,7 +907,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 static int f2fs_write_raw_pages(struct compress_ctx *cc,
 					int *submitted,
 					struct writeback_control *wbc,
-					enum iostat_type io_type)
+					enum iostat_type io_type,
+					bool compressed)
 {
 	int i, _submitted;
 	int ret, err = 0;
@@ -912,12 +916,24 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 	for (i = 0; i < cc->cluster_size; i++) {
 		if (!cc->rpages[i])
 			continue;
+retry_write:
 		BUG_ON(!PageLocked(cc->rpages[i]));
+
 		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
-						NULL, NULL, wbc, io_type);
+						NULL, NULL, wbc, io_type,
+						compressed);
 		if (ret) {
-			if (ret == AOP_WRITEPAGE_ACTIVATE)
+			if (ret == AOP_WRITEPAGE_ACTIVATE) {
 				unlock_page(cc->rpages[i]);
+				ret = 0;
+			} else if (ret == -EAGAIN) {
+				ret = 0;
+				cond_resched();
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				lock_page(cc->rpages[i]);
+				clear_page_dirty_for_io(cc->rpages[i]);
+				goto retry_write;
+			}
 			err = ret;
 			goto out_fail;
 		}
@@ -928,6 +944,8 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 
 out_fail:
 	/* TODO: revoke partially updated block addresses */
+	BUG_ON(compressed);
+
 	for (++i; i < cc->cluster_size; i++) {
 		if (!cc->rpages[i])
 			continue;
@@ -948,7 +966,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
 	int err = -EAGAIN;
 
 	*submitted = 0;
-
 	if (cluster_may_compress(cc)) {
 		err = f2fs_compress_pages(cc);
 		if (err) {
@@ -964,18 +981,19 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
 		bool compressed = false;
 
 		f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted);
-		if (is_compressed_cluster(cc, start_idx_of_cluster(cc)))
+
+		if (is_compressed_cluster(cc))
 			compressed = true;
 
-		err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
+		err = f2fs_write_raw_pages(cc, submitted, wbc,
+						io_type, compressed);
 		if (compressed) {
 			stat_sub_compr_blocks(cc->inode, *submitted);
 			F2FS_I(cc->inode)->i_compressed_blocks -= *submitted;
 			f2fs_mark_inode_dirty_sync(cc->inode, true);
 		}
+		f2fs_destroy_compress_ctx(cc);
 	}
-
-	f2fs_reset_compress_ctx(cc);
 	return err;
 }
 
@@ -988,8 +1006,9 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
 
 	dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS);
 	if (!dic)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
+	dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
 	dic->inode = cc->inode;
 	refcount_set(&dic->ref, 1);
 	dic->cluster_idx = cc->cluster_idx;
@@ -1042,7 +1061,6 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
 
 out_free:
 	f2fs_free_dic(dic);
-out:
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -1073,7 +1091,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
 	kfree(dic);
 }
 
-void f2fs_set_cluster_uptodate(struct page **rpages,
+void f2fs_decompress_end_io(struct page **rpages,
 			unsigned int cluster_size, bool err, bool verity)
 {
 	int i;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c10cbd7d1c06..fcdd6d493f83 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -98,7 +98,7 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
 		page = bv->bv_page;
 
 #ifdef CONFIG_F2FS_FS_COMPRESSION
-		if (compr && PagePrivate(page)) {
+		if (compr && f2fs_is_compressed_page(page)) {
 			f2fs_decompress_pages(bio, page, verity);
 			continue;
 		}
@@ -115,9 +115,14 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
 		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
 		unlock_page(page);
 	}
-	if (bio->bi_private)
-		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
-	bio_put(bio);
+}
+
+static void f2fs_release_read_bio(struct bio *bio);
+static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
+{
+	if (!compr)
+		__read_end_io(bio, false, verity);
+	f2fs_release_read_bio(bio);
 }
 
 static void f2fs_decompress_bio(struct bio *bio, bool verity)
@@ -127,19 +132,45 @@ static void f2fs_decompress_bio(struct bio *bio, bool verity)
 
 static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
 
-static void decrypt_work(struct bio_post_read_ctx *ctx)
+static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
 {
 	fscrypt_decrypt_bio(ctx->bio);
 }
 
-static void decompress_work(struct bio_post_read_ctx *ctx, bool verity)
+static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
 {
-	f2fs_decompress_bio(ctx->bio, verity);
+	f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
 }
 
-static void verity_work(struct bio_post_read_ctx *ctx)
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+static void f2fs_verify_bio(struct bio *bio)
 {
+	struct page *page = bio_first_page_all(bio);
+	struct decompress_io_ctx *dic =
+			(struct decompress_io_ctx *)page_private(page);
+
+	f2fs_decompress_end_io(dic->rpages, dic->cluster_size, false, true);
+	f2fs_free_dic(dic);
+}
+#endif
+
+static void f2fs_verity_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+	/* previous step is decompression */
+	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
+
+		f2fs_verify_bio(ctx->bio);
+		f2fs_release_read_bio(ctx->bio);
+		return;
+	}
+#endif
+
 	fsverity_verify_bio(ctx->bio);
+	__f2fs_read_end_io(ctx->bio, false, false);
 }
 
 static void f2fs_post_read_work(struct work_struct *work)
@@ -148,18 +179,19 @@ static void f2fs_post_read_work(struct work_struct *work)
 		container_of(work, struct bio_post_read_ctx, work);
 
 	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
-		decrypt_work(ctx);
+		f2fs_decrypt_work(ctx);
 
-	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
-		decompress_work(ctx,
-			ctx->enabled_steps & (1 << STEP_VERITY));
+	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
+		f2fs_decompress_work(ctx);
+
+	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+		INIT_WORK(&ctx->work, f2fs_verity_work);
+		fsverity_enqueue_verify_work(&ctx->work);
 		return;
 	}
 
-	if (ctx->enabled_steps & (1 << STEP_VERITY))
-		verity_work(ctx);
-
-	__read_end_io(ctx->bio, false, false);
+	__f2fs_read_end_io(ctx->bio,
+		ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
 }
 
 static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
@@ -176,12 +208,20 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
 	 * we shouldn't recurse to the same workqueue.
 	 */
 
-	if (ctx->enabled_steps) {
+	if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
+		ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
 		INIT_WORK(&ctx->work, f2fs_post_read_work);
 		f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
 		return;
 	}
-	__read_end_io(ctx->bio, false, false);
+
+	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+		INIT_WORK(&ctx->work, f2fs_verity_work);
+		fsverity_enqueue_verify_work(&ctx->work);
+		return;
+	}
+
+	__f2fs_read_end_io(ctx->bio, false, false);
 }
 
 static bool f2fs_bio_post_read_required(struct bio *bio)
@@ -205,7 +245,7 @@ static void f2fs_read_end_io(struct bio *bio)
 		return;
 	}
 
-	__read_end_io(bio, false, false);
+	__f2fs_read_end_io(bio, false, false);
 }
 
 static void f2fs_write_end_io(struct bio *bio)
@@ -624,7 +664,8 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
 
 			found = true;
 
-			if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
+			if (bio_add_page(*bio, page, PAGE_SIZE, 0) ==
+							PAGE_SIZE) {
 				ret = 0;
 				break;
 			}
@@ -858,6 +899,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	return bio;
 }
 
+static void f2fs_release_read_bio(struct bio *bio)
+{
+	if (bio->bi_private)
+		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+	bio_put(bio);
+}
+
 /* This can handle encryption stuffs */
 static int f2fs_submit_page_read(struct inode *inode, struct page *page,
 							block_t blkaddr)
@@ -1963,7 +2011,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 	if (ret)
 		goto out;
 
-	f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
+	if (dn.data_blkaddr != COMPRESS_ADDR)
+		goto out;
 
 	for (i = 1; i < cc->cluster_size; i++) {
 		block_t blkaddr;
@@ -2017,7 +2066,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 				dic->failed = true;
 				if (refcount_sub_and_test(dic->nr_cpages - i,
 							&dic->ref))
-					f2fs_set_cluster_uptodate(dic->rpages,
+					f2fs_decompress_end_io(dic->rpages,
 							cc->cluster_size, true,
 							false);
 				f2fs_free_dic(dic);
@@ -2047,8 +2096,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 out_put_dnode:
 	f2fs_put_dnode(&dn);
 out:
-	f2fs_set_cluster_uptodate(cc->rpages, cc->cluster_size, true, false);
-	f2fs_reset_compress_ctx(cc);
+	f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
+	f2fs_destroy_compress_ctx(cc);
 	*bio_ret = bio;
 	return ret;
 }
@@ -2443,7 +2492,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 				struct bio **bio,
 				sector_t *last_block,
 				struct writeback_control *wbc,
-				enum iostat_type io_type)
+				enum iostat_type io_type,
+				bool compressed)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -2488,8 +2538,9 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 
-	if (f2fs_compressed_file(inode) ||
-		page->index < end_index || f2fs_verity_in_progress(inode))
+	if (page->index < end_index ||
+			f2fs_verity_in_progress(inode) ||
+			compressed)
 		goto write;
 
 	/*
@@ -2610,7 +2661,7 @@ static int f2fs_write_data_page(struct page *page,
 #endif
 
 	return f2fs_write_single_data_page(page, NULL, NULL, NULL,
-						wbc, FS_DATA_IO);
+						wbc, FS_DATA_IO, false);
 }
 
 /*
@@ -2696,17 +2747,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			bool need_readd = false;
-
+			bool need_readd;
 readd:
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 			need_readd = false;
 
 			if (f2fs_compressed_file(inode)) {
-				void *fsdata = NULL;
-				struct page *pagep;
-				int ret2;
-
 				ret = f2fs_init_compress_ctx(&cc);
 				if (ret) {
 					done = 1;
@@ -2715,7 +2761,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 
 				if (!f2fs_cluster_can_merge_page(&cc,
 								page->index)) {
-
 					ret = f2fs_write_multi_pages(&cc,
 						&submitted, wbc, io_type);
 					if (!ret)
@@ -2724,6 +2769,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 				}
 
 				if (f2fs_cluster_is_empty(&cc)) {
+					void *fsdata = NULL;
+					struct page *pagep;
+					int ret2;
+
 					ret2 = f2fs_prepare_compress_overwrite(
 							inode, &pagep,
 							page->index, &fsdata);
@@ -2733,24 +2782,27 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 						break;
 					} else if (ret2 &&
 						!f2fs_compress_write_end(inode,
-							fsdata, page->index,
-							true)) {
+								fsdata, page->index,
+								1)) {
 						retry = 1;
 						break;
 					}
+				} else {
+					goto lock_page;
 				}
 			}
 #endif
-
 			/* give a priority to WB_SYNC threads */
 			if (atomic_read(&sbi->wb_sync_req[DATA]) &&
 					wbc->sync_mode == WB_SYNC_NONE) {
 				done = 1;
 				break;
 			}
-
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+lock_page:
+#endif
 			done_index = page->index;
-
+retry_write:
 			lock_page(page);
 
 			if (unlikely(page->mapping != mapping)) {
@@ -2782,7 +2834,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 			}
 #endif
 			ret = f2fs_write_single_data_page(page, &submitted,
-					&bio, &last_block, wbc, io_type);
+					&bio, &last_block, wbc, io_type, false);
 			if (ret == AOP_WRITEPAGE_ACTIVATE)
 				unlock_page(page);
 #ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2801,6 +2853,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 					goto next;
 				} else if (ret == -EAGAIN) {
 					ret = 0;
+					if (wbc->sync_mode == WB_SYNC_ALL) {
+						cond_resched();
+						congestion_wait(BLK_RW_ASYNC,
+								HZ/50);
+						goto retry_write;
+					}
 					goto next;
 				}
 				done_index = page->index + 1;
@@ -2817,21 +2875,21 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 			if (need_readd)
 				goto readd;
 		}
-
 		pagevec_release(&pvec);
 		cond_resched();
 	}
-
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 	/* flush remained pages in compress cluster */
 	if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) {
 		ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type);
 		nwritten += submitted;
 		wbc->nr_to_write -= submitted;
-		/* TODO: error handling */
+		if (ret) {
+			done = 1;
+			retry = 0;
+		}
 	}
 #endif
-
 	if ((!cycled && !done) || retry) {
 		cycled = 1;
 		index = 0;
@@ -3606,14 +3664,8 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	if (ret)
 		return ret;
 
-	if (f2fs_compressed_file(inode)) {
-		if (F2FS_I(inode)->i_compressed_blocks)
-			return -EINVAL;
-
-		F2FS_I(inode)->i_flags &= ~FS_COMPR_FL;
-		clear_inode_flag(inode, FI_COMPRESSED_FILE);
-		stat_dec_compr_inode(inode);
-	}
+	if (f2fs_disable_compressed_file(inode))
+		return -EINVAL;
 
 	ret = check_swap_activate(file, sis->max);
 	if (ret)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 11c42042367b..ee7309ca671a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1253,6 +1253,7 @@ struct compress_io_ctx {
 
 /* decompress io context for read IO path */
 struct decompress_io_ctx {
+	u32 magic;			/* magic number to indicate page is compressed */
 	struct inode *inode;		/* inode the context belong to */
 	unsigned int cluster_idx;	/* cluster index number */
 	unsigned int cluster_size;	/* page count in cluster */
@@ -2737,6 +2738,8 @@ static inline void set_compress_context(struct inode *inode)
 			F2FS_OPTION(sbi).compress_log_size;
 	F2FS_I(inode)->i_cluster_size =
 			1 << F2FS_I(inode)->i_log_cluster_size;
+	F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
+	set_inode_flag(inode, FI_COMPRESSED_FILE);
 }
 
 static inline unsigned int addrs_per_inode(struct inode *inode)
@@ -3390,7 +3393,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio);
 int f2fs_write_single_data_page(struct page *page, int *submitted,
 				struct bio **bio, sector_t *last_block,
 				struct writeback_control *wbc,
-				enum iostat_type io_type);
+				enum iostat_type io_type,
+				bool compressed);
 void f2fs_invalidate_page(struct page *page, unsigned int offset,
 			unsigned int length);
 int f2fs_release_page(struct page *page, gfp_t wait);
@@ -3631,8 +3635,8 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_inline_dir(inode)			do { } while (0)
 #define stat_inc_compr_inode(inode)			do { } while (0)
 #define stat_dec_compr_inode(inode)			do { } while (0)
-#define stat_add_compr_blocks(inode)			do { } while (0)
-#define stat_sub_compr_blocks(inode)			do { } while (0)
+#define stat_add_compr_blocks(inode, blocks)		do { } while (0)
+#define stat_sub_compr_blocks(inode, blocks)		do { } while (0)
 #define stat_inc_atomic_write(inode)			do { } while (0)
 #define stat_dec_atomic_write(inode)			do { } while (0)
 #define stat_update_max_atomic_write(inode)		do { } while (0)
@@ -3786,7 +3790,7 @@ void f2fs_reset_compress_ctx(struct compress_ctx *cc);
 int f2fs_prepare_compress_overwrite(struct inode *inode,
 			struct page **pagep, pgoff_t index, void **fsdata);
 bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
-					pgoff_t index, bool written);
+					pgoff_t index, unsigned copied);
 void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
 bool f2fs_is_compress_backend_ready(struct inode *inode);
 void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
@@ -3803,7 +3807,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 				bool is_readahead);
 struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
 void f2fs_free_dic(struct decompress_io_ctx *dic);
-void f2fs_set_cluster_uptodate(struct page **rpages,
+void f2fs_decompress_end_io(struct page **rpages,
 			unsigned int cluster_size, bool err, bool verity);
 int f2fs_init_compress_ctx(struct compress_ctx *cc);
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
@@ -3824,6 +3828,21 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
 }
 #endif
 
+static inline u64 f2fs_disable_compressed_file(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	if (!f2fs_compressed_file(inode))
+		return 0;
+	if (fi->i_compressed_blocks)
+		return fi->i_compressed_blocks;
+
+	fi->i_flags &= ~F2FS_COMPR_FL;
+	clear_inode_flag(inode, FI_COMPRESSED_FILE);
+	stat_dec_compr_inode(inode);
+	return 0;
+}
+
 #define F2FS_FEATURE_FUNCS(name, flagname) \
 static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
 { \
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a688a4cb212b..4163fc3db1a3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -523,6 +523,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 	if (err)
 		return err;
 
+	if (!f2fs_is_compress_backend_ready(inode))
+		return -EOPNOTSUPP;
+
 	err = fsverity_file_open(inode, filp);
 	if (err)
 		return err;
@@ -1821,7 +1824,6 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 				return -EINVAL;
 
 			set_compress_context(inode);
-			set_inode_flag(inode, FI_COMPRESSED_FILE);
 			stat_inc_compr_inode(inode);
 		}
 	}
@@ -2016,11 +2018,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 
 	inode_lock(inode);
 
-	if (f2fs_compressed_file(inode) && !fi->i_compressed_blocks) {
-		fi->i_flags &= ~F2FS_COMPR_FL;
-		clear_inode_flag(inode, FI_COMPRESSED_FILE);
-		stat_dec_compr_inode(inode);
-	}
+	f2fs_disable_compressed_file(inode);
 
 	if (f2fs_is_atomic_file(inode)) {
 		if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
@@ -3224,20 +3222,15 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 		goto out;
 	}
 
-	if (f2fs_compressed_file(inode)) {
-		if (F2FS_I(inode)->i_compressed_blocks) {
-			ret = -EOPNOTSUPP;
-			goto out;
-		}
-		F2FS_I(inode)->i_flags &= ~F2FS_COMPR_FL;
-		clear_inode_flag(inode, FI_COMPRESSED_FILE);
-		stat_dec_compr_inode(inode);
-	}
-
 	ret = f2fs_convert_inline_inode(inode);
 	if (ret)
 		goto out;
 
+	if (f2fs_disable_compressed_file(inode)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
 	set_inode_flag(inode, FI_PIN_FILE);
 	ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
 done:
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 7a85060adad5..3fa728f40c2a 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -421,7 +421,8 @@ static int do_read_inode(struct inode *inode)
 		fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
 	}
 
-	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi)) {
+	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) &&
+					(fi->i_flags & F2FS_COMPR_FL)) {
 		if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
 					i_log_cluster_size)) {
 			fi->i_compressed_blocks =
@@ -429,10 +430,8 @@ static int do_read_inode(struct inode *inode)
 			fi->i_compress_algorithm = ri->i_compress_algorithm;
 			fi->i_log_cluster_size = ri->i_log_cluster_size;
 			fi->i_cluster_size = 1 << fi->i_log_cluster_size;
-		}
-
-		if ((fi->i_flags & F2FS_COMPR_FL) && f2fs_may_compress(inode))
 			set_inode_flag(inode, FI_COMPRESSED_FILE);
+		}
 	}
 
 	F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 00c56a3e944b..ac6b1f946e03 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -122,11 +122,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	if (f2fs_sb_has_compression(sbi)) {
 		/* Inherit the compression flag in directory */
 		if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) &&
-					f2fs_may_compress(inode)) {
+					f2fs_may_compress(inode))
 			set_compress_context(inode);
-			F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
-			set_inode_flag(inode, FI_COMPRESSED_FILE);
-		}
 	}
 
 	f2fs_set_inode_flags(inode);
@@ -309,9 +306,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
 		if (!is_extension_exist(name, ext[i]))
 			continue;
 
-		F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
 		set_compress_context(inode);
-		set_inode_flag(inode, FI_COMPRESSED_FILE);
 		return;
 	}
 }
-- 
2.19.0.605.g01d371f741-goog



_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, back to index

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-22 17:16 [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Jaegeuk Kim
2019-10-22 17:16 ` [f2fs-dev] [PATCH 2/2] f2fs: support data compression Jaegeuk Kim
2019-10-22 17:53   ` Ju Hyung Park
2019-10-24  9:10     ` Chao Yu
2019-10-23  5:24   ` Eric Biggers
2019-10-23 17:28     ` Jaegeuk Kim
2019-10-25  9:07     ` Chao Yu
2019-10-27 22:50   ` Eric Biggers
2019-10-28  2:33     ` Chao Yu
2019-10-29  8:33     ` Chao Yu
2019-10-30  2:55       ` Eric Biggers
2019-10-30  8:43         ` Chao Yu
2019-10-30 16:50           ` Eric Biggers
2019-10-30 17:22             ` Gao Xiang via Linux-f2fs-devel
2019-10-30 17:47             ` Jaegeuk Kim
2019-10-31  2:16             ` Chao Yu
2019-10-31 15:35               ` Jaegeuk Kim
2019-11-01 10:02                 ` Chao Yu
2019-10-30 17:02           ` Eric Biggers
2019-10-31  2:21             ` Chao Yu
2019-11-13 13:10             ` Chao Yu
2019-11-18 16:11               ` Jaegeuk Kim
2019-11-18 20:58                 ` Jaegeuk Kim
2019-11-25 17:42                   ` Jaegeuk Kim
2019-10-24  8:21 ` [f2fs-dev] [PATCH 1/2] f2fs: support aligned pinned file Chao Yu
2019-10-25 18:18   ` Jaegeuk Kim
2019-10-26  1:31     ` Chao Yu
2019-10-30 16:09       ` Jaegeuk Kim
2019-10-31  2:27         ` Chao Yu
2019-10-31 15:29           ` Jaegeuk Kim
2019-11-05  3:39             ` Chao Yu
2019-11-07 19:14 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim

Linux-f2fs-devel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-f2fs-devel/0 linux-f2fs-devel/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-f2fs-devel linux-f2fs-devel/ https://lore.kernel.org/linux-f2fs-devel \
		linux-f2fs-devel@lists.sourceforge.net
	public-inbox-index linux-f2fs-devel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/net.sourceforge.lists.linux-f2fs-devel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git