All of lore.kernel.org
* [PATCH] f2fs: change virtual mapping way for compression pages
@ 2020-08-11  3:37 ` Daeho Jeong
  0 siblings, 0 replies; 20+ messages in thread
From: Daeho Jeong @ 2020-08-11  3:37 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel, kernel-team; +Cc: Daeho Jeong

From: Daeho Jeong <daehojeong@google.com>

While profiling f2fs compression workloads, I found that vmap() calls
are a bottleneck in the f2fs decompression path. Replacing them with
vm_map_ram() improves f2fs decompression speed considerably.

[Verification]
dd if=/dev/zero of=dummy bs=1m count=1000
echo 3 > /proc/sys/vm/drop_caches
dd if=dummy of=/dev/zero bs=512k

- w/o compression -
1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s

- before patch -
1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s

- after patch -
1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s

Signed-off-by: Daeho Jeong <daehojeong@google.com>
---
 fs/f2fs/compress.c | 42 ++++++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 6e7db450006c..46b7e359f313 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -554,6 +554,8 @@ static void f2fs_compress_free_page(struct page *page)
 	mempool_free(page, compress_page_pool);
 }
 
+#define MAX_VMAP_RETRIES	3
+
 static int f2fs_compress_pages(struct compress_ctx *cc)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
@@ -590,13 +592,23 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
 		}
 	}
 
-	cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO);
+	for (i = 0; i < MAX_VMAP_RETRIES; i++) {
+		cc->rbuf = vm_map_ram(cc->rpages, cc->cluster_size, -1);
+		if (cc->rbuf)
+			break;
+		vm_unmap_aliases();
+	}
 	if (!cc->rbuf) {
 		ret = -ENOMEM;
 		goto out_free_cpages;
 	}
 
-	cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL);
+	for (i = 0; i < MAX_VMAP_RETRIES; i++) {
+		cc->cbuf = vm_map_ram(cc->cpages, cc->nr_cpages, -1);
+		if (cc->cbuf)
+			break;
+		vm_unmap_aliases();
+	}
 	if (!cc->cbuf) {
 		ret = -ENOMEM;
 		goto out_vunmap_rbuf;
@@ -624,8 +636,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
 	memset(&cc->cbuf->cdata[cc->clen], 0,
 	       (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE));
 
-	vunmap(cc->cbuf);
-	vunmap(cc->rbuf);
+	vm_unmap_ram(cc->cbuf, cc->nr_cpages);
+	vm_unmap_ram(cc->rbuf, cc->cluster_size);
 
 	for (i = nr_cpages; i < cc->nr_cpages; i++) {
 		f2fs_compress_free_page(cc->cpages[i]);
@@ -642,9 +654,9 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
 	return 0;
 
 out_vunmap_cbuf:
-	vunmap(cc->cbuf);
+	vm_unmap_ram(cc->cbuf, cc->nr_cpages);
 out_vunmap_rbuf:
-	vunmap(cc->rbuf);
+	vm_unmap_ram(cc->rbuf, cc->cluster_size);
 out_free_cpages:
 	for (i = 0; i < cc->nr_cpages; i++) {
 		if (cc->cpages[i])
@@ -715,13 +727,23 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 			goto out_free_dic;
 	}
 
-	dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
+	for (i = 0; i < MAX_VMAP_RETRIES; i++) {
+		dic->rbuf = vm_map_ram(dic->tpages, dic->cluster_size, -1);
+		if (dic->rbuf)
+			break;
+		vm_unmap_aliases();
+	}
 	if (!dic->rbuf) {
 		ret = -ENOMEM;
 		goto destroy_decompress_ctx;
 	}
 
-	dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
+	for (i = 0; i < MAX_VMAP_RETRIES; i++) {
+		dic->cbuf = vm_map_ram(dic->cpages, dic->nr_cpages, -1);
+		if (dic->cbuf)
+			break;
+		vm_unmap_aliases();
+	}
 	if (!dic->cbuf) {
 		ret = -ENOMEM;
 		goto out_vunmap_rbuf;
@@ -738,9 +760,9 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 	ret = cops->decompress_pages(dic);
 
 out_vunmap_cbuf:
-	vunmap(dic->cbuf);
+	vm_unmap_ram(dic->cbuf, dic->nr_cpages);
 out_vunmap_rbuf:
-	vunmap(dic->rbuf);
+	vm_unmap_ram(dic->rbuf, dic->cluster_size);
 destroy_decompress_ctx:
 	if (cops->destroy_decompress_ctx)
 		cops->destroy_decompress_ctx(dic);
-- 
2.28.0.236.gb10cc79966-goog
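
A note on the API change above: vm_map_ram() can fail transiently when
the per-CPU vmap block space is exhausted by lazily freed mappings,
which is why the patch retries after calling vm_unmap_aliases() (that
call flushes the lazy mappings and releases their virtual space). Below
is a minimal sketch of the retry pattern factored into one helper;
f2fs_vmap() is a hypothetical name used for illustration, not part of
the posted diff:

#include <linux/mm.h>
#include <linux/vmalloc.h>

#define MAX_VMAP_RETRIES	3

/*
 * Map @count pages into a contiguous virtual range, retrying after
 * flushing lazily freed vmap aliases if an attempt fails.
 */
static void *f2fs_vmap(struct page **pages, unsigned int count)
{
	void *buf = NULL;
	int i;

	for (i = 0; i < MAX_VMAP_RETRIES; i++) {
		buf = vm_map_ram(pages, count, -1);	/* -1: any NUMA node */
		if (buf)
			break;
		vm_unmap_aliases();
	}
	return buf;
}

Note also that each vm_map_ram() call must later be paired with
vm_unmap_ram(buf, count): unlike vunmap(), it needs the page count
back, since no vm_struct records the mapping size.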


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11  3:37 ` [f2fs-dev] " Daeho Jeong
@ 2020-08-11  7:15   ` Gao Xiang
  -1 siblings, 0 replies; 20+ messages in thread
From: Gao Xiang @ 2020-08-11  7:15 UTC (permalink / raw)
  To: Daeho Jeong; +Cc: linux-kernel, linux-f2fs-devel, kernel-team, Daeho Jeong

On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> From: Daeho Jeong <daehojeong@google.com>
> 
> By profiling f2fs compression works, I've found vmap() callings are
> bottlenecks of f2fs decompression path. Changing these with
> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> 
> [Verification]
> dd if=/dev/zero of=dummy bs=1m count=1000
> echo 3 > /proc/sys/vm/drop_caches
> dd if=dummy of=/dev/zero bs=512k
> 
> - w/o compression -
> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> 
> - before patch -
> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> 
> - after patch -
> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s

Indeed, the vmap() approach has some impact on the whole
workflow, but I don't think the gap should be this significant;
maybe it relates to unlocked cpufreq (and the big.LITTLE core
difference if it's on some arm64 board).


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11  7:15   ` Gao Xiang
@ 2020-08-11  7:54     ` Chao Yu
  -1 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2020-08-11  7:54 UTC (permalink / raw)
  To: Daeho Jeong
  Cc: Gao Xiang, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

On 2020/8/11 15:15, Gao Xiang wrote:
> On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
>> From: Daeho Jeong <daehojeong@google.com>
>>
>> By profiling f2fs compression works, I've found vmap() callings are
>> bottlenecks of f2fs decompression path. Changing these with
>> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
>>
>> [Verification]
>> dd if=/dev/zero of=dummy bs=1m count=1000
>> echo 3 > /proc/sys/vm/drop_caches
>> dd if=dummy of=/dev/zero bs=512k
>>
>> - w/o compression -
>> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
>> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
>> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
>>
>> - before patch -
>> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
>> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
>> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
>>
>> - after patch -
>> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
>> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
>> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> 
> Indeed, vmap() approach has some impact on the whole
> workflow. But I don't think the gap is such significant,
> maybe it relates to unlocked cpufreq (and big little
> core difference if it's on some arm64 board).

Agreed,

I guess there should be some other reason causing the large performance
gap: scheduling, frequency, or something else.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11  7:54     ` Chao Yu
@ 2020-08-11  9:28       ` Daeho Jeong
  -1 siblings, 0 replies; 20+ messages in thread
From: Daeho Jeong @ 2020-08-11  9:28 UTC (permalink / raw)
  To: Chao Yu
  Cc: Gao Xiang, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

Actually, as you can see, the test file consists entirely of zero data
blocks, which maximizes the effect of changing the virtual mapping.
When I use normal files that compress to about 70% of their original
size, the vm_map_ram() version is still about 2x faster than the
vmap() version.

On Tue, Aug 11, 2020 at 4:55 PM Chao Yu <yuchao0@huawei.com> wrote:
>
> On 2020/8/11 15:15, Gao Xiang wrote:
> > On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> >> From: Daeho Jeong <daehojeong@google.com>
> >>
> >> By profiling f2fs compression works, I've found vmap() callings are
> >> bottlenecks of f2fs decompression path. Changing these with
> >> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> >>
> >> [Verification]
> >> dd if=/dev/zero of=dummy bs=1m count=1000
> >> echo 3 > /proc/sys/vm/drop_caches
> >> dd if=dummy of=/dev/zero bs=512k
> >>
> >> - w/o compression -
> >> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> >> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> >> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> >>
> >> - before patch -
> >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> >>
> >> - after patch -
> >> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> >> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> >> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> >
> > Indeed, vmap() approach has some impact on the whole
> > workflow. But I don't think the gap is such significant,
> > maybe it relates to unlocked cpufreq (and big little
> > core difference if it's on some arm64 board).
>
> Agreed,
>
> I guess there should be other reason causing the large performance
> gap, scheduling, frequency, or something else.
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11  9:28       ` Daeho Jeong
@ 2020-08-11  9:33         ` Daeho Jeong
  -1 siblings, 0 replies; 20+ messages in thread
From: Daeho Jeong @ 2020-08-11  9:33 UTC (permalink / raw)
  To: Chao Yu
  Cc: Gao Xiang, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

Plus, vmap() normally executes in a short time, just like
vm_map_ram(), but sometimes it incurs a very long delay.

On Tue, Aug 11, 2020 at 6:28 PM Daeho Jeong <daeho43@gmail.com> wrote:
>
> Actually, as you can see, I use the whole zero data blocks in the test file.
> It can maximize the effect of changing virtual mapping.
> When I use normal files which can be compressed about 70% from the
> original file,
> The vm_map_ram() version is about 2x faster than vmap() version.
>
> On Tue, Aug 11, 2020 at 4:55 PM Chao Yu <yuchao0@huawei.com> wrote:
> >
> > On 2020/8/11 15:15, Gao Xiang wrote:
> > > On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> > >> From: Daeho Jeong <daehojeong@google.com>
> > >>
> > >> By profiling f2fs compression works, I've found vmap() callings are
> > >> bottlenecks of f2fs decompression path. Changing these with
> > >> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> > >>
> > >> [Verification]
> > >> dd if=/dev/zero of=dummy bs=1m count=1000
> > >> echo 3 > /proc/sys/vm/drop_caches
> > >> dd if=dummy of=/dev/zero bs=512k
> > >>
> > >> - w/o compression -
> > >> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> > >> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> > >> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> > >>
> > >> - before patch -
> > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> > >>
> > >> - after patch -
> > >> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> > >> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> > >> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> > >
> > > Indeed, vmap() approach has some impact on the whole
> > > workflow. But I don't think the gap is such significant,
> > > maybe it relates to unlocked cpufreq (and big little
> > > core difference if it's on some arm64 board).
> >
> > Agreed,
> >
> > I guess there should be other reason causing the large performance
> > gap, scheduling, frequency, or something else.
> >
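
The occasional long delay is plausible given how the two APIs differ:
vmap() allocates a fresh vmap_area under global locking and, when the
vmalloc address space runs low, can block until lazily freed areas are
purged, while vm_map_ram() serves small requests (an f2fs cluster is
only a handful of pages) from a per-CPU vmap block cache. A rough
sketch, not from this thread, of how such outliers could be confirmed;
timed_vmap() is a hypothetical instrumentation wrapper:

#include <linux/ktime.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/vmalloc.h>

/* Time each vmap() call and log the rare slow ones. */
static void *timed_vmap(struct page **pages, unsigned int count)
{
	ktime_t start = ktime_get();
	void *buf = vmap(pages, count, VM_MAP, PAGE_KERNEL);
	s64 delta_us = ktime_us_delta(ktime_get(), start);

	if (delta_us > 1000)	/* most calls finish in microseconds */
		pr_info("vmap(%u pages) took %lld us\n", count, delta_us);
	return buf;
}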

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11  9:33         ` Daeho Jeong
@ 2020-08-11 10:18           ` Gao Xiang
  -1 siblings, 0 replies; 20+ messages in thread
From: Gao Xiang @ 2020-08-11 10:18 UTC (permalink / raw)
  To: Daeho Jeong
  Cc: Chao Yu, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

On Tue, Aug 11, 2020 at 06:33:26PM +0900, Daeho Jeong wrote:
> Plus, when we use vmap(), vmap() normally executes in a short time
> like vm_map_ram().
> But, sometimes, it has a very long delay.
> 
> On Tue, Aug 11, 2020 at 6:28 PM Daeho Jeong <daeho43@gmail.com> wrote:
> >
> > Actually, as you can see, I use the whole zero data blocks in the test file.
> > It can maximize the effect of changing virtual mapping.
> > When I use normal files which can be compressed about 70% from the
> > original file,
> > The vm_map_ram() version is about 2x faster than vmap() version.

What f2fs does is much like btrfs compression. Even if these
blocks are all zeroed, the maximum compression ratio is bounded in
principle (cluster-sized blocks compress into at best one compressed
block, e.g. a 16k cluster into one compressed block).
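
As a back-of-the-envelope illustration (assuming 4 KiB pages, which is
not stated in the thread): a 16k cluster is 4 pages in, and the best
case out is a single compressed page holding the data plus
COMPRESS_HEADER_SIZE, so the on-disk saving is capped near 4:1 no
matter how compressible the data is.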

So it'd be better to describe your configured cluster size (16k or
128k) and your hardware information in the commit message as well.

Actually, I also tried this patch on my x86 laptop just now with FIO
(I didn't use zeroed blocks though), and I didn't notice much
difference with turbo boost off and the frequency locked at max.

I'm not objecting to this commit, just making a note about the commit message.
> > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s

IMHO, the above numbers look a lot like decompression running on the arm64 little cores.

Thanks,
Gao Xiang


> >
> > On Tue, Aug 11, 2020 at 4:55 PM Chao Yu <yuchao0@huawei.com> wrote:
> > >
> > > On 2020/8/11 15:15, Gao Xiang wrote:
> > > > On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> > > >> From: Daeho Jeong <daehojeong@google.com>
> > > >>
> > > >> By profiling f2fs compression works, I've found vmap() callings are
> > > >> bottlenecks of f2fs decompression path. Changing these with
> > > >> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> > > >>
> > > >> [Verification]
> > > >> dd if=/dev/zero of=dummy bs=1m count=1000
> > > >> echo 3 > /proc/sys/vm/drop_caches
> > > >> dd if=dummy of=/dev/zero bs=512k
> > > >>
> > > >> - w/o compression -
> > > >> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> > > >>
> > > >> - before patch -
> > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> > > >>
> > > >> - after patch -
> > > >> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> > > >> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> > > >
> > > > Indeed, vmap() approach has some impact on the whole
> > > > workflow. But I don't think the gap is such significant,
> > > > maybe it relates to unlocked cpufreq (and big little
> > > > core difference if it's on some arm64 board).
> > >
> > > Agreed,
> > >
> > > I guess there should be other reason causing the large performance
> > > gap, scheduling, frequency, or something else.
> > >


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11 10:18           ` Gao Xiang
@ 2020-08-11 11:21             ` Daeho Jeong
  -1 siblings, 0 replies; 20+ messages in thread
From: Daeho Jeong @ 2020-08-11 11:21 UTC (permalink / raw)
  To: Gao Xiang
  Cc: Chao Yu, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

Sure, I'll add the test conditions you mentioned to the commit message.
FYI, the test was done with a 16kb cluster size on a Pixel 3 (arm64) device.

Thanks,

On Tue, Aug 11, 2020 at 7:18 PM Gao Xiang <hsiangkao@redhat.com> wrote:
>
> On Tue, Aug 11, 2020 at 06:33:26PM +0900, Daeho Jeong wrote:
> > Plus, when we use vmap(), vmap() normally executes in a short time
> > like vm_map_ram().
> > But, sometimes, it has a very long delay.
> >
> > On Tue, Aug 11, 2020 at 6:28 PM Daeho Jeong <daeho43@gmail.com> wrote:
> > >
> > > Actually, as you can see, I use the whole zero data blocks in the test file.
> > > It can maximize the effect of changing virtual mapping.
> > > When I use normal files which can be compressed about 70% from the
> > > original file,
> > > The vm_map_ram() version is about 2x faster than vmap() version.
>
> What f2fs does is much similar to btrfs compression. Even if these
> blocks are all zeroed. In principle, the maximum compression ratio
> is determined (cluster sized blocks into one compressed block, e.g
> 16k cluster into one compressed block).
>
> So it'd be better to describe your configured cluster size (16k or
> 128k) and your hardware information in the commit message as well.
>
> Actually, I also tried with this patch as well on my x86 laptop just
> now with FIO (I didn't use zeroed block though), and I didn't notice
> much difference with turbo boost off and maxfreq.
>
> I'm not arguing this commit, just a note about this commit message.
> > > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
>
> IMHO, the above number is much like decompressing in the arm64 little cores.
>
> Thanks,
> Gao Xiang
>
>
> > >
> > > On Tue, Aug 11, 2020 at 4:55 PM Chao Yu <yuchao0@huawei.com> wrote:
> > > >
> > > > On 2020/8/11 15:15, Gao Xiang wrote:
> > > > > On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> > > > >> From: Daeho Jeong <daehojeong@google.com>
> > > > >>
> > > > >> By profiling f2fs compression works, I've found vmap() callings are
> > > > >> bottlenecks of f2fs decompression path. Changing these with
> > > > >> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> > > > >>
> > > > >> [Verification]
> > > > >> dd if=/dev/zero of=dummy bs=1m count=1000
> > > > >> echo 3 > /proc/sys/vm/drop_caches
> > > > >> dd if=dummy of=/dev/zero bs=512k
> > > > >>
> > > > >> - w/o compression -
> > > > >> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> > > > >>
> > > > >> - before patch -
> > > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> > > > >>
> > > > >> - after patch -
> > > > >> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> > > > >> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> > > > >
> > > > > Indeed, vmap() approach has some impact on the whole
> > > > > workflow. But I don't think the gap is such significant,
> > > > > maybe it relates to unlocked cpufreq (and big little
> > > > > core difference if it's on some arm64 board).
> > > >
> > > > Agreed,
> > > >
> > > > I guess there should be other reason causing the large performance
> > > > gap, scheduling, frequency, or something else.
> > > >
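
Given the 16kb cluster size confirmed here, a back-of-the-envelope
estimate of the per-call mapping cost is possible (assuming 4 KiB
pages; this arithmetic is not from the thread):

  1048576000 B / 16384 B per cluster    = 64000 clusters read
  2 mappings per cluster (rbuf + cbuf)  = ~128000 map calls per run
  (9.15 s - 2.25 s) / 128000 calls      = ~54 us saved per call on average

An average says nothing about the distribution, though; whether the
cost is uniform or dominated by rare slow calls is exactly the open
question in this thread.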

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11 11:21             ` Daeho Jeong
@ 2020-08-11 11:29               ` Gao Xiang
  -1 siblings, 0 replies; 20+ messages in thread
From: Gao Xiang @ 2020-08-11 11:29 UTC (permalink / raw)
  To: Daeho Jeong
  Cc: Chao Yu, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

On Tue, Aug 11, 2020 at 08:21:23PM +0900, Daeho Jeong wrote:
> Sure, I'll update the test condition as you said in the commit message.
> FYI, the test is done with 16kb chunk and Pixel 3 (arm64) device.

Yeah, anyway, it'd be better to lock the frequency and offline the
little cores in your test as well (it'd make more sense). E.g. if a
16k cluster is applied, even with all the data zeroed, the number of
vmap()/vm_map_ram() calls isn't huge (and, as you said, "sometimes, it
has a very long delay", which sounds much like a scheduling concern as
well).

Anyway, I'm not against your commit, but the commit message is a bit
unclear. At least, if you think that is really the case, I'm OK with
that.

Thanks,
Gao Xiang 

> 
> Thanks,
> 
> On Tue, Aug 11, 2020 at 7:18 PM Gao Xiang <hsiangkao@redhat.com> wrote:
> >
> > On Tue, Aug 11, 2020 at 06:33:26PM +0900, Daeho Jeong wrote:
> > > Plus, when we use vmap(), vmap() normally executes in a short time
> > > like vm_map_ram().
> > > But, sometimes, it has a very long delay.
> > >
> > > 2020년 8월 11� (화) 오후 6:28, Daeho Jeong <daeho43@gmail.com>님� 작성:
> > > >
> > > > Actually, as you can see, I use the whole zero data blocks in the test file.
> > > > It can maximize the effect of changing virtual mapping.
> > > > When I use normal files which can be compressed about 70% from the
> > > > original file,
> > > > The vm_map_ram() version is about 2x faster than vmap() version.
> >
> > What f2fs does is much similar to btrfs compression. Even if these
> > blocks are all zeroed. In principle, the maximum compression ratio
> > is determined (cluster sized blocks into one compressed block, e.g
> > 16k cluster into one compressed block).
> >
> > So it'd be better to describe your configured cluster size (16k or
> > 128k) and your hardware information in the commit message as well.
> >
> > Actually, I also tried with this patch as well on my x86 laptop just
> > now with FIO (I didn't use zeroed block though), and I didn't notice
> > much difference with turbo boost off and maxfreq.
> >
> > I'm not arguing this commit, just a note about this commit message.
> > > > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> >
> > IMHO, the above number is much like decompressing in the arm64 little cores.
> >
> > Thanks,
> > Gao Xiang
> >
> >
> > > >
> > > > 2020년 8월 11� (화) 오후 4:55, Chao Yu <yuchao0@huawei.com>님� 작성:
> > > > >
> > > > > On 2020/8/11 15:15, Gao Xiang wrote:
> > > > > > On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> > > > > >> From: Daeho Jeong <daehojeong@google.com>
> > > > > >>
> > > > > >> By profiling f2fs compression works, I've found vmap() callings are
> > > > > >> bottlenecks of f2fs decompression path. Changing these with
> > > > > >> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> > > > > >>
> > > > > >> [Verification]
> > > > > >> dd if=/dev/zero of=dummy bs=1m count=1000
> > > > > >> echo 3 > /proc/sys/vm/drop_caches
> > > > > >> dd if=dummy of=/dev/zero bs=512k
> > > > > >>
> > > > > >> - w/o compression -
> > > > > >> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> > > > > >>
> > > > > >> - before patch -
> > > > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> > > > > >>
> > > > > >> - after patch -
> > > > > >> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> > > > > >> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> > > > > >
> > > > > > Indeed, vmap() approach has some impact on the whole
> > > > > > workflow. But I don't think the gap is such significant,
> > > > > > maybe it relates to unlocked cpufreq (and big little
> > > > > > core difference if it's on some arm64 board).
> > > > >
> > > > > Agreed,
> > > > >
> > > > > I guess there should be other reason causing the large performance
> > > > > gap, scheduling, frequency, or something else.
> > > > >
> > >
> >
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11 11:29               ` Gao Xiang
@ 2020-08-11 11:31                 ` Daeho Jeong
  -1 siblings, 0 replies; 20+ messages in thread
From: Daeho Jeong @ 2020-08-11 11:31 UTC (permalink / raw)
  To: Gao Xiang
  Cc: Chao Yu, Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

Plus, unlike on your testbed, on my Pixel device there seems to be
much more contention in the vmap() operation.
If that contention weren't there, I agree that there might not be a
big difference between vmap() and vm_map_ram().

On Tue, Aug 11, 2020 at 8:29 PM Gao Xiang <hsiangkao@redhat.com> wrote:
>
> On Tue, Aug 11, 2020 at 08:21:23PM +0900, Daeho Jeong wrote:
> > Sure, I'll update the test condition as you said in the commit message.
> > FYI, the test is done with 16kb chunk and Pixel 3 (arm64) device.
>
> Yeah, anyway, it'd better to lock the freq and offline the little
> cores in your test as well (it'd make more sense). e.g. if 16k cluster
> is applied, even all data is zeroed, the count of vmap/vm_map_ram
> isn't hugeous (and as you said, "sometimes, it has a very long delay",
> it's much like another scheduling concern as well).
>
> Anyway, I'm not against your commit but the commit message is a bit
> of unclear. At least, if you think that is really the case, I'm ok
> with that.
>
> Thanks,
> Gao Xiang
>
> >
> > Thanks,
> >
> > On Tue, Aug 11, 2020 at 7:18 PM Gao Xiang <hsiangkao@redhat.com> wrote:
> > >
> > > On Tue, Aug 11, 2020 at 06:33:26PM +0900, Daeho Jeong wrote:
> > > > Plus, when we use vmap(), vmap() normally executes in a short time
> > > > like vm_map_ram().
> > > > But, sometimes, it has a very long delay.
> > > >
> > > > 2020년 8� 11� (�) 오후 6:28, Daeho Jeong <daeho43@gmail.com>님� 작성:
> > > > >
> > > > > Actually, as you can see, I use the whole zero data blocks in the test file.
> > > > > It can maximize the effect of changing virtual mapping.
> > > > > When I use normal files which can be compressed about 70% from the
> > > > > original file,
> > > > > The vm_map_ram() version is about 2x faster than vmap() version.
> > >
> > > What f2fs does is much similar to btrfs compression. Even if these
> > > blocks are all zeroed. In principle, the maximum compression ratio
> > > is determined (cluster sized blocks into one compressed block, e.g
> > > 16k cluster into one compressed block).
> > >
> > > So it'd be better to describe your configured cluster size (16k or
> > > 128k) and your hardware information in the commit message as well.
> > >
> > > Actually, I also tried this patch on my x86 laptop just now with
> > > FIO (though I didn't use zeroed blocks), and I didn't notice much
> > > difference with turbo boost off and the frequency pinned at maximum.
> > >
> > > I'm not arguing against this commit, just making a note about the commit message.
> > > > > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> > >
> > > IMHO, the above numbers look much like decompression running on the arm64 little cores.
> > >
> > > Thanks,
> > > Gao Xiang
> > >
> > >
> > > > >
> > > > > On Tue, Aug 11, 2020 at 4:55 PM, Chao Yu <yuchao0@huawei.com> wrote:
> > > > > >
> > > > > > On 2020/8/11 15:15, Gao Xiang wrote:
> > > > > > > On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
> > > > > > >> From: Daeho Jeong <daehojeong@google.com>
> > > > > > >>
> > > > > > >> By profiling f2fs compression works, I've found vmap() callings are
> > > > > > >> bottlenecks of f2fs decompression path. Changing these with
> > > > > > >> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
> > > > > > >>
> > > > > > >> [Verification]
> > > > > > >> dd if=/dev/zero of=dummy bs=1m count=1000
> > > > > > >> echo 3 > /proc/sys/vm/drop_caches
> > > > > > >> dd if=dummy of=/dev/zero bs=512k
> > > > > > >>
> > > > > > >> - w/o compression -
> > > > > > >> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
> > > > > > >>
> > > > > > >> - before patch -
> > > > > > >> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
> > > > > > >>
> > > > > > >> - after patch -
> > > > > > >> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
> > > > > > >> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
> > > > > > >
> > > > > > > Indeed, the vmap() approach has some impact on the whole
> > > > > > > workflow, but I don't think the gap is this significant;
> > > > > > > maybe it relates to unlocked cpufreq (and the big.LITTLE
> > > > > > > core difference, if it's on some arm64 board).
> > > > > >
> > > > > > Agreed,
> > > > > >
> > > > > > I guess there must be some other reason causing the large
> > > > > > performance gap: scheduling, frequency, or something else.
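
As a rough sanity check on the call count discussed above, assuming the
16KB cluster size mentioned in this thread (a back-of-the-envelope
estimate, not a measurement):

	1048576000 bytes / 16384 bytes per cluster = 64000 clusters

so a single pass of the dd read performs on the order of 64000 map/unmap
pairs on the decompression path. At that count, an average extra delay of
only ~100 microseconds per mapping would account for the ~6-7 seconds of
difference between the before and after numbers.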

* Re: [f2fs-dev] [PATCH] f2fs: change virtual mapping way for compression pages
  2020-08-11 11:31                 ` Daeho Jeong
@ 2020-08-12  1:51                   ` Chao Yu
  -1 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2020-08-12  1:51 UTC (permalink / raw)
  To: Daeho Jeong, Gao Xiang
  Cc: Daeho Jeong, kernel-team, linux-kernel, linux-f2fs-devel

On 2020/8/11 19:31, Daeho Jeong wrote:
> Plus, unlike your testbed, my Pixel device seems to show much more
> contention in the vmap() operation. If that contention is absent, I
> agree there might not be a big difference between vmap() and
> vm_map_ram().
> 
> On Tue, Aug 11, 2020 at 8:29 PM, Gao Xiang <hsiangkao@redhat.com> wrote:
>>
>> On Tue, Aug 11, 2020 at 08:21:23PM +0900, Daeho Jeong wrote:
>>> Sure, I'll update the commit message with the test conditions as you suggested.
>>> FYI, the test was done with a 16KB cluster size on a Pixel 3 (arm64) device.
>>
>> Yeah, anyway, it'd be better to lock the CPU frequency and offline the little
>> cores in your test as well (it'd make more sense), e.g. if a 16k cluster

I'm not against this commit, but could you please try to pin cpufreq to a
fixed value and offline the little (or big) cores, so that we have a fair
test environment? I just wonder how much vm_map_ram() can improve the
performance in such an environment (see the sketch after this message for
one way to set that up).

>> is applied, even if all the data is zeroed, the number of vmap()/vm_map_ram()
>> calls isn't huge (and since, as you said, "sometimes it has a very long
>> delay", it's much like another scheduling concern as well).
>>
>> Anyway, I'm not against your commit, but the commit message is a bit
>> unclear. At least, if you think that is really the case, I'm OK
>> with that.
>>
>> Thanks,
>> Gao Xiang
>>
>>>
>>> Thanks,
>>>
>>> On Tue, Aug 11, 2020 at 7:18 PM, Gao Xiang <hsiangkao@redhat.com> wrote:
>>>>
>>>> On Tue, Aug 11, 2020 at 06:33:26PM +0900, Daeho Jeong wrote:
>>>>> Plus, when we use vmap(), it normally executes in a short time,
>>>>> like vm_map_ram(), but sometimes it has a very long delay.
>>>>>
>>>>> On Tue, Aug 11, 2020 at 6:28 PM, Daeho Jeong <daeho43@gmail.com> wrote:
>>>>>>
>>>>>> Actually, as you can see, I use all-zero data blocks in the test file,
>>>>>> which maximizes the effect of changing the virtual mapping.
>>>>>> When I use normal files, which can be compressed by about 70%,
>>>>>> the vm_map_ram() version is about 2x faster than the vmap() version.
>>>>
>>>> What f2fs does is quite similar to btrfs compression, even if these
>>>> blocks are all zeroed. In principle, the maximum compression ratio
>>>> is fixed (cluster-sized blocks go into one compressed block, e.g.
>>>> a 16k cluster into one compressed block).
>>>>
>>>> So it'd be better to describe your configured cluster size (16k or
>>>> 128k) and your hardware information in the commit message as well.
>>>>
>>>> Actually, I also tried this patch on my x86 laptop just now with
>>>> FIO (though I didn't use zeroed blocks), and I didn't notice much
>>>> difference with turbo boost off and the frequency pinned at maximum.
>>>>
>>>> I'm not arguing against this commit, just making a note about the commit message.
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
>>>>
>>>> IMHO, the above numbers look much like decompression running on the arm64 little cores.
>>>>
>>>> Thanks,
>>>> Gao Xiang
>>>>
>>>>
>>>>>>
>>>>>> On Tue, Aug 11, 2020 at 4:55 PM, Chao Yu <yuchao0@huawei.com> wrote:
>>>>>>>
>>>>>>> On 2020/8/11 15:15, Gao Xiang wrote:
>>>>>>>> On Tue, Aug 11, 2020 at 12:37:53PM +0900, Daeho Jeong wrote:
>>>>>>>>> From: Daeho Jeong <daehojeong@google.com>
>>>>>>>>>
>>>>>>>>> By profiling f2fs compression works, I've found vmap() callings are
>>>>>>>>> bottlenecks of f2fs decompression path. Changing these with
>>>>>>>>> vm_map_ram(), we can enhance f2fs decompression speed pretty much.
>>>>>>>>>
>>>>>>>>> [Verification]
>>>>>>>>> dd if=/dev/zero of=dummy bs=1m count=1000
>>>>>>>>> echo 3 > /proc/sys/vm/drop_caches
>>>>>>>>> dd if=dummy of=/dev/zero bs=512k
>>>>>>>>>
>>>>>>>>> - w/o compression -
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 1.999384 s, 500 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 2.035988 s, 491 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 2.039457 s, 490 M/s
>>>>>>>>>
>>>>>>>>> - before patch -
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 9.146217 s, 109 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 9.997542 s, 100 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 10.109727 s, 99 M/s
>>>>>>>>>
>>>>>>>>> - after patch -
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 2.253441 s, 444 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 2.739764 s, 365 M/s
>>>>>>>>> 1048576000 bytes (0.9 G) copied, 2.185649 s, 458 M/s
>>>>>>>>
>>>>>>>> Indeed, the vmap() approach has some impact on the whole
>>>>>>>> workflow, but I don't think the gap is this significant;
>>>>>>>> maybe it relates to unlocked cpufreq (and the big.LITTLE
>>>>>>>> core difference, if it's on some arm64 board).
>>>>>>>
>>>>>>> Agreed,
>>>>>>>
>>>>>>> I guess there must be some other reason causing the large
>>>>>>> performance gap: scheduling, frequency, or something else.
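
A minimal sketch of the kind of pinning suggested above, assuming a
Pixel 3-style arm64 layout (cpu0-3 little, cpu4-7 big); exact sysfs
paths, core numbering, and available governors vary by SoC and kernel:

	# Run as root (e.g. via adb shell on a rooted device).
	# Offline the little cores so only the big cores run the test.
	for c in 0 1 2 3; do
		echo 0 > /sys/devices/system/cpu/cpu$c/online
	done

	# Pin the big cores at their maximum frequency by switching
	# them to the performance governor.
	for c in 4 5 6 7; do
		echo performance > /sys/devices/system/cpu/cpu$c/cpufreq/scaling_governor
	done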


Thread overview:
2020-08-11  3:37 [PATCH] f2fs: change virtual mapping way for compression pages Daeho Jeong
2020-08-11  7:15 ` Gao Xiang
2020-08-11  7:54   ` Chao Yu
2020-08-11  9:28     ` Daeho Jeong
2020-08-11  9:33       ` Daeho Jeong
2020-08-11 10:18         ` Gao Xiang
2020-08-11 11:21           ` Daeho Jeong
2020-08-11 11:29             ` Gao Xiang
2020-08-11 11:31               ` Daeho Jeong
2020-08-12  1:51                 ` Chao Yu
