linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 0/4] zram: implement deduplication in zram
@ 2017-04-26  0:52 js1304
  2017-04-26  0:52 ` [PATCH v4 1/4] zram: introduce zram_entry to prepare dedup functionality js1304
                   ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: js1304 @ 2017-04-26  0:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, Sergey Senozhatsky, linux-kernel, kernel-team, Joonsoo Kim

From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Changes from v3
o fix module build problem
o make zram_dedup_enabled() statically return false if CONFIG_ZRAM_DEDUP=n

Changes from v2
o rebase to latest zram code
o manage alloc/free of the zram_entry in zram_drv.c
o remove useless RB_CLEAR_NODE
o set RO permission for use_dedup sysfs entry if CONFIG_ZRAM_DEDUP=n

Changes from v1
o reorganize dedup specific functions
o support Kconfig on/off
o update zram documents
o compare all the entries with same checksum (patch #4)

This patchset implements a deduplication feature in zram. The motivation
is to save memory usage by zram. There is a detailed description
about the motivation and experimental results in patch #2 so please
refer to it.

Thanks.

Joonsoo Kim (4):
  zram: introduce zram_entry to prepare dedup functionality
  zram: implement deduplication in zram
  zram: make deduplication feature optional
  zram: compare all the entries with same checksum for deduplication

 Documentation/ABI/testing/sysfs-block-zram |  10 ++
 Documentation/blockdev/zram.txt            |   3 +
 drivers/block/zram/Kconfig                 |  14 ++
 drivers/block/zram/Makefile                |   3 +-
 drivers/block/zram/zram_dedup.c            | 254 +++++++++++++++++++++++++++++
 drivers/block/zram/zram_dedup.h            |  45 +++++
 drivers/block/zram/zram_drv.c              | 184 +++++++++++++++++----
 drivers/block/zram/zram_drv.h              |  32 +++-
 8 files changed, 507 insertions(+), 38 deletions(-)
 create mode 100644 drivers/block/zram/zram_dedup.c
 create mode 100644 drivers/block/zram/zram_dedup.h

-- 
2.7.4

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4 1/4] zram: introduce zram_entry to prepare dedup functionality
  2017-04-26  0:52 [PATCH v4 0/4] zram: implement deduplication in zram js1304
@ 2017-04-26  0:52 ` js1304
  2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 25+ messages in thread
From: js1304 @ 2017-04-26  0:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, Sergey Senozhatsky, linux-kernel, kernel-team, Joonsoo Kim

From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Following patch will implement deduplication functionality
in the zram and it requires an indirection layer to manage
the life cycle of zsmalloc handle. To prepare that, this patch
introduces zram_entry which can be used to manage the life-cycle
of zsmalloc handle. Many lines are changed due to the rename but
the core change is just the simple introduction of a new data structure.

Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 drivers/block/zram/zram_drv.c | 95 +++++++++++++++++++++++++++----------------
 drivers/block/zram/zram_drv.h |  6 ++-
 2 files changed, 66 insertions(+), 35 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index debee95..26dc4e5 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -57,14 +57,15 @@ static inline struct zram *dev_to_zram(struct device *dev)
 	return (struct zram *)dev_to_disk(dev)->private_data;
 }
 
-static unsigned long zram_get_handle(struct zram *zram, u32 index)
+static struct zram_entry *zram_get_entry(struct zram *zram, u32 index)
 {
-	return zram->table[index].handle;
+	return zram->table[index].entry;
 }
 
-static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+static void zram_set_entry(struct zram *zram, u32 index,
+			struct zram_entry *entry)
 {
-	zram->table[index].handle = handle;
+	zram->table[index].entry = entry;
 }
 
 /* flag operations require table entry bit_spin_lock() being held */
@@ -437,7 +438,7 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
 				unsigned int offset, unsigned int len)
 {
 	zram_slot_lock(zram, index);
-	if (unlikely(!zram_get_handle(zram, index) ||
+	if (unlikely(!zram_get_entry(zram, index) ||
 			zram_test_flag(zram, index, ZRAM_SAME))) {
 		void *mem;
 
@@ -476,6 +477,32 @@ static bool zram_same_page_write(struct zram *zram, u32 index,
 	return false;
 }
 
+static struct zram_entry *zram_entry_alloc(struct zram *zram,
+					unsigned int len, gfp_t flags)
+{
+	struct zram_entry *entry;
+
+	entry = kzalloc(sizeof(*entry),
+			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+	if (!entry)
+		return NULL;
+
+	entry->handle = zs_malloc(zram->mem_pool, len, flags);
+	if (!entry->handle) {
+		kfree(entry);
+		return NULL;
+	}
+
+	return entry;
+}
+
+static inline void zram_entry_free(struct zram *zram,
+			struct zram_entry *entry)
+{
+	zs_free(zram->mem_pool, entry->handle);
+	kfree(entry);
+}
+
 static void zram_meta_free(struct zram *zram, u64 disksize)
 {
 	size_t num_pages = disksize >> PAGE_SHIFT;
@@ -514,7 +541,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
  */
 static void zram_free_page(struct zram *zram, size_t index)
 {
-	unsigned long handle = zram_get_handle(zram, index);
+	struct zram_entry *entry = zram_get_entry(zram, index);
 
 	/*
 	 * No memory is allocated for same element filled pages.
@@ -527,23 +554,23 @@ static void zram_free_page(struct zram *zram, size_t index)
 		return;
 	}
 
-	if (!handle)
+	if (!entry)
 		return;
 
-	zs_free(zram->mem_pool, handle);
+	zram_entry_free(zram, entry);
 
 	atomic64_sub(zram_get_obj_size(zram, index),
 			&zram->stats.compr_data_size);
 	atomic64_dec(&zram->stats.pages_stored);
 
-	zram_set_handle(zram, index, 0);
+	zram_set_entry(zram, index, NULL);
 	zram_set_obj_size(zram, index, 0);
 }
 
 static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 {
 	int ret;
-	unsigned long handle;
+	struct zram_entry *entry;
 	unsigned int size;
 	void *src, *dst;
 
@@ -551,10 +578,10 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 		return 0;
 
 	zram_slot_lock(zram, index);
-	handle = zram_get_handle(zram, index);
+	entry = zram_get_entry(zram, index);
 	size = zram_get_obj_size(zram, index);
 
-	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
+	src = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
 	if (size == PAGE_SIZE) {
 		dst = kmap_atomic(page);
 		memcpy(dst, src, PAGE_SIZE);
@@ -568,7 +595,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 		kunmap_atomic(dst);
 		zcomp_stream_put(zram->comp);
 	}
-	zs_unmap_object(zram->mem_pool, handle);
+	zs_unmap_object(zram->mem_pool, entry->handle);
 	zram_slot_unlock(zram, index);
 
 	/* Should NEVER happen. Return bio error if it does. */
@@ -612,14 +639,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 }
 
 static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
-			struct page *page,
-			unsigned long *out_handle, unsigned int *out_comp_len)
+			struct page *page, struct zram_entry **out_entry,
+			unsigned int *out_comp_len)
 {
 	int ret;
 	unsigned int comp_len;
 	void *src;
 	unsigned long alloced_pages;
-	unsigned long handle = 0;
+	struct zram_entry *entry = NULL;
 
 compress_again:
 	src = kmap_atomic(page);
@@ -628,8 +655,8 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
 
 	if (unlikely(ret)) {
 		pr_err("Compression failed! err=%d\n", ret);
-		if (handle)
-			zs_free(zram->mem_pool, handle);
+		if (entry)
+			zram_entry_free(zram, entry);
 		return ret;
 	}
 
@@ -637,32 +664,32 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
 		comp_len = PAGE_SIZE;
 
 	/*
-	 * handle allocation has 2 paths:
+	 * entry allocation has 2 paths:
 	 * a) fast path is executed with preemption disabled (for
 	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
 	 *  since we can't sleep;
 	 * b) slow path enables preemption and attempts to allocate
 	 *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
 	 *  put per-cpu compression stream and, thus, to re-do
-	 *  the compression once handle is allocated.
+	 *  the compression once entry is allocated.
 	 *
-	 * if we have a 'non-null' handle here then we are coming
-	 * from the slow path and handle has already been allocated.
+	 * if we have a 'non-null' entry here then we are coming
+	 * from the slow path and entry has already been allocated.
 	 */
-	if (!handle)
-		handle = zs_malloc(zram->mem_pool, comp_len,
+	if (!entry)
+		entry = zram_entry_alloc(zram, comp_len,
 				__GFP_KSWAPD_RECLAIM |
 				__GFP_NOWARN |
 				__GFP_HIGHMEM |
 				__GFP_MOVABLE);
-	if (!handle) {
+	if (!entry) {
 		zcomp_stream_put(zram->comp);
 		atomic64_inc(&zram->stats.writestall);
-		handle = zs_malloc(zram->mem_pool, comp_len,
+		entry = zram_entry_alloc(zram, comp_len,
 				GFP_NOIO | __GFP_HIGHMEM |
 				__GFP_MOVABLE);
 		*zstrm = zcomp_stream_get(zram->comp);
-		if (handle)
+		if (entry)
 			goto compress_again;
 		return -ENOMEM;
 	}
@@ -671,11 +698,11 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
 	update_used_max(zram, alloced_pages);
 
 	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
-		zs_free(zram->mem_pool, handle);
+		zram_entry_free(zram, entry);
 		return -ENOMEM;
 	}
 
-	*out_handle = handle;
+	*out_entry = entry;
 	*out_comp_len = comp_len;
 	return 0;
 }
@@ -683,7 +710,7 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 {
 	int ret;
-	unsigned long handle;
+	struct zram_entry *entry;
 	unsigned int comp_len;
 	void *src, *dst;
 	struct zcomp_strm *zstrm;
@@ -693,13 +720,13 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 		return 0;
 
 	zstrm = zcomp_stream_get(zram->comp);
-	ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
+	ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
 	if (ret) {
 		zcomp_stream_put(zram->comp);
 		return ret;
 	}
 
-	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+	dst = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_WO);
 
 	src = zstrm->buffer;
 	if (comp_len == PAGE_SIZE)
@@ -709,7 +736,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 		kunmap_atomic(src);
 
 	zcomp_stream_put(zram->comp);
-	zs_unmap_object(zram->mem_pool, handle);
+	zs_unmap_object(zram->mem_pool, entry->handle);
 
 	/*
 	 * Free memory associated with this sector
@@ -717,7 +744,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 	 */
 	zram_slot_lock(zram, index);
 	zram_free_page(zram, index);
-	zram_set_handle(zram, index, handle);
+	zram_set_entry(zram, index, entry);
 	zram_set_obj_size(zram, index, comp_len);
 	zram_slot_unlock(zram, index);
 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e34e44d..fe3d216 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -69,10 +69,14 @@ enum zram_pageflags {
 
 /*-- Data structures */
 
+struct zram_entry {
+	unsigned long handle;
+};
+
 /* Allocated for each disk page */
 struct zram_table_entry {
 	union {
-		unsigned long handle;
+		struct zram_entry *entry;
 		unsigned long element;
 	};
 	unsigned long value;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  0:52 [PATCH v4 0/4] zram: implement deduplication in zram js1304
  2017-04-26  0:52 ` [PATCH v4 1/4] zram: introduce zram_entry to prepare dedup functionality js1304
@ 2017-04-26  0:52 ` js1304
  2017-04-26  2:14   ` Sergey Senozhatsky
                     ` (3 more replies)
  2017-04-26  0:52 ` [PATCH v4 3/4] zram: make deduplication feature optional js1304
                   ` (2 subsequent siblings)
  4 siblings, 4 replies; 25+ messages in thread
From: js1304 @ 2017-04-26  0:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, Sergey Senozhatsky, linux-kernel, kernel-team, Joonsoo Kim

From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

This patch implements deduplication feature in zram. The purpose
of this work is naturally to save amount of memory usage by zram.

Android is one of the biggest users to use zram as swap and it's
really important to save amount of memory usage. There is a paper
that reports that duplication ratio of Android's memory content is
rather high [1]. And, there is similar work on zswap that also
reports that experiments have shown that around 10-15% of pages
stored in zswap are duplicates and deduplicating them provides some
benefits [2].

Also, there is a different kind of workload that uses zram as blockdev
and store build outputs into it to reduce wear-out problem of real
blockdev. In this workload, deduplication hit is very high due to
temporary files and intermediate object files. Detailed analysis is
on the bottom of this description.

Anyway, if we can detect duplicated content and avoid storing
duplicated content in different memory space, we can save memory.
This patch tries to do that.

Implementation is almost simple and intuitive but I should note
one thing about implementation detail.

To check duplication, this patch uses checksum of the page and
collision of this checksum could be possible. There would be
many choices to handle this situation but this patch chooses
to allow entry with duplicated checksum to be added to the hash,
but, not to compare all entries with duplicated checksum
when checking duplication. I guess that checksum collision is quite
rare event and we don't need to pay any attention to such a case.
Therefore, I decided on the simplest way to implement the feature.
If there is a different opinion, I can accept and go that way.

Following is the result of this patch.

Test result #1 (Swap):
Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18

orig_data_size: 145297408
compr_data_size: 32408125
mem_used_total: 32276480
dup_data_size: 3188134
meta_data_size: 1444272

Last two metrics added to mm_stat are related to this work.
The first one, dup_data_size, is the amount of memory saved by avoiding
storing duplicated pages. The latter one, meta_data_size, is the amount of
data structure to support deduplication. If dup > meta, we can judge
that the patch improves memory usage.

In Android, we can save 5% of memory usage by this work.

Test result #2 (Blockdev):
build the kernel and store output to ext4 FS on zram

<no-dedup>
Elapsed time: 249 s
mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0

<dedup>
Elapsed time: 250 s
mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038  3945792

There is no performance degradation and it saves 23% of memory.

Test result #3 (Blockdev):
copy android build output dir(out/host) to ext4 FS on zram

<no-dedup>
Elapsed time: out/host: 88 s
mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0

<dedup>
Elapsed time: out/host: 100 s
mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336

It shows roughly 13% performance degradation and saves 24% of memory. Maybe
it is due to the overhead of calculating the checksum and comparison.

Test result #4 (Blockdev):
copy android build output dir(out/target/common) to ext4 FS on zram

<no-dedup>
Elapsed time: out/host: 203 s
mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0

<dedup>
Elapsed time: out/host: 201 s
mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 24564336

Memory is saved by 42% and performance is the same. Even if there is overhead
of calculating checksum and comparison, large hit ratio compensate it since
hit leads to less compression attempt.

I checked the detailed reason of savings on kernel build workload and
there are some cases that deduplication happens.

1) *.cmd
Build commands are usually similar in one directory so the contents of these
files are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
respectively.

2) intermediate object files
built-in.o and temporary object files have similar contents. More than
50% of fs/ext4/ext4.o is the same as fs/ext4/built-in.o.

3) vmlinux
.tmp_vmlinux1 and .tmp_vmlinux2 and arch/x86/boot/compressed/vmlinux.bin
have similar contents.

The Android test has a similar case in that some object files (.class
and .so) are similar to other ones.
(./host/linux-x86/lib/libartd.so and
./host/linux-x86/lib/libartd-compiler.so)

Anyway, the benefit seems to be largely dependent on the workload so a
following patch will make this feature optional. However, this feature
can help some usecases so it deserves to be merged.

[1]: MemScope: Analyzing Memory Duplication on Android Systems,
dl.acm.org/citation.cfm?id=2797023
[2]: zswap: Optimize compressed pool memory utilization,
lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2

Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 Documentation/blockdev/zram.txt |   2 +
 drivers/block/zram/Makefile     |   2 +-
 drivers/block/zram/zram_dedup.c | 204 ++++++++++++++++++++++++++++++++++++++++
 drivers/block/zram/zram_dedup.h |  22 +++++
 drivers/block/zram/zram_drv.c   |  38 ++++++--
 drivers/block/zram/zram_drv.h   |  17 ++++
 6 files changed, 278 insertions(+), 7 deletions(-)
 create mode 100644 drivers/block/zram/zram_dedup.c
 create mode 100644 drivers/block/zram/zram_dedup.h

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a..2cdc303 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
  same_pages       the number of same element filled pages written to this disk.
                   No memory is allocated for such pages.
  pages_compacted  the number of pages freed during compaction
+ dup_data_size	  deduplicated data size
+ meta_data_size	  the amount of metadata allocated for deduplication feature
 
 9) Deactivate:
 	swapoff /dev/zram0
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 9e2b79e..29cb008 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,3 @@
-zram-y	:=	zcomp.o zram_drv.o
+zram-y	:=	zcomp.o zram_drv.o zram_dedup.o
 
 obj-$(CONFIG_ZRAM)	+=	zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
new file mode 100644
index 0000000..a8427f7
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2017 Joonsoo Kim.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#include <linux/highmem.h>
+
+#include "zram_drv.h"
+
+/* One slot will contain 128 pages theoretically */
+#define ZRAM_HASH_SHIFT		7
+#define ZRAM_HASH_SIZE_MIN	(1 << 10)
+#define ZRAM_HASH_SIZE_MAX	(1 << 31)
+
+u64 zram_dedup_dup_size(struct zram *zram)
+{
+	return (u64)atomic64_read(&zram->stats.dup_data_size);
+}
+
+u64 zram_dedup_meta_size(struct zram *zram)
+{
+	return (u64)atomic64_read(&zram->stats.meta_data_size);
+}
+
+static u32 zram_dedup_checksum(unsigned char *mem)
+{
+	return jhash(mem, PAGE_SIZE, 0);
+}
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+				u32 checksum)
+{
+	struct zram_hash *hash;
+	struct rb_root *rb_root;
+	struct rb_node **rb_node, *parent = NULL;
+	struct zram_entry *entry;
+
+	new->checksum = checksum;
+	hash = &zram->hash[checksum % zram->hash_size];
+	rb_root = &hash->rb_root;
+
+	spin_lock(&hash->lock);
+	rb_node = &rb_root->rb_node;
+	while (*rb_node) {
+		parent = *rb_node;
+		entry = rb_entry(parent, struct zram_entry, rb_node);
+		if (checksum < entry->checksum)
+			rb_node = &parent->rb_left;
+		else if (checksum > entry->checksum)
+			rb_node = &parent->rb_right;
+		else
+			rb_node = &parent->rb_left;
+	}
+
+	rb_link_node(&new->rb_node, parent, rb_node);
+	rb_insert_color(&new->rb_node, rb_root);
+	spin_unlock(&hash->lock);
+}
+
+static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
+				unsigned char *mem)
+{
+	bool match = false;
+	unsigned char *cmem;
+	struct zcomp_strm *zstrm;
+
+	cmem = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+	if (entry->len == PAGE_SIZE) {
+		match = !memcmp(mem, cmem, PAGE_SIZE);
+	} else {
+		zstrm = zcomp_stream_get(zram->comp);
+		if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+			match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+		zcomp_stream_put(zram->comp);
+	}
+	zs_unmap_object(zram->mem_pool, entry->handle);
+
+	return match;
+}
+
+static unsigned long zram_dedup_put(struct zram *zram,
+				struct zram_entry *entry)
+{
+	struct zram_hash *hash;
+	u32 checksum;
+
+	checksum = entry->checksum;
+	hash = &zram->hash[checksum % zram->hash_size];
+
+	spin_lock(&hash->lock);
+
+	entry->refcount--;
+	if (!entry->refcount)
+		rb_erase(&entry->rb_node, &hash->rb_root);
+	else
+		atomic64_sub(entry->len, &zram->stats.dup_data_size);
+
+	spin_unlock(&hash->lock);
+
+	return entry->refcount;
+}
+
+static struct zram_entry *zram_dedup_get(struct zram *zram,
+				unsigned char *mem, u32 checksum)
+{
+	struct zram_hash *hash;
+	struct zram_entry *entry;
+	struct rb_node *rb_node;
+
+	hash = &zram->hash[checksum % zram->hash_size];
+
+	spin_lock(&hash->lock);
+	rb_node = hash->rb_root.rb_node;
+	while (rb_node) {
+		entry = rb_entry(rb_node, struct zram_entry, rb_node);
+		if (checksum == entry->checksum) {
+			entry->refcount++;
+			atomic64_add(entry->len, &zram->stats.dup_data_size);
+			spin_unlock(&hash->lock);
+
+			if (zram_dedup_match(zram, entry, mem))
+				return entry;
+
+			zram_entry_free(zram, entry);
+
+			return NULL;
+		}
+
+		if (checksum < entry->checksum)
+			rb_node = rb_node->rb_left;
+		else
+			rb_node = rb_node->rb_right;
+	}
+	spin_unlock(&hash->lock);
+
+	return NULL;
+}
+
+struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
+				u32 *checksum)
+{
+	void *mem;
+	struct zram_entry *entry;
+
+	mem = kmap_atomic(page);
+	*checksum = zram_dedup_checksum(mem);
+
+	entry = zram_dedup_get(zram, mem, *checksum);
+	kunmap_atomic(mem);
+
+	return entry;
+}
+
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+				unsigned long handle, unsigned int len)
+{
+	entry->handle = handle;
+	entry->refcount = 1;
+	entry->len = len;
+}
+
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
+{
+	if (zram_dedup_put(zram, entry))
+		return false;
+
+	return true;
+}
+
+int zram_dedup_init(struct zram *zram, size_t num_pages)
+{
+	int i;
+	struct zram_hash *hash;
+
+	zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+	zram->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, zram->hash_size);
+	zram->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, zram->hash_size);
+	zram->hash = vzalloc(zram->hash_size * sizeof(struct zram_hash));
+	if (!zram->hash) {
+		pr_err("Error allocating zram entry hash\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < zram->hash_size; i++) {
+		hash = &zram->hash[i];
+		spin_lock_init(&hash->lock);
+		hash->rb_root = RB_ROOT;
+	}
+
+	return 0;
+}
+
+void zram_dedup_fini(struct zram *zram)
+{
+	vfree(zram->hash);
+	zram->hash = NULL;
+	zram->hash_size = 0;
+}
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
new file mode 100644
index 0000000..ebe6bff
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.h
@@ -0,0 +1,22 @@
+#ifndef _ZRAM_DEDUP_H_
+#define _ZRAM_DEDUP_H_
+
+struct zram;
+struct zram_entry;
+
+u64 zram_dedup_dup_size(struct zram *zram);
+u64 zram_dedup_meta_size(struct zram *zram);
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+				u32 checksum);
+struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
+				u32 *checksum);
+
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+				unsigned long handle, unsigned int len);
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);
+
+int zram_dedup_init(struct zram *zram, size_t num_pages);
+void zram_dedup_fini(struct zram *zram);
+
+#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 26dc4e5..8eab8a0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -389,14 +389,16 @@ static ssize_t mm_stat_show(struct device *dev,
 	max_used = atomic_long_read(&zram->stats.max_used_pages);
 
 	ret = scnprintf(buf, PAGE_SIZE,
-			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
 			orig_size << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.compr_data_size),
 			mem_used << PAGE_SHIFT,
 			zram->limit_pages << PAGE_SHIFT,
 			max_used << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.same_pages),
-			pool_stats.pages_compacted);
+			pool_stats.pages_compacted,
+			zram_dedup_dup_size(zram),
+			zram_dedup_meta_size(zram));
 	up_read(&zram->init_lock);
 
 	return ret;
@@ -481,26 +483,34 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
 					unsigned int len, gfp_t flags)
 {
 	struct zram_entry *entry;
+	unsigned long handle;
 
 	entry = kzalloc(sizeof(*entry),
 			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
 	if (!entry)
 		return NULL;
 
-	entry->handle = zs_malloc(zram->mem_pool, len, flags);
-	if (!entry->handle) {
+	handle = zs_malloc(zram->mem_pool, len, flags);
+	if (!handle) {
 		kfree(entry);
 		return NULL;
 	}
 
+	zram_dedup_init_entry(zram, entry, handle, len);
+	atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
 	return entry;
 }
 
-static inline void zram_entry_free(struct zram *zram,
-			struct zram_entry *entry)
+void zram_entry_free(struct zram *zram, struct zram_entry *entry)
 {
+	if (!zram_dedup_put_entry(zram, entry))
+		return;
+
 	zs_free(zram->mem_pool, entry->handle);
 	kfree(entry);
+
+	atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
 }
 
 static void zram_meta_free(struct zram *zram, u64 disksize)
@@ -513,6 +523,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
 		zram_free_page(zram, index);
 
 	zs_destroy_pool(zram->mem_pool);
+	zram_dedup_fini(zram);
 	vfree(zram->table);
 }
 
@@ -531,6 +542,12 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 		return false;
 	}
 
+	if (zram_dedup_init(zram, num_pages)) {
+		vfree(zram->table);
+		zs_destroy_pool(zram->mem_pool);
+		return false;
+	}
+
 	return true;
 }
 
@@ -715,10 +732,17 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 	void *src, *dst;
 	struct zcomp_strm *zstrm;
 	struct page *page = bvec->bv_page;
+	u32 checksum;
 
 	if (zram_same_page_write(zram, index, page))
 		return 0;
 
+	entry = zram_dedup_find(zram, page, &checksum);
+	if (entry) {
+		comp_len = entry->len;
+		goto found_dup;
+	}
+
 	zstrm = zcomp_stream_get(zram->comp);
 	ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
 	if (ret) {
@@ -737,7 +761,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 
 	zcomp_stream_put(zram->comp);
 	zs_unmap_object(zram->mem_pool, entry->handle);
+	zram_dedup_insert(zram, entry, checksum);
 
+found_dup:
 	/*
 	 * Free memory associated with this sector
 	 * before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index fe3d216..4b86921 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,8 +18,10 @@
 #include <linux/rwsem.h>
 #include <linux/zsmalloc.h>
 #include <linux/crypto.h>
+#include <linux/spinlock.h>
 
 #include "zcomp.h"
+#include "zram_dedup.h"
 
 /*-- Configurable parameters */
 
@@ -70,6 +72,10 @@ enum zram_pageflags {
 /*-- Data structures */
 
 struct zram_entry {
+	struct rb_node rb_node;
+	u32 len;
+	u32 checksum;
+	unsigned long refcount;
 	unsigned long handle;
 };
 
@@ -94,6 +100,13 @@ struct zram_stats {
 	atomic64_t pages_stored;	/* no. of pages currently stored */
 	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
 	atomic64_t writestall;		/* no. of write slow paths */
+	atomic64_t dup_data_size;	/* compressed size of pages duplicated */
+	atomic64_t meta_data_size;	/* size of zram_entries */
+};
+
+struct zram_hash {
+	spinlock_t lock;
+	struct rb_root rb_root;
 };
 
 struct zram {
@@ -101,6 +114,8 @@ struct zram {
 	struct zs_pool *mem_pool;
 	struct zcomp *comp;
 	struct gendisk *disk;
+	struct zram_hash *hash;
+	size_t hash_size;
 	/* Prevent concurrent execution of device init */
 	struct rw_semaphore init_lock;
 	/*
@@ -120,4 +135,6 @@ struct zram {
 	 */
 	bool claim; /* Protected by bdev->bd_mutex */
 };
+
+void zram_entry_free(struct zram *zram, struct zram_entry *entry);
 #endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v4 3/4] zram: make deduplication feature optional
  2017-04-26  0:52 [PATCH v4 0/4] zram: implement deduplication in zram js1304
  2017-04-26  0:52 ` [PATCH v4 1/4] zram: introduce zram_entry to prepare dedup functionality js1304
  2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
@ 2017-04-26  0:52 ` js1304
  2017-04-26  0:52 ` [PATCH v4 4/4] zram: compare all the entries with same checksum for deduplication js1304
  2017-04-27  7:49 ` [PATCH v4 0/4] zram: implement deduplication in zram Sergey Senozhatsky
  4 siblings, 0 replies; 25+ messages in thread
From: js1304 @ 2017-04-26  0:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, Sergey Senozhatsky, linux-kernel, kernel-team, Joonsoo Kim

From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Benefit of deduplication is dependent on the workload so it's not
preferable to always enable. Therefore, make it optional in Kconfig
and device param. Default is 'off'. This option will be beneficial
for users who use the zram as a blockdev and store build output to it.

Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 Documentation/ABI/testing/sysfs-block-zram | 10 ++++
 Documentation/blockdev/zram.txt            |  1 +
 drivers/block/zram/Kconfig                 | 14 ++++++
 drivers/block/zram/Makefile                |  3 +-
 drivers/block/zram/zram_dedup.c            | 15 ++++++
 drivers/block/zram/zram_dedup.h            | 23 +++++++++
 drivers/block/zram/zram_drv.c              | 81 ++++++++++++++++++++++++++----
 drivers/block/zram/zram_drv.h              |  9 ++++
 8 files changed, 144 insertions(+), 12 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d8..3c1945f 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,13 @@ Description:
 		device's debugging info useful for kernel developers. Its
 		format is not documented intentionally and may change
 		anytime without any notice.
+
+What:		/sys/block/zram<id>/use_dedup
+Date:		March 2017
+Contact:	Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Description:
+		The use_dedup file is read-write and specifies deduplication
+		feature is used or not. If enabled, duplicated data is
+		managed by reference count and will not be stored in memory
+		twice. Benefit of this feature largely depends on the workload
+		so keep attention when use.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 2cdc303..cbbe39b 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams  RW    the number of possible concurrent compress operations
 comp_algorithm    RW    show and change the compression algorithm
 compact           WO    trigger memory compaction
 debug_stat        RO    this file is used for zram debugging purposes
+use_dedup	  RW	show and set deduplication feature
 
 
 User space is advised to use the following files to read the device statistics.
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6..2f3dd1f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,17 @@ config ZRAM
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
+
+config ZRAM_DEDUP
+	bool "Deduplication support for ZRAM data"
+	depends on ZRAM
+	default n
+	help
+	  Deduplicate ZRAM data to reduce amount of memory consumption.
+	  Advantage largely depends on the workload. In some cases, this
+	  option reduces memory usage to the half. However, if there is no
+	  duplicated data, the amount of memory consumption would be
+	  increased due to additional metadata usage. And, there is
+	  computation time trade-off. Please check the benefit before
+	  enabling this option. Experiment shows the positive effect when
+	  the zram is used as blockdev and is used to store build output.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 29cb008..d7204ef 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,4 @@
-zram-y	:=	zcomp.o zram_drv.o zram_dedup.o
+zram-y				:=	zcomp.o zram_drv.o
+zram-$(CONFIG_ZRAM_DEDUP)	+=	zram_dedup.o
 
 obj-$(CONFIG_ZRAM)	+=	zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index a8427f7..560b1f5 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -41,6 +41,9 @@ void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
 	struct rb_node **rb_node, *parent = NULL;
 	struct zram_entry *entry;
 
+	if (!zram_dedup_enabled(zram))
+		return;
+
 	new->checksum = checksum;
 	hash = &zram->hash[checksum % zram->hash_size];
 	rb_root = &hash->rb_root;
@@ -148,6 +151,9 @@ struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
 	void *mem;
 	struct zram_entry *entry;
 
+	if (!zram_dedup_enabled(zram))
+		return NULL;
+
 	mem = kmap_atomic(page);
 	*checksum = zram_dedup_checksum(mem);
 
@@ -160,6 +166,9 @@ struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
 void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
 				unsigned long handle, unsigned int len)
 {
+	if (!zram_dedup_enabled(zram))
+		return;
+
 	entry->handle = handle;
 	entry->refcount = 1;
 	entry->len = len;
@@ -167,6 +176,9 @@ void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
 
 bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
 {
+	if (!zram_dedup_enabled(zram))
+		return true;
+
 	if (zram_dedup_put(zram, entry))
 		return false;
 
@@ -178,6 +190,9 @@ int zram_dedup_init(struct zram *zram, size_t num_pages)
 	int i;
 	struct zram_hash *hash;
 
+	if (!zram_dedup_enabled(zram))
+		return 0;
+
 	zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
 	zram->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, zram->hash_size);
 	zram->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, zram->hash_size);
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
index ebe6bff..8ab267b 100644
--- a/drivers/block/zram/zram_dedup.h
+++ b/drivers/block/zram/zram_dedup.h
@@ -4,6 +4,8 @@
 struct zram;
 struct zram_entry;
 
+#ifdef CONFIG_ZRAM_DEDUP
+
 u64 zram_dedup_dup_size(struct zram *zram);
 u64 zram_dedup_meta_size(struct zram *zram);
 
@@ -18,5 +20,26 @@ bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);
 
 int zram_dedup_init(struct zram *zram, size_t num_pages);
 void zram_dedup_fini(struct zram *zram);
+#else
+
+static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
+static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
+
+static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+			u32 checksum) { }
+static inline struct zram_entry *zram_dedup_find(struct zram *zram,
+			struct page *page, u32 *checksum) { return NULL; }
+
+static inline void zram_dedup_init_entry(struct zram *zram,
+			struct zram_entry *entry, unsigned long handle,
+			unsigned int len) { }
+static inline bool zram_dedup_put_entry(struct zram *zram,
+			struct zram_entry *entry) { return true; }
+
+static inline int zram_dedup_init(struct zram *zram,
+			size_t num_pages) { return 0; }
+static inline void zram_dedup_fini(struct zram *zram) { }
+
+#endif
 
 #endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8eab8a0..372602c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -333,6 +333,41 @@ static ssize_t comp_algorithm_store(struct device *dev,
 	return len;
 }
 
+static ssize_t use_dedup_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	bool val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	val = zram->use_dedup;
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", (int)val);
+}
+
+#ifdef CONFIG_ZRAM_DEDUP
+static ssize_t use_dedup_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int val;
+	struct zram *zram = dev_to_zram(dev);
+
+	if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1))
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		up_write(&zram->init_lock);
+		pr_info("Can't change dedup usage for initialized device\n");
+		return -EBUSY;
+	}
+	zram->use_dedup = val;
+	up_write(&zram->init_lock);
+	return len;
+}
+#endif
+
 static ssize_t compact_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
@@ -425,6 +460,15 @@ static DEVICE_ATTR_RO(io_stat);
 static DEVICE_ATTR_RO(mm_stat);
 static DEVICE_ATTR_RO(debug_stat);
 
+static unsigned long zram_entry_handle(struct zram *zram,
+		struct zram_entry *entry)
+{
+	if (zram_dedup_enabled(zram))
+		return entry->handle;
+	else
+		return (unsigned long)entry;
+}
+
 static void zram_slot_lock(struct zram *zram, u32 index)
 {
 	bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
@@ -485,14 +529,17 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
 	struct zram_entry *entry;
 	unsigned long handle;
 
-	entry = kzalloc(sizeof(*entry),
-			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
-	if (!entry)
+	handle = zs_malloc(zram->mem_pool, len, flags);
+	if (!handle)
 		return NULL;
 
-	handle = zs_malloc(zram->mem_pool, len, flags);
-	if (!handle) {
-		kfree(entry);
+	if (!zram_dedup_enabled(zram))
+		return (struct zram_entry *)handle;
+
+	entry = kzalloc(sizeof(*entry),
+			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+	if (!entry) {
+		zs_free(zram->mem_pool, handle);
 		return NULL;
 	}
 
@@ -507,7 +554,11 @@ void zram_entry_free(struct zram *zram, struct zram_entry *entry)
 	if (!zram_dedup_put_entry(zram, entry))
 		return;
 
-	zs_free(zram->mem_pool, entry->handle);
+	zs_free(zram->mem_pool, zram_entry_handle(zram, entry));
+
+	if (!zram_dedup_enabled(zram))
+		return;
+
 	kfree(entry);
 
 	atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
@@ -598,7 +649,8 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 	entry = zram_get_entry(zram, index);
 	size = zram_get_obj_size(zram, index);
 
-	src = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+	src = zs_map_object(zram->mem_pool,
+		zram_entry_handle(zram, entry), ZS_MM_RO);
 	if (size == PAGE_SIZE) {
 		dst = kmap_atomic(page);
 		memcpy(dst, src, PAGE_SIZE);
@@ -612,7 +664,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 		kunmap_atomic(dst);
 		zcomp_stream_put(zram->comp);
 	}
-	zs_unmap_object(zram->mem_pool, entry->handle);
+	zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry));
 	zram_slot_unlock(zram, index);
 
 	/* Should NEVER happen. Return bio error if it does. */
@@ -750,7 +802,8 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 		return ret;
 	}
 
-	dst = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_WO);
+	dst = zs_map_object(zram->mem_pool,
+		zram_entry_handle(zram, entry), ZS_MM_WO);
 
 	src = zstrm->buffer;
 	if (comp_len == PAGE_SIZE)
@@ -760,7 +813,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 		kunmap_atomic(src);
 
 	zcomp_stream_put(zram->comp);
-	zs_unmap_object(zram->mem_pool, entry->handle);
+	zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry));
 	zram_dedup_insert(zram, entry, checksum);
 
 found_dup:
@@ -1159,6 +1212,11 @@ static DEVICE_ATTR_WO(mem_limit);
 static DEVICE_ATTR_WO(mem_used_max);
 static DEVICE_ATTR_RW(max_comp_streams);
 static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_DEDUP
+static DEVICE_ATTR_RW(use_dedup);
+#else
+static DEVICE_ATTR_RO(use_dedup);
+#endif
 
 static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_disksize.attr,
@@ -1169,6 +1227,7 @@ static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_mem_used_max.attr,
 	&dev_attr_max_comp_streams.attr,
 	&dev_attr_comp_algorithm.attr,
+	&dev_attr_use_dedup.attr,
 	&dev_attr_io_stat.attr,
 	&dev_attr_mm_stat.attr,
 	&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4b86921..0091e23 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -134,7 +134,16 @@ struct zram {
 	 * zram is claimed so open request will be failed
 	 */
 	bool claim; /* Protected by bdev->bd_mutex */
+	bool use_dedup;
 };
 
+static inline bool zram_dedup_enabled(struct zram *zram)
+{
+#ifdef CONFIG_ZRAM_DEDUP
+	return zram->use_dedup;
+#else
+	return false;
+#endif
+}
 void zram_entry_free(struct zram *zram, struct zram_entry *entry);
 #endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v4 4/4] zram: compare all the entries with same checksum for deduplication
  2017-04-26  0:52 [PATCH v4 0/4] zram: implement deduplication in zram js1304
                   ` (2 preceding siblings ...)
  2017-04-26  0:52 ` [PATCH v4 3/4] zram: make deduplication feature optional js1304
@ 2017-04-26  0:52 ` js1304
  2017-04-27  7:49 ` [PATCH v4 0/4] zram: implement deduplication in zram Sergey Senozhatsky
  4 siblings, 0 replies; 25+ messages in thread
From: js1304 @ 2017-04-26  0:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, Sergey Senozhatsky, linux-kernel, kernel-team, Joonsoo Kim

From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Until now, we compare just one entry with the same checksum when
checking for duplication, since it is the simplest way to implement.
However, for completeness, checking all the entries is better,
so this patch implements comparing all the entries with the same checksum.
Since this event would be rare, there would be no performance loss.

Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 drivers/block/zram/zram_dedup.c | 59 ++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index 560b1f5..14c4988 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -109,6 +109,51 @@ static unsigned long zram_dedup_put(struct zram *zram,
 	return entry->refcount;
 }
 
+static struct zram_entry *__zram_dedup_get(struct zram *zram,
+				struct zram_hash *hash, unsigned char *mem,
+				struct zram_entry *entry)
+{
+	struct zram_entry *tmp, *prev = NULL;
+	struct rb_node *rb_node;
+
+	/* find left-most entry with same checksum */
+	while ((rb_node = rb_prev(&entry->rb_node))) {
+		tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+		if (tmp->checksum != entry->checksum)
+			break;
+
+		entry = tmp;
+	}
+
+again:
+	entry->refcount++;
+	atomic64_add(entry->len, &zram->stats.dup_data_size);
+	spin_unlock(&hash->lock);
+
+	if (prev)
+		zram_entry_free(zram, prev);
+
+	if (zram_dedup_match(zram, entry, mem))
+		return entry;
+
+	spin_lock(&hash->lock);
+	tmp = NULL;
+	rb_node = rb_next(&entry->rb_node);
+	if (rb_node)
+		tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+
+	if (tmp && (tmp->checksum == entry->checksum)) {
+		prev = entry;
+		entry = tmp;
+		goto again;
+	}
+
+	spin_unlock(&hash->lock);
+	zram_entry_free(zram, entry);
+
+	return NULL;
+}
+
 static struct zram_entry *zram_dedup_get(struct zram *zram,
 				unsigned char *mem, u32 checksum)
 {
@@ -122,18 +167,8 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
 	rb_node = hash->rb_root.rb_node;
 	while (rb_node) {
 		entry = rb_entry(rb_node, struct zram_entry, rb_node);
-		if (checksum == entry->checksum) {
-			entry->refcount++;
-			atomic64_add(entry->len, &zram->stats.dup_data_size);
-			spin_unlock(&hash->lock);
-
-			if (zram_dedup_match(zram, entry, mem))
-				return entry;
-
-			zram_entry_free(zram, entry);
-
-			return NULL;
-		}
+		if (checksum == entry->checksum)
+			return __zram_dedup_get(zram, hash, mem, entry);
 
 		if (checksum < entry->checksum)
 			rb_node = rb_node->rb_left;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
@ 2017-04-26  2:14   ` Sergey Senozhatsky
  2017-04-26  5:57     ` Joonsoo Kim
  2017-04-26  2:37   ` Sergey Senozhatsky
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  2:14 UTC (permalink / raw)
  To: js1304
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team, Joonsoo Kim

Hello,

On (04/26/17 09:52), js1304@gmail.com wrote:
[..]
>  	ret = scnprintf(buf, PAGE_SIZE,
> -			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> +			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
>  			orig_size << PAGE_SHIFT,
>  			(u64)atomic64_read(&zram->stats.compr_data_size),
>  			mem_used << PAGE_SHIFT,
>  			zram->limit_pages << PAGE_SHIFT,
>  			max_used << PAGE_SHIFT,
>  			(u64)atomic64_read(&zram->stats.same_pages),
> -			pool_stats.pages_compacted);
> +			pool_stats.pages_compacted,
> +			zram_dedup_dup_size(zram),
> +			zram_dedup_meta_size(zram));

hm... should't we subtract zram_dedup_dup_size(zram) from
->stats.compr_data_size? we don't use extra memory for dedupped
pages. or don't inc ->stats.compr_data_size for dedupped pages?

for instance, same_page_write() does not inc ->stats.compr_data_size,
while successful zram_dedup_find() does (in __zram_bvec_write()).

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
  2017-04-26  2:14   ` Sergey Senozhatsky
@ 2017-04-26  2:37   ` Sergey Senozhatsky
  2017-04-26  5:59     ` Joonsoo Kim
  2017-04-26  4:02   ` Sergey Senozhatsky
  2017-04-26  4:28   ` Sergey Senozhatsky
  3 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  2:37 UTC (permalink / raw)
  To: js1304
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team, Joonsoo Kim

On (04/26/17 09:52), js1304@gmail.com wrote:
[..]
>  struct zram_entry {
> +	struct rb_node rb_node;
> +	u32 len;
> +	u32 checksum;
> +	unsigned long refcount;

	use refcount_t? what do you think?

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
  2017-04-26  2:14   ` Sergey Senozhatsky
  2017-04-26  2:37   ` Sergey Senozhatsky
@ 2017-04-26  4:02   ` Sergey Senozhatsky
  2017-04-26  6:04     ` Joonsoo Kim
  2017-04-26  4:28   ` Sergey Senozhatsky
  3 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  4:02 UTC (permalink / raw)
  To: js1304
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team, Joonsoo Kim

On (04/26/17 09:52), js1304@gmail.com wrote:
[..]
> <no-dedup>
> Elapsed time: out/host: 88 s
> mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> 
> <dedup>
> Elapsed time: out/host: 100 s
> mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> 
> It shows performance degradation roughly 13% and save 24% memory. Maybe,
> it is due to overhead of calculating checksum and comparison.

I like the patch set, and it makes sense. the benefit is, obviously,
case-by-case. on my system I've managed to save just 60MB on a 2.7G
data set, which is far less than I was hoping to save :)


I usually do DIRECT IO fio performance test. JFYI, the results
were as follows:

                     NO DEDUP    DEDUP
#jobs1                         	                
WRITE:               (586MB/s)	 (235MB/s)     
WRITE:               (535MB/s)	 (226MB/s)     
READ:                (415MB/s)	 (189MB/s)     
WRITE:               (416MB/s)	 (190MB/s)     
READ:                (382MB/s)	 (375MB/s)     
READ:                (312MB/s)	 (314MB/s)     
READ:                (197MB/s)	 (166MB/s)     
WRITE:               (197MB/s)	 (166MB/s)     
#jobs2                         	                
WRITE:              (1039MB/s)	 (447MB/s)     
WRITE:               (642MB/s)	 (262MB/s)     
READ:                (654MB/s)	 (337MB/s)     
WRITE:               (654MB/s)	 (337MB/s)     
READ:               (1444MB/s)	 (1184MB/s)    
READ:               (1291MB/s)	 (1140MB/s)    
READ:                (451MB/s)	 (223MB/s)     
WRITE:               (451MB/s)	 (224MB/s)     
#jobs3                         	                
WRITE:              (1175MB/s)	 (648MB/s)     
WRITE:               (678MB/s)	 (277MB/s)     
READ:                (921MB/s)	 (509MB/s)     
WRITE:               (921MB/s)	 (509MB/s)     
READ:               (1322MB/s)	 (1146MB/s)    
READ:               (1642MB/s)	 (1266MB/s)    
READ:                (583MB/s)	 (248MB/s)     
WRITE:               (582MB/s)	 (248MB/s)     
#jobs4                         	                
WRITE:              (1734MB/s)	 (878MB/s)     
WRITE:               (712MB/s)	 (291MB/s)     
READ:               (1298MB/s)	 (695MB/s)     
WRITE:              (1300MB/s)	 (695MB/s)     
READ:               (1885MB/s)	 (2197MB/s)    
READ:               (2908MB/s)	 (2256MB/s)    
READ:                (621MB/s)	 (268MB/s)     
WRITE:               (622MB/s)	 (268MB/s)     
#jobs5                         	                
WRITE:              (2260MB/s)	 (1104MB/s)    
WRITE:               (740MB/s)	 (296MB/s)     
READ:               (1578MB/s)	 (900MB/s)     
WRITE:              (1579MB/s)	 (901MB/s)     
READ:               (2378MB/s)	 (1969MB/s)    
READ:               (2573MB/s)	 (1753MB/s)    
READ:                (650MB/s)	 (275MB/s)     
WRITE:               (650MB/s)	 (275MB/s)     
#jobs6                         	                
WRITE:              (2907MB/s)	 (1331MB/s)    
WRITE:               (773MB/s)	 (299MB/s)     
READ:               (1733MB/s)	 (1010MB/s)    
WRITE:              (1734MB/s)	 (1011MB/s)    
READ:               (3010MB/s)	 (2465MB/s)    
READ:               (2715MB/s)	 (1772MB/s)    
READ:                (672MB/s)	 (280MB/s)     
WRITE:               (672MB/s)	 (280MB/s)     
#jobs7                         	                
WRITE:              (3098MB/s)	 (1526MB/s)    
WRITE:               (779MB/s)	 (299MB/s)     
READ:               (1800MB/s)	 (1151MB/s)    
WRITE:              (1800MB/s)	 (1151MB/s)    
READ:               (3294MB/s)	 (2765MB/s)    
READ:               (2861MB/s)	 (2031MB/s)    
READ:                (693MB/s)	 (283MB/s)     
WRITE:               (693MB/s)	 (283MB/s)     
#jobs8                         	                
WRITE:              (3425MB/s)	 (1739MB/s)    
WRITE:               (792MB/s)	 (302MB/s)     
READ:               (2003MB/s)	 (1289MB/s)    
WRITE:              (2004MB/s)	 (1289MB/s)    
READ:               (3901MB/s)	 (3218MB/s)    
READ:               (2954MB/s)	 (2297MB/s)    
READ:                (714MB/s)	 (285MB/s)     
WRITE:               (714MB/s)	 (286MB/s)     
#jobs9                         	                
WRITE:              (3744MB/s)	 (1944MB/s)    
WRITE:               (794MB/s)	 (303MB/s)     
READ:               (2038MB/s)	 (1400MB/s)    
WRITE:              (2040MB/s)	 (1401MB/s)    
READ:               (4159MB/s)	 (3583MB/s)    
READ:               (2963MB/s)	 (2425MB/s)    
READ:                (700MB/s)	 (286MB/s)     
WRITE:               (700MB/s)	 (286MB/s)     
#jobs10                        	                
WRITE:              (3863MB/s)	 (2122MB/s)    
WRITE:               (780MB/s)	 (304MB/s)     
READ:               (2059MB/s)	 (1551MB/s)    
WRITE:              (2060MB/s)	 (1552MB/s)    
READ:               (4573MB/s)	 (4004MB/s)    
READ:               (3025MB/s)	 (2271MB/s)    
READ:                (706MB/s)	 (287MB/s)     
WRITE:               (706MB/s)	 (287MB/s)     

                                   NO DEDUP                      DEDUP
jobs1                              perfstat         	                          
stalled-cycles-frontend      86,881,019,525 (  45.68%)	   95,848,055,454 (  44.71%)
stalled-cycles-backend       41,714,207,854 (  21.93%)	   44,958,946,213 (  20.97%)
instructions                179,619,952,260 (    0.94)	  208,777,551,585 (    0.97)
branches                     40,332,931,254 ( 669.275)	   46,951,257,852 ( 680.888)
branch-misses                   211,977,694 (   0.53%)	      227,694,865 (   0.48%)
jobs2                              perfstat         	                          
stalled-cycles-frontend     126,409,074,631 (  49.54%)	  151,381,853,397 (  50.04%)
stalled-cycles-backend       63,694,911,449 (  24.96%)	   75,662,875,621 (  25.01%)
instructions                237,127,811,965 (    0.93)	  278,316,276,728 (    0.92)
branches                     51,822,635,738 ( 597.433)	   60,986,851,952 ( 590.588)
branch-misses                   315,755,016 (   0.61%)	      347,424,093 (   0.57%)
jobs3                              perfstat         	                          
stalled-cycles-frontend     191,089,532,462 (  51.36%)	  231,335,643,820 (  52.60%)
stalled-cycles-backend       97,686,908,504 (  26.26%)	  120,989,629,180 (  27.51%)
instructions                331,602,259,024 (    0.89)	  381,383,314,335 (    0.87)
branches                     72,970,225,738 ( 539.030)	   84,708,904,219 ( 536.393)
branch-misses                   507,931,782 (   0.70%)	      518,364,954 (   0.61%)
jobs4                              perfstat         	                          
stalled-cycles-frontend     223,445,892,011 (  52.52%)	  249,360,340,619 (  53.35%)
stalled-cycles-backend      114,943,710,906 (  27.02%)	  128,993,039,366 (  27.60%)
instructions                380,626,111,963 (    0.89)	  414,605,021,230 (    0.89)
branches                     82,445,041,251 ( 536.219)	   89,259,825,600 ( 535.932)
branch-misses                   583,497,374 (   0.71%)	      588,281,124 (   0.66%)
jobs5                              perfstat         	                          
stalled-cycles-frontend     263,619,571,807 (  52.85%)	  321,088,781,157 (  54.21%)
stalled-cycles-backend      134,179,697,076 (  26.90%)	  167,682,863,875 (  28.31%)
instructions                443,228,511,108 (    0.89)	  507,399,916,840 (    0.86)
branches                     95,466,677,594 ( 532.379)	  110,214,238,221 ( 523.279)
branch-misses                   672,890,143 (   0.70%)	      730,685,294 (   0.66%)
jobs6                              perfstat         	                          
stalled-cycles-frontend     291,216,196,229 (  53.06%)	  368,134,629,095 (  54.77%)
stalled-cycles-backend      148,091,553,338 (  26.98%)	  196,277,284,992 (  29.20%)
instructions                490,912,765,147 (    0.89)	  568,687,905,057 (    0.85)
branches                    105,042,602,744 ( 532.627)	  123,581,871,771 ( 515.815)
branch-misses                   714,275,025 (   0.68%)	      795,739,846 (   0.64%)
jobs7                              perfstat         	                          
stalled-cycles-frontend     330,653,507,547 (  53.29%)	  408,847,632,764 (  54.82%)
stalled-cycles-backend      173,091,173,960 (  27.90%)	  225,113,359,025 (  30.18%)
instructions                551,239,096,548 (    0.89)	  634,980,768,831 (    0.85)
branches                    117,955,044,019 ( 531.343)	  137,597,472,747 ( 516.418)
branch-misses                   803,168,054 (   0.68%)	      890,836,684 (   0.65%)
jobs8                              perfstat         	                          
stalled-cycles-frontend     356,648,221,633 (  53.60%)	  430,630,870,321 (  54.78%)
stalled-cycles-backend      181,355,532,439 (  27.26%)	  225,833,853,276 (  28.73%)
instructions                591,082,455,684 (    0.89)	  675,551,813,173 (    0.86)
branches                    125,997,333,678 ( 530.684)	  145,611,739,589 ( 519.054)
branch-misses                   823,592,583 (   0.65%)	      934,474,711 (   0.64%)
jobs9                              perfstat         	                          
stalled-cycles-frontend     390,346,479,419 (  53.77%)	  464,669,227,714 (  54.83%)
stalled-cycles-backend      194,450,562,714 (  26.79%)	  231,567,413,546 (  27.32%)
instructions                645,446,764,508 (    0.89)	  732,189,754,115 (    0.86)
branches                    137,219,009,187 ( 529.491)	  157,387,961,718 ( 520.710)
branch-misses                   898,414,437 (   0.65%)	      990,118,876 (   0.63%)
jobs10                             perfstat         	                          
stalled-cycles-frontend     424,813,625,460 (  54.03%)	  504,229,836,920 (  54.93%)
stalled-cycles-backend      216,498,718,346 (  27.54%)	  264,019,747,607 (  28.76%)
instructions                698,139,050,494 (    0.89)	  791,700,475,394 (    0.86)
branches                    148,117,364,023 ( 527.254)	  170,053,726,692 ( 520.519)
branch-misses                   962,807,047 (   0.65%)	    1,065,430,885 (   0.63%)


                       NO DEDUP         DEDUP
seconds elapsed        63.480344573	97.971439666
seconds elapsed        58.655823065	110.635292879
seconds elapsed        73.470677655	139.499601409
seconds elapsed        78.538327853	150.636241458
seconds elapsed        87.074751505	177.352050355
seconds elapsed        96.067528245	200.932336337
seconds elapsed        107.100552618	223.780631440
seconds elapsed        114.803603478	244.200975111
seconds elapsed        127.158686582	267.434811513
seconds elapsed        139.436204098	291.221754828

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
                     ` (2 preceding siblings ...)
  2017-04-26  4:02   ` Sergey Senozhatsky
@ 2017-04-26  4:28   ` Sergey Senozhatsky
  2017-04-26  6:08     ` Joonsoo Kim
  3 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  4:28 UTC (permalink / raw)
  To: js1304
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team, Joonsoo Kim

On (04/26/17 09:52), js1304@gmail.com wrote:
[..]
> +struct zram_hash {
> +	spinlock_t lock;
> +	struct rb_root rb_root;
>  };

just a note.

we can easily have N CPUs spinning on ->lock for __zram_dedup_get() lookup,
which can involve a potentially slow zcomp_decompress() [zlib, for example,
with 64k pages] and memcmp(). the larger PAGE_SHIFT is, the more serialized
IOs become. in theory, at least.

CPU0				CPU1		...	CPUN

__zram_bvec_write()	__zram_bvec_write()		__zram_bvec_write()
 zram_dedup_find()	 zram_dedup_find()		 zram_dedup_find()
  spin_lock(&hash->lock);
			  spin_lock(&hash->lock);	  spin_lock(&hash->lock);
   __zram_dedup_get()
    zcomp_decompress()
     ...


so may be there is a way to use read-write lock instead on spinlock for hash
and reduce write/read IO serialization.

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  2:14   ` Sergey Senozhatsky
@ 2017-04-26  5:57     ` Joonsoo Kim
  2017-04-26  6:29       ` Minchan Kim
  0 siblings, 1 reply; 25+ messages in thread
From: Joonsoo Kim @ 2017-04-26  5:57 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team

On Wed, Apr 26, 2017 at 11:14:52AM +0900, Sergey Senozhatsky wrote:
> Hello,
> 
> On (04/26/17 09:52), js1304@gmail.com wrote:
> [..]
> >  	ret = scnprintf(buf, PAGE_SIZE,
> > -			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> > +			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
> >  			orig_size << PAGE_SHIFT,
> >  			(u64)atomic64_read(&zram->stats.compr_data_size),
> >  			mem_used << PAGE_SHIFT,
> >  			zram->limit_pages << PAGE_SHIFT,
> >  			max_used << PAGE_SHIFT,
> >  			(u64)atomic64_read(&zram->stats.same_pages),
> > -			pool_stats.pages_compacted);
> > +			pool_stats.pages_compacted,
> > +			zram_dedup_dup_size(zram),
> > +			zram_dedup_meta_size(zram));
> 
> hm... should't we subtract zram_dedup_dup_size(zram) from
> ->stats.compr_data_size? we don't use extra memory for dedupped
> pages. or don't inc ->stats.compr_data_size for dedupped pages?

Hmm... My intention is to keep previous stat as much as possible. User
can just notice the saving by only checking mem_used.

However, it's also odd that compr_data_size doesn't show actual
compressed data size.

Minchan, what do you think about it?

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  2:37   ` Sergey Senozhatsky
@ 2017-04-26  5:59     ` Joonsoo Kim
  2017-04-26  6:04       ` Sergey Senozhatsky
  0 siblings, 1 reply; 25+ messages in thread
From: Joonsoo Kim @ 2017-04-26  5:59 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team

On Wed, Apr 26, 2017 at 11:37:18AM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 09:52), js1304@gmail.com wrote:
> [..]
> >  struct zram_entry {
> > +	struct rb_node rb_node;
> > +	u32 len;
> > +	u32 checksum;
> > +	unsigned long refcount;
> 
> 	use refcount_t? what do you think?

We don't need atomic operation for refcount but refcount_t does.
API of refcount_t provides additional guarantee that refcount will not
overflow but I'm not sure that this overhead is needed here.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  4:02   ` Sergey Senozhatsky
@ 2017-04-26  6:04     ` Joonsoo Kim
  2017-04-26  6:21       ` Sergey Senozhatsky
  0 siblings, 1 reply; 25+ messages in thread
From: Joonsoo Kim @ 2017-04-26  6:04 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team

On Wed, Apr 26, 2017 at 01:02:43PM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 09:52), js1304@gmail.com wrote:
> [..]
> > <no-dedup>
> > Elapsed time: out/host: 88 s
> > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> > 
> > <dedup>
> > Elapsed time: out/host: 100 s
> > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> > 
> > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > it is due to overhead of calculating checksum and comparison.
> 
> I like the patch set, and it makes sense. the benefit is, obviously,
> case-by-case. on my system I've managed to save just 60MB on a 2.7G
> data set, which is far less than I was hoping to save :)
> 
> 
> I usually do DIRECT IO fio performance test. JFYI, the results
> were as follows:

Could you share your fio test setting? I will try to re-generate the
result and analyze it.

I guess that contention happens due to same data page. Could you check
it?

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  5:59     ` Joonsoo Kim
@ 2017-04-26  6:04       ` Sergey Senozhatsky
  0 siblings, 0 replies; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  6:04 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim,
	Sergey Senozhatsky, linux-kernel, kernel-team

On (04/26/17 14:59), Joonsoo Kim wrote:
> On Wed, Apr 26, 2017 at 11:37:18AM +0900, Sergey Senozhatsky wrote:
> > On (04/26/17 09:52), js1304@gmail.com wrote:
> > [..]
> > >  struct zram_entry {
> > > +	struct rb_node rb_node;
> > > +	u32 len;
> > > +	u32 checksum;
> > > +	unsigned long refcount;
> > 
> > 	use refcount_t? what do you think?
> 
> We don't need atomic operation for refcount but refcount_t does.
> API of refcount_t provides additional guarantee that refcount will not
> overflow but I'm not sure that this overhead is needed here.

refcount_t has some additional debugging features which probably
could be helpful one day, but not necessarily needed in current
zram_dedup implementation, agree. asked just in case.

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  4:28   ` Sergey Senozhatsky
@ 2017-04-26  6:08     ` Joonsoo Kim
  2017-04-26  6:14       ` Sergey Senozhatsky
  0 siblings, 1 reply; 25+ messages in thread
From: Joonsoo Kim @ 2017-04-26  6:08 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team

On Wed, Apr 26, 2017 at 01:28:26PM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 09:52), js1304@gmail.com wrote:
> [..]
> > +struct zram_hash {
> > +	spinlock_t lock;
> > +	struct rb_root rb_root;
> >  };
> 
> just a note.
> 
> we can easily have N CPUs spinning on ->lock for __zram_dedup_get() lookup,
> which can invole a potentially slow zcomp_decompress() [zlib, for example,
> with 64k pages] and memcmp(). the larger PAGE_SHIFT is, the more serialized
> IOs become. in theory, at least.
> 
> CPU0				CPU1		...	CPUN
> 
> __zram_bvec_write()	__zram_bvec_write()		__zram_bvec_write()
>  zram_dedup_find()	 zram_dedup_find()		 zram_dedup_find()
>   spin_lock(&hash->lock);
> 			  spin_lock(&hash->lock);	  spin_lock(&hash->lock);
>    __zram_dedup_get()
>     zcomp_decompress()
>      ...
> 
> 
> so maybe there is a way to use read-write lock instead of spinlock for hash
> and reduce write/read IO serialization.

In fact, dedup releases hash->lock before doing zcomp_decompress(). So,
above contention cannot happen.

However, contention still possible when traversing the rb_tree. If
your fio shows that contention, I will change it to read-write lock.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  6:08     ` Joonsoo Kim
@ 2017-04-26  6:14       ` Sergey Senozhatsky
  0 siblings, 0 replies; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  6:14 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim,
	Sergey Senozhatsky, linux-kernel, kernel-team

On (04/26/17 15:08), Joonsoo Kim wrote:
> > > +struct zram_hash {
> > > +	spinlock_t lock;
> > > +	struct rb_root rb_root;
> > >  };
> > 
> > just a note.
> > 
> > we can easily have N CPUs spinning on ->lock for __zram_dedup_get() lookup,
> > which can involve a potentially slow zcomp_decompress() [zlib, for example,
> > with 64k pages] and memcmp(). the larger PAGE_SHIFT is, the more serialized
> > IOs become. in theory, at least.
> > 
> > CPU0				CPU1		...	CPUN
> > 
> > __zram_bvec_write()	__zram_bvec_write()		__zram_bvec_write()
> >  zram_dedup_find()	 zram_dedup_find()		 zram_dedup_find()
> >   spin_lock(&hash->lock);
> > 			  spin_lock(&hash->lock);	  spin_lock(&hash->lock);
> >    __zram_dedup_get()
> >     zcomp_decompress()
> >      ...
> > 
> > 
> > so maybe there is a way to use read-write lock instead of spinlock for hash
> > and reduce write/read IO serialization.
> 
> In fact, dedup release hash->lock before doing zcomp_decompress(). So,
> above contention cannot happen.

oh, my bad. you are right. somehow I didn't spot the unlock.

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  6:04     ` Joonsoo Kim
@ 2017-04-26  6:21       ` Sergey Senozhatsky
  2017-04-27  6:57         ` Joonsoo Kim
  0 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  6:21 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim,
	Sergey Senozhatsky, linux-kernel, kernel-team

On (04/26/17 15:04), Joonsoo Kim wrote:
> On Wed, Apr 26, 2017 at 01:02:43PM +0900, Sergey Senozhatsky wrote:
> > On (04/26/17 09:52), js1304@gmail.com wrote:
> > [..]
> > > <no-dedup>
> > > Elapsed time: out/host: 88 s
> > > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> > > 
> > > <dedup>
> > > Elapsed time: out/host: 100 s
> > > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> > > 
> > > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > > it is due to overhead of calculating checksum and comparison.
> > 
> > I like the patch set, and it makes sense. the benefit is, obviously,
> > case-by-case. on my system I've managed to save just 60MB on a 2.7G
> > data set, which is far less than I was hoping to save :)
> > 
> > 
> > I usually do DIRECT IO fio performance test. JFYI, the results
> > were as follows:
> 
> Could you share your fio test setting? I will try to re-generate the
> result and analyze it.

sure.

I think I used this one: https://github.com/sergey-senozhatsky/zram-perf-test

// hm... may be slightly modified on my box.

I'll run more tests.



what I did:

#0
ZRAM_SIZE=2G ZRAM_COMP_ALG=lzo LOG_SUFFIX=NO-DEDUP FIO_LOOPS=2 ./zram-fio-test.sh

#1
add `echo 1 > /sys/block/zram0/use_dedup` to create_zram
ZRAM_SIZE=2G ZRAM_COMP_ALG=lzo LOG_SUFFIX=DEDUP FIO_LOOPS=2 ./zram-fio-test.sh


both in ./conf/fio-template-static-buffer fio config.

and then

#2
./fio-perf-o-meter.sh /tmp/test-fio-zram-NO-DEDUP /tmp/test-fio-zram-DEDUP > /tmp/RES

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  5:57     ` Joonsoo Kim
@ 2017-04-26  6:29       ` Minchan Kim
  2017-04-26  6:59         ` Joonsoo Kim
  2017-04-26  7:12         ` Sergey Senozhatsky
  0 siblings, 2 replies; 25+ messages in thread
From: Minchan Kim @ 2017-04-26  6:29 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Sergey Senozhatsky, Andrew Morton, Sergey Senozhatsky,
	linux-kernel, kernel-team

Hi Sergey and Joonsoo,

On Wed, Apr 26, 2017 at 02:57:03PM +0900, Joonsoo Kim wrote:
> On Wed, Apr 26, 2017 at 11:14:52AM +0900, Sergey Senozhatsky wrote:
> > Hello,
> > 
> > On (04/26/17 09:52), js1304@gmail.com wrote:
> > [..]
> > >  	ret = scnprintf(buf, PAGE_SIZE,
> > > -			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> > > +			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
> > >  			orig_size << PAGE_SHIFT,
> > >  			(u64)atomic64_read(&zram->stats.compr_data_size),
> > >  			mem_used << PAGE_SHIFT,
> > >  			zram->limit_pages << PAGE_SHIFT,
> > >  			max_used << PAGE_SHIFT,
> > >  			(u64)atomic64_read(&zram->stats.same_pages),
> > > -			pool_stats.pages_compacted);
> > > +			pool_stats.pages_compacted,
> > > +			zram_dedup_dup_size(zram),
> > > +			zram_dedup_meta_size(zram));
> > 
> > hm... shouldn't we subtract zram_dedup_dup_size(zram) from
> > ->stats.compr_data_size? we don't use extra memory for dedupped
> > pages. or don't inc ->stats.compr_data_size for dedupped pages?
> 
> Hmm... My intention is to keep previous stat as much as possible. User
> can just notice the saving by only checking mem_used.
> 
> However, it's also odd that compr_data_size doesn't show actual
> compressed data size.

Actually, I found it for the last review cycle but didn't say that
intentionally. Because it is also odd to me that pages_stored isn't
increased for same_pages so I thought we can fix it all.

I mean:

* normal page
        inc pages_stored
        inc compr_data_size
* same_page
        inc pages_stored
        inc same_pages
* dedup_page
        inc pages_stored
        inc dup_data_size
         
IOW, pages_stored should be increased for every write IO.
But the concern is we have said in zram.txt

 orig_data_size   uncompressed size of data stored in this disk.
                  This excludes same-element-filled pages (same_pages) since
                  no memory is allocated for them.

So, we might be too late. :-(
What do you think about it?
If anyone doesn't have any objection, I want to correct it all.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  6:29       ` Minchan Kim
@ 2017-04-26  6:59         ` Joonsoo Kim
  2017-04-26  7:21           ` Sergey Senozhatsky
  2017-04-26  7:12         ` Sergey Senozhatsky
  1 sibling, 1 reply; 25+ messages in thread
From: Joonsoo Kim @ 2017-04-26  6:59 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Andrew Morton, Sergey Senozhatsky, LKML, kernel-team

2017-04-26 15:29 GMT+09:00 Minchan Kim <minchan@kernel.org>:
> Hi Sergey and Joonsoo,
>
> On Wed, Apr 26, 2017 at 02:57:03PM +0900, Joonsoo Kim wrote:
>> On Wed, Apr 26, 2017 at 11:14:52AM +0900, Sergey Senozhatsky wrote:
>> > Hello,
>> >
>> > On (04/26/17 09:52), js1304@gmail.com wrote:
>> > [..]
>> > >   ret = scnprintf(buf, PAGE_SIZE,
>> > > -                 "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
>> > > +                 "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
>> > >                   orig_size << PAGE_SHIFT,
>> > >                   (u64)atomic64_read(&zram->stats.compr_data_size),
>> > >                   mem_used << PAGE_SHIFT,
>> > >                   zram->limit_pages << PAGE_SHIFT,
>> > >                   max_used << PAGE_SHIFT,
>> > >                   (u64)atomic64_read(&zram->stats.same_pages),
>> > > -                 pool_stats.pages_compacted);
>> > > +                 pool_stats.pages_compacted,
>> > > +                 zram_dedup_dup_size(zram),
>> > > +                 zram_dedup_meta_size(zram));
>> >
>> > hm... shouldn't we subtract zram_dedup_dup_size(zram) from
>> > ->stats.compr_data_size? we don't use extra memory for dedupped
>> > pages. or don't inc ->stats.compr_data_size for dedupped pages?
>>
>> Hmm... My intention is to keep previous stat as much as possible. User
>> can just notice the saving by only checking mem_used.
>>
>> However, it's also odd that compr_data_size doesn't show actual
>> compressed data size.
>
> Actually, I found it for the last review cycle but didn't say that
> intentionally. Because it is also odd to me that pages_stored isn't
> increased for same_pages so I thought we can fix it all.
>
> I mean:
>
> * normal page
>         inc pages_stored
>         inc compr_data_size
> * same_page
>         inc pages_stored
>         inc same_pages
> * dedup_page
>         inc pages_stored
>         inc dup_data_size
>
> IOW, pages_stored should be increased for every write IO.
> But the concern is we have said in zram.txt
>
>  orig_data_size   uncompressed size of data stored in this disk.
>                   This excludes same-element-filled pages (same_pages) since
>                   no memory is allocated for them.
>
> So, we might be too late. :-(
> What do you think about it?
> If anyone doesn't have any objection, I want to correct it all.

I have no objection.
If so, do I need to postpone this patchset until others are fixed?

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  6:29       ` Minchan Kim
  2017-04-26  6:59         ` Joonsoo Kim
@ 2017-04-26  7:12         ` Sergey Senozhatsky
  1 sibling, 0 replies; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  7:12 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Joonsoo Kim, Sergey Senozhatsky, Andrew Morton,
	Sergey Senozhatsky, linux-kernel, kernel-team

On (04/26/17 15:29), Minchan Kim wrote:
[..]
> Actually, I found it for the last review cycle but didn't say that
> intentionally. Because it is also odd to me that pages_stored isn't
> increased for same_pages so I thought we can fix it all.
> 
> I mean:
> 
> * normal page
>         inc pages_stored
>         inc compr_data_size
> * same_page
>         inc pages_stored
>         inc same_pages
> * dedup_page
>         inc pages_stored
>         inc dup_data_size
>
> IOW, pages_stored should be increased for every write IO.

looks good to me.

> But the concern is we have said in zram.txt
> 
>  orig_data_size   uncompressed size of data stored in this disk.
>                   This excludes same-element-filled pages (same_pages) since
>                   no memory is allocated for them.
> 
> So, we might be too late. :-(
> What do you think about it?

personally, I don't think anyone cares :) the numbers are different
(and, apparently, not quite accurate) all the time. my point was that
stats were not represented in a very convenient way and I agree with
the proposed change.

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  6:59         ` Joonsoo Kim
@ 2017-04-26  7:21           ` Sergey Senozhatsky
  2017-04-26  7:39             ` Minchan Kim
  0 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-26  7:21 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Minchan Kim, Sergey Senozhatsky, Andrew Morton,
	Sergey Senozhatsky, LKML, kernel-team

On (04/26/17 15:59), Joonsoo Kim wrote:
[..]
> > Actually, I found it for the last review cycle but didn't say that
> > intentionally. Because it is also odd to me that pages_stored isn't
> > increased for same_pages so I thought we can fix it all.
> >
> > I mean:
> >
> > * normal page
> >         inc pages_stored
> >         inc compr_data_size
> > * same_page
> >         inc pages_stored
> >         inc same_pages
> > * dedup_page
> >         inc pages_stored
> >         inc dup_data_size
> >
> > IOW, pages_stored should be increased for every write IO.
> > But the concern is we have said in zram.txt
> >
> >  orig_data_size   uncompressed size of data stored in this disk.
> >                   This excludes same-element-filled pages (same_pages) since
> >                   no memory is allocated for them.
> >
> > So, we might be too late. :-(
> > What do you think about it?
> > If anyone doesn't have any objection, I want to correct it all.
> 
> I have no objection.
> If so, do I need to postpone this patchset until others are fixed?

this probably will mess with your series a lot. so I don't mind if you or
Minchan will send stats-fixup patch after the dedup series. may be/preferably
as the last patch in the series. but if you or Minchan want to fix stats
first, then I wouldn't mind either. I just don't make a big deal out of those
stats, a bunch of fun to know numbers. my 5cents.

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  7:21           ` Sergey Senozhatsky
@ 2017-04-26  7:39             ` Minchan Kim
  0 siblings, 0 replies; 25+ messages in thread
From: Minchan Kim @ 2017-04-26  7:39 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Joonsoo Kim, Andrew Morton, Sergey Senozhatsky, LKML, kernel-team

On Wed, Apr 26, 2017 at 04:21:47PM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 15:59), Joonsoo Kim wrote:
> [..]
> > > Actually, I found it for the last review cycle but didn't say that
> > > intentionally. Because it is also odd to me that pages_stored isn't
> > > increased for same_pages so I thought we can fix it all.
> > >
> > > I mean:
> > >
> > > * normal page
> > >         inc pages_stored
> > >         inc compr_data_size
> > > * same_page
> > >         inc pages_stored
> > >         inc same_pages
> > > * dedup_page
> > >         inc pages_stored
> > >         inc dup_data_size
> > >
> > > IOW, pages_stored should be increased for every write IO.
> > > But the concern is we have said in zram.txt
> > >
> > >  orig_data_size   uncompressed size of data stored in this disk.
> > >                   This excludes same-element-filled pages (same_pages) since
> > >                   no memory is allocated for them.
> > >
> > > So, we might be too late. :-(
> > > What do you think about it?
> > > If anyone doesn't have any objection, I want to correct it all.
> > 
> > I have no objection.
> > If so, do I need to postpone this patchset until others are fixed?
> 
> this probably will mess with your series a lot. so I don't mind if you or
> Minchan will send stats-fixup patch after the dedup series. may be/preferably
> as the last patch in the series. but if you or Minchan want to fix stats
> first, then I wouldn't mind either. I just don't make a big deal out of those
> stats, a bunch of fun to know numbers. my 5cents.

After Andrew takes dedup patchset, I will fix it later.
Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-26  6:21       ` Sergey Senozhatsky
@ 2017-04-27  6:57         ` Joonsoo Kim
  2017-04-27  7:46           ` Sergey Senozhatsky
  0 siblings, 1 reply; 25+ messages in thread
From: Joonsoo Kim @ 2017-04-27  6:57 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team

On Wed, Apr 26, 2017 at 03:21:04PM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 15:04), Joonsoo Kim wrote:
> > On Wed, Apr 26, 2017 at 01:02:43PM +0900, Sergey Senozhatsky wrote:
> > > On (04/26/17 09:52), js1304@gmail.com wrote:
> > > [..]
> > > > <no-dedup>
> > > > Elapsed time: out/host: 88 s
> > > > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> > > > 
> > > > <dedup>
> > > > Elapsed time: out/host: 100 s
> > > > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> > > > 
> > > > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > > > it is due to overhead of calculating checksum and comparison.
> > > 
> > > I like the patch set, and it makes sense. the benefit is, obviously,
> > > case-by-case. on my system I've managed to save just 60MB on a 2.7G
> > > data set, which is far less than I was hoping to save :)
> > > 
> > > 
> > > I usually do DIRECT IO fio performance test. JFYI, the results
> > > were as follows:
> > 
> > Could you share your fio test setting? I will try to re-generate the
> > result and analyze it.
> 
> sure.
> 
> I think I used this one: https://github.com/sergey-senozhatsky/zram-perf-test
> 
> // hm... may be slightly modified on my box.
> 
> I'll run more tests.
> 
> 

Hello,

I tested with your benchmark and found that contention happens
since the data page is perfectly the same. All the written data (2GB)
is de-duplicated.

I tried to optimize it with read-write lock but I failed since
there is another contention, which cannot be fixed simply. That is
zsmalloc. We need to map the object and compare the content of the
compressed page to check de-duplication. Zsmalloc pins the object
by using bit spinlock when mapping. So, parallel readers to the same
object contend here.

I think that this case is so artificial and, in practice, there
would be no case that the same data page is repeatedly and parallel
written as like this. So, I'd like to keep current code. How do you
think about it, Sergey?

Just note, if we do parallel read (direct-io) to the same offset,
zsmalloc contention would happen regardless of the deduplication feature.
It seems that it's fundamental issue in zsmalloc.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-27  6:57         ` Joonsoo Kim
@ 2017-04-27  7:46           ` Sergey Senozhatsky
  2017-05-02  5:31             ` Joonsoo Kim
  0 siblings, 1 reply; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-27  7:46 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim,
	Sergey Senozhatsky, linux-kernel, kernel-team

Hello,

On (04/27/17 15:57), Joonsoo Kim wrote:
[..]
> I tested with your benchmark and found that contention happens
> since the data page is perfectly the same. All the written data (2GB)
> is de-duplicated.

yes, a statically filled buffer to guarantee that
compression/decompression numbers/impact will be stable.
otherwise the test results are "apples vs oranges" :)

> I tried to optimize it with read-write lock but I failed since
> there is another contention, which cannot be fixed simply. That is
> zsmalloc. We need to map the object and compare the content of the
> compressed page to check de-duplication. Zsmalloc pins the object
> by using bit spinlock when mapping. So, parallel readers to the same
> object contend here.
>
> I think that this case is so artificial and, in practice, there
> would be no case that the same data page is repeatedly and parallel
> written as like this. So, I'd like to keep current code. How do you
> think about it, Sergey?

I agree. thanks for taking a look!
I see no blockers for the patch set.


<off topic>

ok, in general, seems that (correct me if I'm wrong)

  a) the higher the dedup ratio the slower zram _can_ perform.

because dedup can create parallel access scenarios where they previously
never existed: different offset writes now can compete for the same dedupped
zsmalloc object.

and... tricky and probably over exaggerated

  b) the lower the dedup ratio the slower zram _can_ perform.

think of almost full zram device with dedup ratio of just 3-5%. tree lookups
are serialized by the hash->lock. a balanced tree gives us slow lookup
complexity growth; it's still there but we can live with it. at the same time
low dedup ratio means that we have wasted CPU cycles on checksum calculation
(potentially for millions of pages if zram device in question is X gigabytes
in size), this can't go unnoticed.


it's just I was slightly confused by the performance numbers that you
have observed. some tests were

: It shows performance degradation roughly 13% and save 24% memory. Maybe,
: it is due to overhead of calculating checksum and comparison.

while others were

: There is no performance degradation and save 23% memory.


I understand that you didn't perform direct io, flush, fsync, etc. and
there is a whole bunch of factors that could have affected your tests,
e.g. write back, etc. etc. but the numbers are still very unstable.
may be now we will have a bit better understanding :)

</off topic>

> Just note, if we do parallel read (direct-io) to the same offset,
> zsmalloc contention would happen regardless deduplication feature.
> It seems that it's fundamental issue in zsmalloc.

that's a good find.

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 0/4] zram: implement deduplication in zram
  2017-04-26  0:52 [PATCH v4 0/4] zram: implement deduplication in zram js1304
                   ` (3 preceding siblings ...)
  2017-04-26  0:52 ` [PATCH v4 4/4] zram: compare all the entries with same checksum for deduplication js1304
@ 2017-04-27  7:49 ` Sergey Senozhatsky
  4 siblings, 0 replies; 25+ messages in thread
From: Sergey Senozhatsky @ 2017-04-27  7:49 UTC (permalink / raw)
  To: js1304
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team, Joonsoo Kim

On (04/26/17 09:52), js1304@gmail.com wrote:
> From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 
> Changes from v3
> o fix module build problem
> o make zram_dedup_enabled() statically return false if CONFIG_ZRAM_DEDUP=n

the entire patch set

Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>

	-ss

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4 2/4] zram: implement deduplication in zram
  2017-04-27  7:46           ` Sergey Senozhatsky
@ 2017-05-02  5:31             ` Joonsoo Kim
  0 siblings, 0 replies; 25+ messages in thread
From: Joonsoo Kim @ 2017-05-02  5:31 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Sergey Senozhatsky, linux-kernel,
	kernel-team

On Thu, Apr 27, 2017 at 04:46:22PM +0900, Sergey Senozhatsky wrote:
> Hello,
> 
> On (04/27/17 15:57), Joonsoo Kim wrote:
> [..]
> > I tested with your benchmark and found that contention happens
> > since the data page is perfectly the same. All the written data (2GB)
> > is de-duplicated.
> 
> yes, a statically filled buffer to guarantee that
> compression/decompression numbers/impact will be stable.
> otherwise the test results are "apples vs oranges" :)

Yes, but, we can maintain buffer set and using it on the test will
ensure an "apples vs apples" test and will be a less artificial test.
Compression algorithm's effect also cannot be measured by a statically
filled buffer.

> 
> > I tried to optimize it with read-write lock but I failed since
> > there is another contention, which cannot be fixed simply. That is
> > zsmalloc. We need to map the object and compare the content of the
> > compressed page to check de-duplication. Zsmalloc pins the object
> > by using bit spinlock when mapping. So, parallel readers to the same
> > object contend here.
> >
> > I think that this case is so artificial and, in practice, there
> > would be no case that the same data page is repeatedly and parallel
> > written as like this. So, I'd like to keep current code. How do you
> > think about it, Sergey?
> 
> I agree. thanks for taking a look!
> I see no blockers for the patch set.

Thanks!

> 
> 
> <off topic>
> 
> ok, in general, seems that (correct me if I'm wrong)
> 
>   a) the higher the dedup ratio the slower zram _can_ perform.
> 
> because dedup can create parallel access scenarios where they previously
> never existed: different offset writes now can compete for the same dedupped
> zsmalloc object.

However, the higher the dedup ratio doesn't necessarily mean that all
the data is the same. Important thing is the distribution of the data
rather than dedup ratio. And, parallelism is also important.

> and... tricky and probably over exaggerated
> 
>   b) the lower the dedup ratio the slower zram _can_ perform.
> 
> think of almost full zram device with dedup ratio of just 3-5%. tree lookups
> are serialized by the hash->lock. a balanced tree gives us slow lookup
> complexity growth, it's still there but can leave with it. at the same time
> low dedup ratio means that we have wasted CPU cycles on checksum calculation
> (potentially for millions of pages if zram device in question is X gigabytes
> in size), this can't go unnoticed.
> 
> 
> it's just I was slightly confused by the performance numbers that you
> have observed. some tests were
> 
> : It shows performance degradation roughly 13% and save 24% memory. Maybe,
> : it is due to overhead of calculating checksum and comparison.
> 
> while others were
> 
> : There is no performance degradation and save 23% memory.

As I said above, dedup ratio itself doesn't say everything. And, they are
quite different tests (kernel build vs file copy) (multi thread vs
single thread) so I cannot say anything by the fact that their saving
ratio is similar. :)

> 
> I understand that you didn't perform direct io, flush, fsync, etc. and
> there is a whole bunch of factors that could have affected your tests,
> e.g. write back, etc. etc. but the numbers are still very unstable.
> may be now we will have a bit better understanding :)

I also hope so.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2017-05-02  5:31 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-26  0:52 [PATCH v4 0/4] zram: implement deduplication in zram js1304
2017-04-26  0:52 ` [PATCH v4 1/4] zram: introduce zram_entry to prepare dedup functionality js1304
2017-04-26  0:52 ` [PATCH v4 2/4] zram: implement deduplication in zram js1304
2017-04-26  2:14   ` Sergey Senozhatsky
2017-04-26  5:57     ` Joonsoo Kim
2017-04-26  6:29       ` Minchan Kim
2017-04-26  6:59         ` Joonsoo Kim
2017-04-26  7:21           ` Sergey Senozhatsky
2017-04-26  7:39             ` Minchan Kim
2017-04-26  7:12         ` Sergey Senozhatsky
2017-04-26  2:37   ` Sergey Senozhatsky
2017-04-26  5:59     ` Joonsoo Kim
2017-04-26  6:04       ` Sergey Senozhatsky
2017-04-26  4:02   ` Sergey Senozhatsky
2017-04-26  6:04     ` Joonsoo Kim
2017-04-26  6:21       ` Sergey Senozhatsky
2017-04-27  6:57         ` Joonsoo Kim
2017-04-27  7:46           ` Sergey Senozhatsky
2017-05-02  5:31             ` Joonsoo Kim
2017-04-26  4:28   ` Sergey Senozhatsky
2017-04-26  6:08     ` Joonsoo Kim
2017-04-26  6:14       ` Sergey Senozhatsky
2017-04-26  0:52 ` [PATCH v4 3/4] zram: make deduplication feature optional js1304
2017-04-26  0:52 ` [PATCH v4 4/4] zram: compare all the entries with same checksum for deduplication js1304
2017-04-27  7:49 ` [PATCH v4 0/4] zram: implement deduplication in zram Sergey Senozhatsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).