All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chen Wandun <chenwandun@huawei.com>
To: Alexey Romanov <avromanov@sberdevices.ru>, <minchan@kernel.org>,
	<senozhatsky@chromium.org>, <ngupta@vflare.org>,
	<akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>,
	<kernel@sberdevices.ru>, <ddrokosov@sberdevices.ru>
Subject: Re: [RFC PATCH v1 1/4] zram: introduce merge identical pages mechanism
Date: Wed, 23 Nov 2022 16:25:00 +0800	[thread overview]
Message-ID: <bc2c6314-7c94-8096-19f0-0c11882479e4@huawei.com> (raw)
In-Reply-To: <20221121190020.66548-2-avromanov@sberdevices.ru>



在 2022/11/22 3:00, Alexey Romanov 写道:
> zram maps each page (struct page) to a zsmalloc object that
> contains a compressed buffer of that page's data. This fact
> generates data redundancy: for example, two identical pages
> will be store in compressed form in zsmalloc allocator twice.
>
> This patch adds a mechanism to scan zram_table_entry array
> and frees all identical objects in zsmalloc allocator, leaving
> only one. All zram_table_entry elements which reference this
> freed objects now refer to the same, not freed, object.
>
> To implement this mechanism, we sequentially scan the
> zram_table_entry array, counting the hash from the contents of
> the compressed pages (zsmalloc objects) and enter the index of
> the object into the hash table (hlist_head). If the hash matches,
> we remove the identical zsmalloc (zs_free) objects and update
> the link rbtree.
>
> Also, we can't just call zs_free() function during zram_free_page().
> Before calling this function we have to make sure that no one else
> refers to this zsmalloc object.
>
> To implement the mechanism for merging identical compressed
> pages (zsmalloc objects), a rbtree is needed.
>
> The tree node key is a reference to the zsmalloc object
> (handle), and the value is the number of references to
> this object (atomic counter). This is necessary for data
> consistency so that we do not zs_free the object referenced
> by any element of the zram_table_entry array.
>
> Signed-off-by: Alexey Romanov <avromanov@sberdevices.ru>
> ---
>   drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
>   drivers/block/zram/zram_drv.h |   6 +
>   2 files changed, 283 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index e290d6d97047..716c2f72805e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -33,12 +33,15 @@
>   #include <linux/debugfs.h>
>   #include <linux/cpuhotplug.h>
>   #include <linux/part_stat.h>
> +#include <linux/hashtable.h>
> +#include <linux/xxhash.h>
>   
>   #include "zram_drv.h"
>   
>   static DEFINE_IDR(zram_index_idr);
>   /* idr index must be protected */
>   static DEFINE_MUTEX(zram_index_mutex);
> +static DEFINE_MUTEX(zram_rbtree_mutex);
>   
>   static int zram_major;
>   static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
> @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
>   static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
>   				u32 index, int offset, struct bio *bio);
>   
> +struct zram_rbtree_node {
> +	struct rb_node node;
> +	unsigned long key;
> +	unsigned long cnt;
> +};
> +
> +struct zram_hash_node {
> +	unsigned long index;
> +	struct hlist_node next;
> +};
>   
>   static int zram_slot_trylock(struct zram *zram, u32 index)
>   {
> @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
>   #endif
>   static DEVICE_ATTR_RO(debug_stat);
>   
> +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
> +{
> +	struct rb_node **new = &(root->rb_node), *parent = NULL;
> +	struct zram_rbtree_node *this;
> +
> +	while (*new) {
> +		this = rb_entry(*new, struct zram_rbtree_node, node);
> +		parent = *new;
> +		if (data->key < this->key)
> +			new = &((*new)->rb_left);
> +		else if (data->key > this->key)
> +			new = &((*new)->rb_right);
> +		else
> +			return false;
> +	}
> +
> +	rb_link_node(&data->node, parent, new);
> +	rb_insert_color(&data->node, root);
> +	return true;
> +}
> +
> +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
> +		unsigned long key)
> +{
> +	struct rb_node *node = root->rb_node;
> +	struct zram_rbtree_node *data;
> +
> +	while (node) {
> +		data = rb_entry(node, struct zram_rbtree_node, node);
> +		if (key < data->key)
> +			node = node->rb_left;
> +		else if (key > data->key)
> +			node = node->rb_right;
> +		else
> +			return data;
> +	}
> +
> +	return NULL;
> +}
> +
> +static unsigned long zram_calc_hash(void *src, size_t len)
> +{
> +	return xxhash(src, len, 0);
> +}
> +
> +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
> +		size_t htable_size, size_t index)
> +{
> +	struct zram_rbtree_node *rb_node;
> +	struct zram_hash_node *node;
> +	unsigned long handle, cur_handle;
> +	size_t obj_size;
> +	char *src, *buf;
> +	unsigned long hash;
> +	int ret = 0;
> +
> +	handle = zram_get_handle(zram, index);
> +	if (!handle)
> +		return ret;
> +
> +	obj_size = zram_get_obj_size(zram, index);
> +	buf = kmalloc(obj_size, GFP_KERNEL);
> +	if (!buf) {
> +		pr_err("Failed to allocate zs_map_object buffer\n");
> +		return -ENOMEM;
> +	}
> +
> +	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
> +	memcpy(buf, src, obj_size);
> +	zs_unmap_object(zram->mem_pool, handle);
> +	hash = zram_calc_hash(buf, obj_size);
> +
> +	mutex_lock(&zram_rbtree_mutex);
> +	hlist_for_each_entry(node, &htable[hash % htable_size], next) {
> +		int cmp;
> +
> +		zram_slot_lock(zram, node->index);
> +
> +		/*
> +		 * Page may change as the hash table is being formed,
> +		 * so the checks below are necessary.
> +		 */
> +		cur_handle = zram_get_handle(zram, node->index);
> +		if (handle == cur_handle ||
> +			obj_size != zram_get_obj_size(zram, node->index)) {
> +			zram_slot_unlock(zram, node->index);
> +			continue;
> +		}
> +
> +		src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
> +		cmp = memcmp(buf, src, obj_size);
> +		zs_unmap_object(zram->mem_pool, cur_handle);
> +
> +		if (!cmp) {
> +			rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
> +
> +			/*
> +			 * This check is necessary in order not to zs_free an object
> +			 * that someone already refers to. This situation is possible
> +			 * when with repeated calls to zram_do_scan(). For example:
> +			 *
> +			 * [slot0] [slot1] [slot2] [slot3] [slot4]
> +			 * [obj0]  [obj1]  [obj2]  [obj3]  [obj4]
> +			 *
> +			 * Let's imagine that obj2 and obj3 are equal, and we called
> +			 * zram_do_scan() function:
> +			 *
> +			 * [slot0] [slot1] [slot2] [slot3] [slot4]
> +			 * [obj0]  [obj1]  [obj2]  [obj2]  [obj4]
> +			 *
> +			 * Now, slot2 and slot3 refers to obj2 zsmalloc object.
> +			 * Time passed, now slot0 refres to obj0_n, which is equal
> +			 * to obj2:
> +			 *
> +			 * [slot0]  [slot1] [slot2] [slot3] [slot4]
> +			 * [obj0_n] [obj1]  [obj2]  [obj2]  [obj4]
> +			 *
> +			 * Now we call zram_do_scan() function again. We get to slot2,
> +			 * and we understand that obj2 and obj0_n hashes are the same. We
> +			 * try to zs_free(obj2), but slot3 also already refers to it.
> +			 *
> +			 * This is not correct!
> +			 */
In this case, it seems like we can't merge all identical objects — is that right?

How about letting slot2 point to obj0_n and decreasing the rb_node refcount
for slot2 in the first loop pass,
then letting slot3 point to obj0_n and decreasing the rb_node refcount in
the next pass — then obj2 can be freed.
> +			if (unlikely(rb_node))
> +				if (rb_node->cnt > 1) {
> +					zram_slot_unlock(zram, node->index);
> +					continue;
> +				}
> +
> +			zram_set_handle(zram, index, cur_handle);
> +			zs_free(zram->mem_pool, handle);
> +
> +			rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
> +
> +			if (!rb_node) {
> +				rb_node = kzalloc(sizeof(struct zram_rbtree_node),
> +								GFP_KERNEL);
> +				if (!rb_node) {
> +					pr_err("Failed to allocate rb_node\n");
> +					ret = -ENOMEM;
> +					zram_slot_unlock(zram, node->index);
> +					mutex_unlock(&zram_rbtree_mutex);
> +					goto merged_or_err;
> +				}
> +
> +				rb_node->key = cur_handle;
> +				/* Two slots refers to an zsmalloc object with cur_handle key */
> +				rb_node->cnt = 2;
> +				zram_rbtree_insert(&zram->sph_rbtree, rb_node);
> +			} else {
> +				rb_node->cnt++;
> +			}
> +
> +			atomic64_sub(obj_size, &zram->stats.compr_data_size);
> +			zram_set_flag(zram, index, ZRAM_MERGED);
> +			zram_set_flag(zram, node->index, ZRAM_MERGED);
> +
> +			zram_slot_unlock(zram, node->index);
> +			mutex_unlock(&zram_rbtree_mutex);
> +			goto merged_or_err;
> +		}
> +
> +		zram_slot_unlock(zram, node->index);
> +	}
> +
> +	mutex_unlock(&zram_rbtree_mutex);
> +
> +	node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
> +	if (!node) {
> +		ret = -ENOMEM;
> +		goto merged_or_err;
> +	}
> +
> +	node->index = index;
> +	hlist_add_head(&node->next, &htable[hash % htable_size]);
> +
> +merged_or_err:
> +	kfree(buf);
> +	return ret;
> +}
> +
> +static void zram_free_htable_entries(struct hlist_head *htable,
> +		size_t htable_size)
> +{
> +	struct hlist_node *n;
> +	struct zram_hash_node *node;
> +
> +	hlist_for_each_entry_safe(node, n, htable, next) {
> +		hlist_del(&node->next);
> +		kfree(node);
> +	}
> +}
> +
> +static int zram_do_scan(struct zram *zram)
> +{
> +	size_t num_pages = zram->disksize >> PAGE_SHIFT;
> +	size_t htable_size = num_pages;
> +	size_t index;
> +	struct hlist_head *htable;
> +	int i, ret = 0;
> +
> +	htable = vzalloc(htable_size * sizeof(struct hlist_head));
> +	if (!htable) {
> +		pr_err("Failed to allocate hash table\n");
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < htable_size; i++)
> +		INIT_HLIST_HEAD(&htable[i]);
> +
> +	for (index = 0; index < num_pages; index++) {
> +		zram_slot_lock(zram, index);
> +
> +		if (!zram_allocated(zram, index)) {
> +			zram_slot_unlock(zram, index);
> +			continue;
> +		}
> +
> +		if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> +			zram_test_flag(zram, index, ZRAM_WB) ||
> +			zram_test_flag(zram, index, ZRAM_SAME)) {
> +			zram_slot_unlock(zram, index);
> +			continue;
> +		}
> +
> +		/* Ignore pages that have been recompressed */
> +		if (zram_get_priority(zram, index) != 0)
> +			continue;
> +
> +		ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
> +		zram_slot_unlock(zram, index);
> +		if (ret != 0)
> +			goto out;
> +	}
> +
> +out:
> +	zram_free_htable_entries(htable, htable_size);
> +	vfree(htable);
> +	return ret;
> +}
> +
>   static void zram_meta_free(struct zram *zram, u64 disksize)
>   {
>   	size_t num_pages = disksize >> PAGE_SHIFT;
> @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
>   static void zram_free_page(struct zram *zram, size_t index)
>   {
>   	unsigned long handle;
> +	struct zram_rbtree_node *node;
>   
>   #ifdef CONFIG_ZRAM_MEMORY_TRACKING
>   	zram->table[index].ac_time = 0;
> @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
>   	if (!handle)
>   		return;
>   
> -	zs_free(zram->mem_pool, handle);
> +	if (zram_test_flag(zram, index, ZRAM_MERGED)) {
> +		zram_clear_flag(zram, index, ZRAM_MERGED);
> +		mutex_lock(&zram_rbtree_mutex);
> +
> +		node = zram_rbtree_search(&zram->sph_rbtree, handle);
> +		BUG_ON(!node);
> +
> +		node->cnt--;
> +		if (node->cnt == 0) {
> +			rb_erase(&node->node, &zram->sph_rbtree);
> +			mutex_unlock(&zram_rbtree_mutex);
> +
> +			zs_free(zram->mem_pool, handle);
> +			kfree(node);
> +		} else {
> +			mutex_unlock(&zram_rbtree_mutex);
> +		}
> +	} else {
> +		zs_free(zram->mem_pool, handle);
> +	}
>   
>   	atomic64_sub(zram_get_obj_size(zram, index),
>   			&zram->stats.compr_data_size);
> @@ -2421,6 +2695,8 @@ static int zram_add(void)
>   
>   	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
>   
> +	zram->sph_rbtree = RB_ROOT;
> +
>   	zram_debugfs_register(zram);
>   	pr_info("Added device: %s\n", zram->disk->disk_name);
>   	return device_id;
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index c5254626f051..4a7151c94523 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -56,6 +56,7 @@ enum zram_pageflags {
>   
>   	ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
>   	ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
> +	ZRAM_MERGED,	/* page was merged */
>   
>   	__NR_ZRAM_PAGEFLAGS,
>   };
> @@ -140,5 +141,10 @@ struct zram {
>   #ifdef CONFIG_ZRAM_MEMORY_TRACKING
>   	struct dentry *debugfs_dir;
>   #endif
> +	/*
> +	 * This is same pages handle's rb tree, where the key is a handle
> +	 * to same pages and the value is a link counter
> +	 */
> +	struct rb_root sph_rbtree;
>   };
>   #endif


  reply	other threads:[~2022-11-23  8:25 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-21 19:00 [RFC PATCH v1 0/4] Introduce merge identical pages mechanism Alexey Romanov
2022-11-21 19:00 ` [RFC PATCH v1 1/4] zram: introduce " Alexey Romanov
2022-11-23  8:25   ` Chen Wandun [this message]
2022-11-23  9:04     ` Aleksey Romanov
2022-11-21 19:00 ` [RFC PATCH v1 2/4] zram: add merge sysfs knob Alexey Romanov
2022-11-21 19:00 ` [RFC PATCH v1 3/4] zram: add pages_merged counter to mm_stat Alexey Romanov
2022-11-21 19:00 ` [RFC PATCH v1 4/4] zram: recompression: add ZRAM_MERGED check Alexey Romanov
2022-11-21 20:44 ` [RFC PATCH v1 0/4] Introduce merge identical pages mechanism Johannes Weiner
2022-11-22  3:00   ` Sergey Senozhatsky
2022-11-22  3:07     ` Sergey Senozhatsky
2022-11-22 12:14       ` Aleksey Romanov
2022-11-23  4:13         ` Sergey Senozhatsky
2022-11-23  8:53           ` Dmitry Rokosov
2022-12-01 10:14             ` Dmitry Rokosov
2022-12-01 10:47               ` Sergey Senozhatsky
2022-12-01 11:14                 ` Dmitry Rokosov
2022-12-01 13:29                   ` Sergey Senozhatsky
2023-01-11 14:00                 ` Alexey Romanov
2023-02-06 10:37                   ` Sergey Senozhatsky
2022-11-23  9:07           ` Aleksey Romanov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bc2c6314-7c94-8096-19f0-0c11882479e4@huawei.com \
    --to=chenwandun@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=avromanov@sberdevices.ru \
    --cc=ddrokosov@sberdevices.ru \
    --cc=kernel@sberdevices.ru \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan@kernel.org \
    --cc=ngupta@vflare.org \
    --cc=senozhatsky@chromium.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.