* [PATCH] mm: zswap: multiple zpool support
@ 2023-05-31  2:29 Yosry Ahmed
  2023-05-31  2:32 ` Yosry Ahmed
  2023-06-01 15:58 ` Johannes Weiner
  0 siblings, 2 replies; 14+ messages in thread
From: Yosry Ahmed @ 2023-05-31  2:29 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, Andrew Morton, Seth Jennings,
	Dan Streetman, Vitaly Wool
  Cc: Johannes Weiner, Nhat Pham, Domenico Cerasuolo, Yu Zhao,
	linux-mm, linux-kernel, Yosry Ahmed

Support using multiple zpools of the same type in zswap, for concurrency
purposes. Add CONFIG_ZSWAP_NR_ZPOOLS_ORDER to control the number of
zpools. The order is specified by the config rather than the absolute
number to guarantee a power of 2. This is useful so that we can
deterministically link each entry to a zpool by hashing the zswap_entry
pointer.
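
For illustration, the per-entry zpool selection reduces to a single
pointer hash. A minimal sketch, equivalent to the zswap_find_zpool()
helper added below (hash_ptr() comes from <linux/hash.h>, ilog2() from
<linux/log2.h>):

	static struct zpool *select_zpool(struct zswap_entry *entry)
	{
		unsigned int nr = 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER;

		/* With a single zpool there is nothing to hash. */
		if (nr == 1)
			return entry->pool->zpools[0];
		/* The same entry always hashes to the same zpool. */
		return entry->pool->zpools[hash_ptr(entry, ilog2(nr))];
	}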

On a setup with zswap and zsmalloc, comparing a single zpool (current
default) to 32 zpools (by setting CONFIG_ZSWAP_NR_ZPOOLS_ORDER=5) shows
improvements in zsmalloc lock contention, especially on the swap out
path.

The following shows the perf analysis of the swapout path when 10
workloads are simultaneously reclaiming and refaulting tmpfs pages.
There are some improvements on the swapin path as well, but much less
significant.

1 zpool:

 |--28.99%--zswap_frontswap_store
       |     |
       <snip>
       |     |
       |     |--8.98%--zpool_map_handle
       |     |     |
       |     |      --8.98%--zs_zpool_map
       |     |           |
       |     |            --8.95%--zs_map_object
       |     |                 |
       |     |                  --8.38%--_raw_spin_lock
       |     |                       |
       |     |                        --7.39%--queued_spin_lock_slowpath
       |     |
       |     |--8.82%--zpool_malloc
       |     |     |
       |     |      --8.82%--zs_zpool_malloc
       |     |           |
       |     |            --8.80%--zs_malloc
       |     |                 |
       |     |                 |--7.21%--_raw_spin_lock
       |     |                 |     |
       |     |                 |      --6.81%--queued_spin_lock_slowpath
       <snip>

32 zpools:

 |--16.73%--zswap_frontswap_store
       |     |
       <snip>
       |     |
       |     |--1.81%--zpool_malloc
       |     |     |
       |     |      --1.81%--zs_zpool_malloc
       |     |           |
       |     |            --1.79%--zs_malloc
       |     |                 |
       |     |                  --0.73%--obj_malloc
       |     |
       |     |--1.06%--zswap_update_total_size
       |     |
       |     |--0.59%--zpool_map_handle
       |     |     |
       |     |      --0.59%--zs_zpool_map
       |     |           |
       |     |            --0.57%--zs_map_object
       |     |                 |
       |     |                  --0.51%--_raw_spin_lock
       <snip>

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 mm/Kconfig | 12 +++++++
 mm/zswap.c | 95 ++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 92c30879bf67..de1da56d2c07 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -59,6 +59,18 @@ config ZSWAP_EXCLUSIVE_LOADS
 	  The cost is that if the page was never dirtied and needs to be
 	  swapped out again, it will be re-compressed.
 
+config ZSWAP_NR_ZPOOLS_ORDER
+	int "Number of zpools in zswap, as power of 2"
+	default 0
+	depends on ZSWAP
+	help
+	  This option determines the number of zpools to use for zswap; it
+	  will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
+
+	  Having multiple zpools helps with concurrency and lock contention
+	  on the swap in and swap out paths, but uses a little bit of extra
+	  space.
+
 choice
 	prompt "Default compressor"
 	depends on ZSWAP
diff --git a/mm/zswap.c b/mm/zswap.c
index fba80330afd1..7111036f8aa5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -137,6 +137,9 @@ static bool zswap_non_same_filled_pages_enabled = true;
 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
 		   bool, 0644);
 
+/* Order of zpools for global pool when memcg is enabled */
+static unsigned int zswap_nr_zpools = 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER;
+
 /*********************************
 * data structures
 **********************************/
@@ -150,7 +153,6 @@ struct crypto_acomp_ctx {
 };
 
 struct zswap_pool {
-	struct zpool *zpool;
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
 	struct kref kref;
 	struct list_head list;
@@ -158,6 +160,7 @@ struct zswap_pool {
 	struct work_struct shrink_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
+	struct zpool *zpools[];
 };
 
 /*
@@ -236,7 +239,7 @@ static bool zswap_has_pool;
 
 #define zswap_pool_debug(msg, p)				\
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
-		 zpool_get_type((p)->zpool))
+		 zpool_get_type((p)->zpools[0]))
 
 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
 static int zswap_pool_get(struct zswap_pool *pool);
@@ -263,11 +266,13 @@ static void zswap_update_total_size(void)
 {
 	struct zswap_pool *pool;
 	u64 total = 0;
+	int i;
 
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		total += zpool_get_total_size(pool->zpool);
+		for (i = 0; i < zswap_nr_zpools; i++)
+			total += zpool_get_total_size(pool->zpools[i]);
 
 	rcu_read_unlock();
 
@@ -350,6 +355,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 	}
 }
 
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+	int i;
+
+	i = zswap_nr_zpools == 1 ? 0 :
+	    hash_ptr(entry, ilog2(zswap_nr_zpools));
+
+	return entry->pool->zpools[i];
+}
+
 /*
  * Carries out the common pattern of freeing and entry's zpool allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
@@ -363,7 +378,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
-		zpool_free(entry->pool->zpool, entry->handle);
+		zpool_free(zswap_find_zpool(entry), entry->handle);
 		zswap_pool_put(entry->pool);
 	}
 	zswap_entry_cache_free(entry);
@@ -572,7 +587,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
 		if (strcmp(pool->tfm_name, compressor))
 			continue;
-		if (strcmp(zpool_get_type(pool->zpool), type))
+		/* all zpools share the same type */
+		if (strcmp(zpool_get_type(pool->zpools[0]), type))
 			continue;
 		/* if we can't get it, it's about to be destroyed */
 		if (!zswap_pool_get(pool))
@@ -587,14 +603,17 @@ static void shrink_worker(struct work_struct *w)
 {
 	struct zswap_pool *pool = container_of(w, typeof(*pool),
 						shrink_work);
+	int i;
 
-	if (zpool_shrink(pool->zpool, 1, NULL))
-		zswap_reject_reclaim_fail++;
+	for (i = 0; i < zswap_nr_zpools; i++)
+		if (zpool_shrink(pool->zpools[i], 1, NULL))
+			zswap_reject_reclaim_fail++;
 	zswap_pool_put(pool);
 }
 
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
+	int i;
 	struct zswap_pool *pool;
 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -611,19 +630,25 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 			return NULL;
 	}
 
-	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	pool = kzalloc(sizeof(*pool) +
+		       sizeof(pool->zpools[0]) * zswap_nr_zpools,
+		       GFP_KERNEL);
 	if (!pool)
 		return NULL;
 
-	/* unique name for each pool specifically required by zsmalloc */
-	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+	for (i = 0; i < zswap_nr_zpools; i++) {
+		/* unique name for each pool specifically required by zsmalloc */
+		snprintf(name, 38, "zswap%x",
+			 atomic_inc_return(&zswap_pools_count));
 
-	pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
-	if (!pool->zpool) {
-		pr_err("%s zpool not available\n", type);
-		goto error;
+		pool->zpools[i] = zpool_create_pool(type, name, gfp,
+						    &zswap_zpool_ops);
+		if (!pool->zpools[i]) {
+			pr_err("%s zpool not available\n", type);
+			goto error;
+		}
 	}
-	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
 
 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
@@ -653,8 +678,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
-	if (pool->zpool)
-		zpool_destroy_pool(pool->zpool);
+	while (i--)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 	return NULL;
 }
@@ -703,11 +728,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
 
 static void zswap_pool_destroy(struct zswap_pool *pool)
 {
+	int i;
+
 	zswap_pool_debug("destroying", pool);
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
-	zpool_destroy_pool(pool->zpool);
+	for (i = 0; i < zswap_nr_zpools; i++)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 }
 
@@ -1160,6 +1188,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	unsigned long handle, value;
 	char *buf;
 	u8 *src, *dst;
+	struct zpool *zpool;
 	struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
 	gfp_t gfp;
 
@@ -1259,11 +1288,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* store */
-	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+	zpool = zswap_find_zpool(entry);
+	hlen = zpool_evictable(zpool) ? sizeof(zhdr) : 0;
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(entry->pool->zpool))
+	if (zpool_malloc_support_movable(zpool))
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
+	ret = zpool_malloc(zpool, hlen + dlen, gfp, &handle);
+
 	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
 		goto put_dstmem;
@@ -1272,10 +1303,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		zswap_reject_alloc_fail++;
 		goto put_dstmem;
 	}
-	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, &zhdr, hlen);
 	memcpy(buf + hlen, dst, dlen);
-	zpool_unmap_handle(entry->pool->zpool, handle);
+	zpool_unmap_handle(zpool, handle);
 	mutex_unlock(acomp_ctx->mutex);
 
 	/* populate entry */
@@ -1353,6 +1384,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	u8 *src, *dst, *tmp;
 	unsigned int dlen;
 	int ret;
+	struct zpool *zpool;
 
 	/* find */
 	spin_lock(&tree->lock);
@@ -1372,7 +1404,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 		goto stats;
 	}
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	zpool = zswap_find_zpool(entry);
+	if (!zpool_can_sleep_mapped(zpool)) {
 		tmp = kmalloc(entry->length, GFP_KERNEL);
 		if (!tmp) {
 			ret = -ENOMEM;
@@ -1382,14 +1415,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
-	if (zpool_evictable(entry->pool->zpool))
+	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+	if (zpool_evictable(zpool))
 		src += sizeof(struct zswap_header);
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	if (!zpool_can_sleep_mapped(zpool)) {
 		memcpy(tmp, src, entry->length);
 		src = tmp;
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1401,8 +1434,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	mutex_unlock(acomp_ctx->mutex);
 
-	if (zpool_can_sleep_mapped(entry->pool->zpool))
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	if (zpool_can_sleep_mapped(zpool))
+		zpool_unmap_handle(zpool, entry->handle);
 	else
 		kfree(tmp);
 
@@ -1558,7 +1591,7 @@ static int zswap_setup(void)
 	pool = __zswap_pool_create_fallback();
 	if (pool) {
 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
-			zpool_get_type(pool->zpool));
+			zpool_get_type(pool->zpools[0]));
 		list_add(&pool->list, &zswap_pools);
 		zswap_has_pool = true;
 	} else {
-- 
2.41.0.rc0.172.g3f132b7071-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-05-31  2:29 [PATCH] mm: zswap: multiple zpool support Yosry Ahmed
@ 2023-05-31  2:32 ` Yosry Ahmed
  2023-06-01 15:58 ` Johannes Weiner
  1 sibling, 0 replies; 14+ messages in thread
From: Yosry Ahmed @ 2023-05-31  2:32 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, Andrew Morton, Seth Jennings,
	Dan Streetman, Vitaly Wool
  Cc: Johannes Weiner, Nhat Pham, Domenico Cerasuolo, Yu Zhao,
	linux-mm, linux-kernel

On Tue, May 30, 2023 at 7:29 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [...]
> +/* Order of zpools for global pool when memcg is enabled */

This comment is an artifact of an older implementation, please ignore.

> +static unsigned int zswap_nr_zpools = 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER;
> [...]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-05-31  2:29 [PATCH] mm: zswap: multiple zpool support Yosry Ahmed
  2023-05-31  2:32 ` Yosry Ahmed
@ 2023-06-01 15:58 ` Johannes Weiner
  2023-06-01 17:51   ` Yosry Ahmed
  1 sibling, 1 reply; 14+ messages in thread
From: Johannes Weiner @ 2023-06-01 15:58 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Konrad Rzeszutek Wilk, Andrew Morton, Seth Jennings,
	Dan Streetman, Vitaly Wool, Nhat Pham, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Wed, May 31, 2023 at 02:29:11AM +0000, Yosry Ahmed wrote:
> [...]
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 92c30879bf67..de1da56d2c07 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -59,6 +59,18 @@ config ZSWAP_EXCLUSIVE_LOADS
>  	  The cost is that if the page was never dirtied and needs to be
>  	  swapped out again, it will be re-compressed.
>  
> +config ZSWAP_NR_ZPOOLS_ORDER
> +	int "Number of zpools in zswap, as power of 2"
> +	default 0
> +	depends on ZSWAP
> +	help
> +       This option determines the number of zpools to use for zswap; it
> +	  will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
> +
> +	  Having multiple zpools helps with concurrency and lock contention
> +	  on the swap in and swap out paths, but uses a little bit of extra
> +	  space.

This is nearly impossible for a user, let alone a distribution, to
answer adequately.

The optimal value needs to be found empirically. And it varies heavily
not just by workload but by implementation changes. If we make changes
to the lock holding time or redo the data structures, a previously
chosen value might no longer be a net positive, and even be harmful.

Architecturally, the pool scoping and the interaction with zswap_tree
is currently a mess. We're aware of lifetime bugs, where swapoff kills
the tree but the pool still exists with entries making dead references
e.g. We likely need to rearchitect this in the near future - maybe tie
pools to trees to begin with.

(I'm assuming you're already using multiple swap files to avoid tree
lock contention, because it has the same scope as the pool otherwise.)

Such changes quickly invalidate any settings the user or distro might
make here. Exposing the implementation detail of the pools might even
get in the way of fixing bugs and cleaning up the architecture.

> @@ -263,11 +266,13 @@ static void zswap_update_total_size(void)
>  {
>  	struct zswap_pool *pool;
>  	u64 total = 0;
> +	int i;
>  
>  	rcu_read_lock();
>  
>  	list_for_each_entry_rcu(pool, &zswap_pools, list)
> -		total += zpool_get_total_size(pool->zpool);
> +		for (i = 0; i < zswap_nr_zpools; i++)
> +			total += zpool_get_total_size(pool->zpools[i]);

This adds an O(nr_pools) operation to every load and store. It's easy
for this to dominate or outweigh locking costs as workload concurrency
changes, or data structures and locking change inside the kernel.

> @@ -587,14 +603,17 @@ static void shrink_worker(struct work_struct *w)
>  {
>  	struct zswap_pool *pool = container_of(w, typeof(*pool),
>  						shrink_work);
> +	int i;
>  
> -	if (zpool_shrink(pool->zpool, 1, NULL))
> -		zswap_reject_reclaim_fail++;
> +	for (i = 0; i < zswap_nr_zpools; i++)
> +		if (zpool_shrink(pool->zpools[i], 1, NULL))
> +			zswap_reject_reclaim_fail++;
>  	zswap_pool_put(pool);

This scales reclaim batch size by the number of zpools, which can lead
to varying degrees of overreclaim especially on small zswap sizes with
high pool concurrency.

I don't think this patch is ready for primetime. A user build time
setting is not appropriate for an optimization that is heavily tied to
implementation details and workload dynamics.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-01 15:58 ` Johannes Weiner
@ 2023-06-01 17:51   ` Yosry Ahmed
  2023-06-02 16:49     ` Johannes Weiner
  0 siblings, 1 reply; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-01 17:51 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Konrad Rzeszutek Wilk, Andrew Morton, Seth Jennings,
	Dan Streetman, Vitaly Wool, Nhat Pham, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Thu, Jun 1, 2023 at 8:58 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Wed, May 31, 2023 at 02:29:11AM +0000, Yosry Ahmed wrote:
> > [...]
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 92c30879bf67..de1da56d2c07 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -59,6 +59,18 @@ config ZSWAP_EXCLUSIVE_LOADS
> >         The cost is that if the page was never dirtied and needs to be
> >         swapped out again, it will be re-compressed.
> >
> > +config ZSWAP_NR_ZPOOLS_ORDER
> > +     int "Number of zpools in zswap, as power of 2"
> > +     default 0
> > +     depends on ZSWAP
> > +     help
> > +       This option determines the number of zpools to use for zswap; it
> > +       will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
> > +
> > +       Having multiple zpools helps with concurrency and lock contention
> > +       on the swap in and swap out paths, but uses a little bit of extra
> > +       space.
>
> This is nearly impossible for a user, let alone a distribution, to
> answer adequately.
>
> The optimal value needs to be found empirically. And it varies heavily
> not just by workload but by implementation changes. If we make changes
> to the lock holding time or redo the data structures, a previously
> chosen value might no longer be a net positive, and even be harmful.

Yeah, I agree that this can only be tuned empirically, but there is a
real benefit to tuning it, at least in our experience. I imagined
having the config option with a default of 0 gives those who want to
tune it the option, while not messing with users that do not care.

>
> Architecturally, the pool scoping and the interaction with zswap_tree
> is currently a mess. We're aware of lifetime bugs, where swapoff kills
> the tree but the pool still exists with entries making dead references
> e.g. We likely need to rearchitect this in the near future - maybe tie
> pools to trees to begin with.
>
> (I'm assuming you're already using multiple swap files to avoid tree
> lock contention, because it has the same scope as the pool otherwise.)
>
> Such changes quickly invalidate any settings the user or distro might
> make here. Exposing the implementation detail of the pools might even
> get in the way of fixing bugs and cleaning up the architecture.

I was under the impression that config options are not very stable.
IOW, if we fix the lock contention on an architectural level, and
there is no more benefit to tuning the number of zpools per zswap
pool, we can remove the config option. Is this incorrect?

>
> > @@ -263,11 +266,13 @@ static void zswap_update_total_size(void)
> >  {
> >       struct zswap_pool *pool;
> >       u64 total = 0;
> > +     int i;
> >
> >       rcu_read_lock();
> >
> >       list_for_each_entry_rcu(pool, &zswap_pools, list)
> > -             total += zpool_get_total_size(pool->zpool);
> > +             for (i = 0; i < zswap_nr_zpools; i++)
> > +                     total += zpool_get_total_size(pool->zpools[i]);
>
> This adds an O(nr_pools) operation to every load and store. It's easy
> for this to dominate or outweigh locking costs as workload concurrency
> changes, or data structures and locking change inside the kernel.

Right, which is why this is empirically tuned. In the perf analysis in
the commit log, the lock contention gains far outweigh this cost.
FWIW, the cost here is constant, we just iterate the pools and read a
value.

>
> > @@ -587,14 +603,17 @@ static void shrink_worker(struct work_struct *w)
> >  {
> >       struct zswap_pool *pool = container_of(w, typeof(*pool),
> >                                               shrink_work);
> > +     int i;
> >
> > -     if (zpool_shrink(pool->zpool, 1, NULL))
> > -             zswap_reject_reclaim_fail++;
> > +     for (i = 0; i < zswap_nr_zpools; i++)
> > +             if (zpool_shrink(pool->zpools[i], 1, NULL))
> > +                     zswap_reject_reclaim_fail++;
> >       zswap_pool_put(pool);
>
> This scales reclaim batch size by the number of zpools, which can lead
> to varying degrees of overreclaim especially on small zswap sizes with
> high pool concurrency.

I was under the assumption that with Domenico's patch we will mostly
be reclaiming multiple pages anyway, but I can certainly remove this
part and only reclaim one page at a time from one zpool. We can select
one at random or round robin through the zpools.
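
For concreteness, a minimal sketch of that direction (hypothetical, not
part of this patch; the round-robin cursor is made up for illustration):

	static void shrink_worker(struct work_struct *w)
	{
		struct zswap_pool *pool = container_of(w, typeof(*pool),
						       shrink_work);
		static atomic_t cursor;	/* hypothetical round-robin index */
		unsigned int i;

		/* Reclaim one page from one zpool per invocation. */
		i = (unsigned int)atomic_inc_return(&cursor) % zswap_nr_zpools;
		if (zpool_shrink(pool->zpools[i], 1, NULL))
			zswap_reject_reclaim_fail++;
		zswap_pool_put(pool);
	}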

>
> I don't think this patch is ready for primetime. A user build time
> setting is not appropriate for an optimization that is heavily tied to
> implementation details and workload dynamics.

What would you suggest instead? We do find value in having multiple
zpools, at least for the current architecture.

An internal implementation that we have exposes this as a module
parameter instead, but that is more complicated (I imagine), because
you need to set it before enabling zswap, or before changing the zswap
pool; otherwise changing it is a nop because the zpool(s) are already
allocated. I am also guessing module params are more stable than
config options. Hence, I thought a config option might be more
appropriate.
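
As a rough sketch of that alternative (hypothetical; it just reuses the
module_param_named() pattern already in zswap.c, read-only so it cannot
be flipped after the zpools have been allocated):

	/* Set via zswap.nr_zpools_order= on the kernel command line. */
	static unsigned int zswap_nr_zpools_order;
	module_param_named(nr_zpools_order, zswap_nr_zpools_order, uint, 0444);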

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-01 17:51   ` Yosry Ahmed
@ 2023-06-02 16:49     ` Johannes Weiner
  2023-06-02 16:59       ` Yosry Ahmed
  0 siblings, 1 reply; 14+ messages in thread
From: Johannes Weiner @ 2023-06-02 16:49 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Konrad Rzeszutek Wilk, Andrew Morton, Seth Jennings,
	Dan Streetman, Vitaly Wool, Nhat Pham, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Thu, Jun 01, 2023 at 10:51:39AM -0700, Yosry Ahmed wrote:
> On Thu, Jun 1, 2023 at 8:58 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > On Wed, May 31, 2023 at 02:29:11AM +0000, Yosry Ahmed wrote:
> > > +config ZSWAP_NR_ZPOOLS_ORDER
> > > +     int "Number of zpools in zswap, as power of 2"
> > > +     default 0
> > > +     depends on ZSWAP
> > > +     help
> > > +       This option determines the number of zpools to use for zswap; it
> > > +       will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
> > > +
> > > +       Having multiple zpools helps with concurrency and lock contention
> > > +       on the swap in and swap out paths, but uses a little bit of extra
> > > +       space.
> >
> > This is nearly impossible for a user, let alone a distribution, to
> > answer adequately.
> >
> > The optimal value needs to be found empirically. And it varies heavily
> > not just by workload but by implementation changes. If we make changes
> > to the lock holding time or redo the data structures, a previously
> > chosen value might no longer be a net positive, and even be harmful.
> 
> Yeah, I agree that this can only be tuned empirically, but there is a
> real benefit to tuning it, at least in our experience. I imagined
> having the config option with a default of 0 gives those who want to
> tune it the option, while not messing with users that do not care.

Realistically, how many users besides you will be able to do this
tuning and benefit from this?

> > Architecturally, the pool scoping and the interaction with zswap_tree
> > is currently a mess. We're aware of lifetime bugs, where swapoff kills
> > the tree but the pool still exists with entries making dead references
> > e.g. We likely need to rearchitect this in the near future - maybe tie
> > pools to trees to begin with.
> >
> > (I'm assuming you're already using multiple swap files to avoid tree
> > lock contention, because it has the same scope as the pool otherwise.)
> >
> > Such changes quickly invalidate any settings the user or distro might
> > make here. Exposing the implementation detail of the pools might even
> > get in the way of fixing bugs and cleaning up the architecture.
> 
> I was under the impression that config options are not very stable.
> IOW, if we fix the lock contention on an architectural level, and
> there is no more benefit to tuning the number of zpools per zswap
> pool, we can remove the config option. Is this incorrect?

The problem is that it complicates the code for everybody, for the
benefit of a likely very small set of users - who have now opted out
of any need for making the code better.

And we have to keep this, and work around it, until things work for
those users who have no incentive to address the underlying issues.

That's difficult to justify.

> > I don't think this patch is ready for primetime. A user build time
> > setting is not appropriate for an optimization that is heavily tied to
> > implementation details and workload dynamics.
> 
> What would you suggest instead? We do find value in having multiple
> zpools, at least for the current architecture.

I think it makes sense to look closer at the lock contention, and
whether this can be improved more generically.

zbud and z3fold don't have a lock in the ->map callback that heavily
shows in the profile in your changelog. Is this zsmalloc specific?

If so, looking more closely at zsmalloc locking makes more sense. Is a
lockless implementation feasible? Can we do rw-locking against
zs_compact() to allow concurrency in zs_map_object()?

This would benefit everybody, even zram users.

Again, what about the zswap_tree.lock and swap_info_struct.lock?
They're the same scope unless you use multiple swap files. Would it
make sense to tie pools to trees, so that using multiple swapfiles for
concurrency purposes also implies this optimization?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-02 16:49     ` Johannes Weiner
@ 2023-06-02 16:59       ` Yosry Ahmed
  2023-06-02 18:34         ` Johannes Weiner
  0 siblings, 1 reply; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-02 16:59 UTC (permalink / raw)
  To: Johannes Weiner, Sergey Senozhatsky, Minchan Kim
  Cc: Konrad Rzeszutek Wilk, Andrew Morton, Seth Jennings,
	Dan Streetman, Vitaly Wool, Nhat Pham, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, Jun 01, 2023 at 10:51:39AM -0700, Yosry Ahmed wrote:
> > On Thu, Jun 1, 2023 at 8:58 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > On Wed, May 31, 2023 at 02:29:11AM +0000, Yosry Ahmed wrote:
> > > > +config ZSWAP_NR_ZPOOLS_ORDER
> > > > +     int "Number of zpools in zswap, as power of 2"
> > > > +     default 0
> > > > +     depends on ZSWAP
> > > > +     help
> > > > +       This option determines the number of zpools to use for zswap; it
> > > > +       will be 1 << CONFIG_ZSWAP_NR_ZPOOLS_ORDER.
> > > > +
> > > > +       Having multiple zpools helps with concurrency and lock contention
> > > > +       on the swap in and swap out paths, but uses a little bit of extra
> > > > +       space.
> > >
> > > This is nearly impossible for a user, let alone a distribution, to
> > > answer adequately.
> > >
> > > The optimal value needs to be found empirically. And it varies heavily
> > > not just by workload but by implementation changes. If we make changes
> > > to the lock holding time or redo the data structures, a previously
> > > chosen value might no longer be a net positive, and even be harmful.
> >
> > Yeah, I agree that this can only be tuned empirically, but there is a
> > real benefit to tuning it, at least in our experience. I imagined
> > having the config option with a default of 0 gives those who want to
> > tune it the option, while not messing with users that do not care.
>
> Realistically, how many users besides you will be able to do this
> tuning and benefit from this?

It's not difficult to try increasing the number of zpools and observe
reduction in lock contention vs. increase in cost for updating total
pool size. Updating the pool size is lockless and only involves an
atomic_read(), so I imagine it won't start showing up as a degradation
until the number of zpools is significant.

>
> > > Architecturally, the pool scoping and the interaction with zswap_tree
> > > is currently a mess. We're aware of lifetime bugs, where swapoff kills
> > > the tree but the pool still exists with entries making dead references
> > > e.g. We likely need to rearchitect this in the near future - maybe tie
> > > pools to trees to begin with.
> > >
> > > (I'm assuming you're already using multiple swap files to avoid tree
> > > lock contention, because it has the same scope as the pool otherwise.)
> > >
> > > Such changes quickly invalidate any settings the user or distro might
> > > make here. Exposing the implementation detail of the pools might even
> > > get in the way of fixing bugs and cleaning up the architecture.
> >
> > I was under the impression that config options are not very stable.
> > IOW, if we fix the lock contention on an architectural level, and
> > there is no more benefit to tuning the number of zpools per zswap
> > pool, we can remove the config option. Is this incorrect?
>
> The problem is that it complicates the code for everybody, for the
> benefit of a likely very small set of users - who have now opted out
> of any need for making the code better.
>
> And we have to keep this, and work around it, until things work for
> thosee users who have no incentive to address the underlying issues.
>
> That's difficult to justify.

I agree that it makes the code less digestible, but is it unbearably
terrible? I think it's arguably within reason.

>
> > > I don't think this patch is ready for primetime. A user build time
> > > setting is not appropriate for an optimization that is heavily tied to
> > > implementation details and workload dynamics.
> >
> > What would you suggest instead? We do find value in having multiple
> > zpools, at least for the current architecture.
>
> I think it makes sense to look closer at the lock contention, and
> whether this can be improved more generically.
>
> zbud and z3fold don't have a lock in the ->map callback that heavily
> shows in the profile in your changelog. Is this zsmalloc specific?

We have observed this with zsmalloc, yes.

>
> If so, looking more closely at zsmalloc locking makes more sense. Is a
> lockless implementation feasible? Can we do rw-locking against
> zs_compact() to allow concurrency in zs_map_object()?

+Sergey Senozhatsky +Minchan Kim

I am not really sure, but it's not just zs_map_object(), it's also
zs_malloc() and I suspect other functions as well.

>
> This would benefit everybody, even zram users.
>
> Again, what about the zswap_tree.lock and swap_info_struct.lock?
> They're the same scope unless you use multiple swap files. Would it
> make sense to tie pools to trees, so that using multiple swapfiles for
> concurrency purposes also implies this optimization?

Yeah, using multiple swapfiles helps with those locks, but it doesn't
help with the zpool lock.

I am reluctant to take this path because I am trying to get rid of
zswap's dependency on swapfiles to begin with, and have it act as its
own standalone swapping backend. If I am successful, then having one
zpool per zswap_tree is just a temporary fix.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-02 16:59       ` Yosry Ahmed
@ 2023-06-02 18:34         ` Johannes Weiner
  2023-06-02 19:14           ` Yosry Ahmed
  0 siblings, 1 reply; 14+ messages in thread
From: Johannes Weiner @ 2023-06-02 18:34 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Fri, Jun 02, 2023 at 09:59:20AM -0700, Yosry Ahmed wrote:
> On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > Again, what about the zswap_tree.lock and swap_info_struct.lock?
> > They're the same scope unless you use multiple swap files. Would it
> > make sense to tie pools to trees, so that using multiple swapfiles for
> > concurrency purposes also implies this optimization?
> 
> Yeah, using multiple swapfiles helps with those locks, but it doesn't
> help with the zpool lock.
> 
> I am reluctant to take this path because I am trying to get rid of
> zswap's dependency on swapfiles to begin with, and have it act as its
> own standalone swapping backend. If I am successful, then having one
> zpool per zswap_tree is just a temporary fix.

What about making the pools per-cpu?

This would scale nicely with the machine size. And we commonly deal
with for_each_cpu() loops and per-cpu data structures, so have good
developer intuition about what's reasonable to squeeze into those.

It would eliminate the lock contention, for everybody, right away, and
without asking questions.

It would open the door to all kinds of locking optimizations on top.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-02 18:34         ` Johannes Weiner
@ 2023-06-02 19:14           ` Yosry Ahmed
  2023-06-02 20:24             ` Johannes Weiner
  0 siblings, 1 reply; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-02 19:14 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Fri, Jun 2, 2023 at 11:34 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Jun 02, 2023 at 09:59:20AM -0700, Yosry Ahmed wrote:
> > On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > Again, what about the zswap_tree.lock and swap_info_struct.lock?
> > > They're the same scope unless you use multiple swap files. Would it
> > > make sense to tie pools to trees, so that using multiple swapfiles for
> > > concurrency purposes also implies this optimization?
> >
> > Yeah, using multiple swapfiles helps with those locks, but it doesn't
> > help with the zpool lock.
> >
> > I am reluctant to take this path because I am trying to get rid of
> > zswap's dependency on swapfiles to begin with, and have it act as its
> > own standalone swapping backend. If I am successful, then having one
> > zpool per zswap_tree is just a temporary fix.
>
> What about making the pools per-cpu?
>
> This would scale nicely with the machine size. And we commonly deal
> with for_each_cpu() loops and per-cpu data structures, so have good
> developer intuition about what's reasonable to squeeze into those.
>
> It would eliminate the lock contention, for everybody, right away, and
> without asking questions.
>
> It would open the door to all kinds of locking optimizations on top.

The page can get swapped out on one cpu and swapped in on another, no?

We will need to record in the zswap entry which zpool the page is
stored in, and potentially grab percpu locks from other cpus in the
swap in path. The lock contention would probably be less, but certainly
not eliminated.

Did I misunderstand?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-02 19:14           ` Yosry Ahmed
@ 2023-06-02 20:24             ` Johannes Weiner
  2023-06-06  1:56               ` Yosry Ahmed
  0 siblings, 1 reply; 14+ messages in thread
From: Johannes Weiner @ 2023-06-02 20:24 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Fri, Jun 02, 2023 at 12:14:28PM -0700, Yosry Ahmed wrote:
> On Fri, Jun 2, 2023 at 11:34 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Jun 02, 2023 at 09:59:20AM -0700, Yosry Ahmed wrote:
> > > On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > Again, what about the zswap_tree.lock and swap_info_struct.lock?
> > > > They're the same scope unless you use multiple swap files. Would it
> > > > make sense to tie pools to trees, so that using multiple swapfiles for
> > > > concurrency purposes also implies this optimization?
> > >
> > > Yeah, using multiple swapfiles helps with those locks, but it doesn't
> > > help with the zpool lock.
> > >
> > > I am reluctant to take this path because I am trying to get rid of
> > > zswap's dependency on swapfiles to begin with, and have it act as its
> > > own standalone swapping backend. If I am successful, then having one
> > > zpool per zswap_tree is just a temporary fix.
> >
> > What about making the pools per-cpu?
> >
> > This would scale nicely with the machine size. And we commonly deal
> > with for_each_cpu() loops and per-cpu data structures, so have good
> > developer intuition about what's reasonable to squeeze into those.
> >
> > It would eliminate the lock contention, for everybody, right away, and
> > without asking questions.
> >
> > It would open the door to all kinds of locking optimizations on top.
> 
> The page can get swapped out on one cpu and swapped in on another, no?
> 
> We will need to store which zpool the page is stored in in its zswap
> entry, and potentially grab percpu locks from other cpus in the swap
> in path. The lock contention would probably be less, but certainly not
> eliminated.
> 
> Did I misunderstand?

Sorry, I should have been more precise.

I'm saying that using NR_CPUS pools, and replacing the hash with
smp_processor_id(), would accomplish your goal of pool concurrency.
But it would do so with a broadly-used, well-understood scaling
factor. We might not need a config option at all.
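
A minimal sketch of that (assuming zswap_pool carries an array of zpools
sized by NR_CPUS; the helper name is made up):

#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/zpool.h>

struct zswap_pool {
        struct zpool *zpools[NR_CPUS];  /* sketch: one zpool per possible CPU */
};

static struct zpool *zswap_zpool_for_store(struct zswap_pool *pool)
{
        /* smp_processor_id() requires preemption to be disabled here */
        return pool->zpools[smp_processor_id()];
}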

The lock would still be there, but contention would be reduced fairly
optimally (barring preemption) for store concurrency at least. Not
fully eliminated due to frees and compaction, though, yes.

I'm not proposing more than that at this point. I only wrote the last
line because already having per-cpu data structures might help with
fast path optimizations down the line, if contention is still an
issue. But unlikely. So it's not so important. Let's forget it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-02 20:24             ` Johannes Weiner
@ 2023-06-06  1:56               ` Yosry Ahmed
  2023-06-13 20:13                 ` Yosry Ahmed
  0 siblings, 1 reply; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-06  1:56 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Jun 02, 2023 at 12:14:28PM -0700, Yosry Ahmed wrote:
> > On Fri, Jun 2, 2023 at 11:34 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Fri, Jun 02, 2023 at 09:59:20AM -0700, Yosry Ahmed wrote:
> > > > On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > Again, what about the zswap_tree.lock and swap_info_struct.lock?
> > > > > They're the same scope unless you use multiple swap files. Would it
> > > > > make sense to tie pools to trees, so that using multiple swapfiles for
> > > > > concurrency purposes also implies this optimization?
> > > >
> > > > Yeah, using multiple swapfiles helps with those locks, but it doesn't
> > > > help with the zpool lock.
> > > >
> > > > I am reluctant to take this path because I am trying to get rid of
> > > > zswap's dependency on swapfiles to begin with, and have it act as its
> > > > own standalone swapping backend. If I am successful, then having one
> > > > zpool per zswap_tree is just a temporary fix.
> > >
> > > What about making the pools per-cpu?
> > >
> > > This would scale nicely with the machine size. And we commonly deal
> > > with for_each_cpu() loops and per-cpu data structures, so have good
> > > developer intuition about what's reasonable to squeeze into those.
> > >
> > > It would eliminate the lock contention, for everybody, right away, and
> > > without asking questions.
> > >
> > > It would open the door to all kinds of locking optimizations on top.
> >
> > The page can get swapped out on one cpu and swapped in on another, no?
> >
> > We will need to store which zpool the page is stored in in its zswap
> > entry, and potentially grab percpu locks from other cpus in the swap
> > in path. The lock contention would probably be less, but certainly not
> > eliminated.
> >
> > Did I misunderstand?
>
> Sorry, I should have been more precise.
>
> I'm saying that using NR_CPUS pools, and replacing the hash with
> smp_processor_id(), would accomplish your goal of pool concurrency.
> But it would do so with a broadly-used, well-understood scaling
> factor. We might not need a config option at all.
>
> The lock would still be there, but contention would be reduced fairly
> optimally (barring preemption) for store concurrency at least. Not
> fully eliminated due to frees and compaction, though, yes.

Yeah I think we can do that. I looked at the size of the zsmalloc pool
as an example, and it seems to be less than 1K, so having one per CPU
seems okay.

There are a few things that we will need to do:
- Rework zswap_update_total_size(). We don't want to loop through all
cpus on each load/store. We can be smarter about it and inc/dec the
total zswap pool size each time we allocate or free a page in the
driver. This might need some plumbing from the drivers to zswap (or
passing a callback from zswap to the drivers); see the sketch after
this list.

- Update zsmalloc such that all pools share kmem caches, instead of
creating two kmem caches per zsmalloc pool. This was a follow-up I
had in mind for multiple zpools support anyway, but I guess it's more
significant if we have NR_CPUS pools.
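
For the first point, a rough sketch of the counter-based tracking (the
counter and the driver hooks are assumptions, not existing interfaces;
zswap_pool_total_size is the byte counter that zswap_update_total_size()
maintains today):

#include <linux/atomic.h>
#include <linux/mm.h>

extern u64 zswap_pool_total_size;               /* existing zswap stat, in bytes */

static atomic_long_t zswap_backing_pages;       /* hypothetical running counter */

/* called back from the zpool driver whenever it allocates a backing page */
static void zswap_backing_page_alloc(void)
{
        atomic_long_inc(&zswap_backing_pages);
        zswap_pool_total_size =
                (u64)atomic_long_read(&zswap_backing_pages) << PAGE_SHIFT;
}

/* ...and whenever it frees one */
static void zswap_backing_page_free(void)
{
        atomic_long_dec(&zswap_backing_pages);
        zswap_pool_total_size =
                (u64)atomic_long_read(&zswap_backing_pages) << PAGE_SHIFT;
}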

I was nervous about increasing the size of struct zswap_entry to store
the cpu/zpool where the entry resides, but I realized we can replace
the pointer to zswap_pool in struct zswap_entry with a pointer to
zpool, and add a zswap_pool pointer in struct zpool. This will
actually trim down the common "entry->pool->zpool" to just
"entry->zpool", and then we can replace any "entry->pool" with
"entry->zpool->pool".

@Yu Zhao, any thoughts on this? The multiple zpools support was
initially your idea (and you did the initial implementation) -- so your
input is very valuable here.

>
> I'm not proposing more than that at this point. I only wrote the last
> line because already having per-cpu data structures might help with
> fast path optimizations down the line, if contention is still an
> issue. But unlikely. So it's not so important. Let's forget it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-06  1:56               ` Yosry Ahmed
@ 2023-06-13 20:13                 ` Yosry Ahmed
  2023-06-14 14:59                   ` Johannes Weiner
  0 siblings, 1 reply; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-13 20:13 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Fri, Jun 02, 2023 at 12:14:28PM -0700, Yosry Ahmed wrote:
> > > On Fri, Jun 2, 2023 at 11:34 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Fri, Jun 02, 2023 at 09:59:20AM -0700, Yosry Ahmed wrote:
> > > > > On Fri, Jun 2, 2023 at 9:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > > Again, what about the zswap_tree.lock and swap_info_struct.lock?
> > > > > > They're the same scope unless you use multiple swap files. Would it
> > > > > > make sense to tie pools to trees, so that using multiple swapfiles for
> > > > > > concurrency purposes also implies this optimization?
> > > > >
> > > > > Yeah, using multiple swapfiles helps with those locks, but it doesn't
> > > > > help with the zpool lock.
> > > > >
> > > > > I am reluctant to take this path because I am trying to get rid of
> > > > > zswap's dependency on swapfiles to begin with, and have it act as its
> > > > > own standalone swapping backend. If I am successful, then having one
> > > > > zpool per zswap_tree is just a temporary fix.
> > > >
> > > > What about making the pools per-cpu?
> > > >
> > > > This would scale nicely with the machine size. And we commonly deal
> > > > with for_each_cpu() loops and per-cpu data structures, so have good
> > > > developer intuition about what's reasonable to squeeze into those.
> > > >
> > > > It would eliminate the lock contention, for everybody, right away, and
> > > > without asking questions.
> > > >
> > > > It would open the door to all kinds of locking optimizations on top.
> > >
> > > The page can get swapped out on one cpu and swapped in on another, no?
> > >
> > > We will need to store which zpool the page is stored in in its zswap
> > > entry, and potentially grab percpu locks from other cpus in the swap
> > > in path. The lock contention would probably be less, but certainly not
> > > eliminated.
> > >
> > > Did I misunderstand?
> >
> > Sorry, I should have been more precise.
> >
> > I'm saying that using NR_CPUS pools, and replacing the hash with
> > smp_processor_id(), would accomplish your goal of pool concurrency.
> > But it would do so with a broadly-used, well-understood scaling
> > factor. We might not need a config option at all.
> >
> > The lock would still be there, but contention would be reduced fairly
> > optimally (barring preemption) for store concurrency at least. Not
> > fully eliminated due to frees and compaction, though, yes.

I thought about this again and had some internal discussions, and I am
more unsure about it. Beyond the memory overhead, having too many
zpools might result in higher fragmentation within the zpools. For
zsmalloc, we do not compact across multiple zpools for example.

We have been using a specific number of zpools in our production for
years now, and we know it can be tuned to achieve performance gains.
OTOH, percpu zpools (or NR_CPUS pools) seem like too big of a hammer,
probably too many zpools in a lot of cases, and we wouldn't know how
many zpools actually fit our workloads.

I see value in allowing the number of zpools to be directly
configurable (it can always be left as 1), and am worried that with
percpu we would be throwing away years of production testing for an
unknown.

I am obviously biased, but I don't think this adds significant
complexity to the zswap code as-is (or as v2 is to be precise).

WDYT?

>
> Yeah I think we can do that. I looked at the size of the zsmalloc pool
> as an example, and it seems to be less than 1K, so having one per CPU
> seems okay.
>
> There are a few things that we will need to do:
> - Rework zswap_update_total_size(). We don't want to loop through all
> cpus on each load/store. We can be smarter about it and inc/dec the
> total zswap pool size each time we allocate or free a page in the
> driver. This might need some plumbing from the drivers to zswap (or
> passing a callback from zswap to the drivers).
>
> - Update zsmalloc such that all pools share kmem caches, instead of
> creating two kmem caches per zsmalloc pool. This was a follow-up I
> had in mind for multiple zpools support anyway, but I guess it's more
> significant if we have NR_CPUS pools.
>
> I was nervous about increasing the size of struct zswap_entry to store
> the cpu/zpool where the entry resides, but I realized we can replace
> the pointer to zswap_pool in struct zswap_entry with a pointer to
> zpool, and add a zswap_pool pointer in struct zpool. This will
> actually trim down the common "entry->pool->zpool" to just
> "entry->zpool", and then we can replace any "entry->pool" with
> "entry->zpool->pool".
>
> @Yu Zhao, any thoughts on this? The multiple zpools support was
> initially your idea (and you did the initial implementation) -- so your
> input is very valuable here.
>
> >
> > I'm not proposing more than that at this point. I only wrote the last
> > line because already having per-cpu data structures might help with
> > fast path optimizations down the line, if contention is still an
> > issue. But unlikely. So it's not so important. Let's forget it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-13 20:13                 ` Yosry Ahmed
@ 2023-06-14 14:59                   ` Johannes Weiner
  2023-06-14 20:50                     ` Yosry Ahmed
  0 siblings, 1 reply; 14+ messages in thread
From: Johannes Weiner @ 2023-06-14 14:59 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Tue, Jun 13, 2023 at 01:13:59PM -0700, Yosry Ahmed wrote:
> On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > Sorry, I should have been more precise.
> > >
> > > I'm saying that using NR_CPUS pools, and replacing the hash with
> > > smp_processor_id(), would accomplish your goal of pool concurrency.
> > > But it would do so with a broadly-used, well-understood scaling
> > > factor. We might not need a config option at all.
> > >
> > > The lock would still be there, but contention would be reduced fairly
> > > optimally (barring preemption) for store concurrency at least. Not
> > > fully eliminated due to frees and compaction, though, yes.
> 
> I thought about this again and had some internal discussions, and I am
> more unsure about it. Beyond the memory overhead, having too many
> zpools might result in higher fragmentation within the zpools. For
> zsmalloc, we do not compact across multiple zpools for example.
> 
> We have been using a specific number of zpools in our production for
> years now, and we know it can be tuned to achieve performance gains.
> OTOH, percpu zpools (or NR_CPUS pools) seem like too big of a hammer,
> probably too many zpools in a lot of cases, and we wouldn't know how
> many zpools actually fit our workloads.

Is it the same number across your entire fleet and all workloads?

How large *is* the number in relation to CPUs?

> I see value in allowing the number of zpools to be directly
> configurable (it can always be left as 1), and am worried that with
> percpu we would be throwing away years of production testing for an
> unknown.
> 
> I am obviously biased, but I don't think this adds significant
> complexity to the zswap code as-is (or as v2 is to be precise).

I had typed out this long list of reasons why I hate this change, and
then deleted it to suggest the per-cpu scaling factor.

But to summarize my POV, I think a user-facing config option for this
is completely inappropriate. There are no limits, no guidance, no sane
default. And it's very selective about micro-optimizing this one lock
when there are several locks and datastructures of the same scope in
the swap path. This isn't a reasonable question to ask people building
kernels. It's writing code through the Kconfig file.

Data structure scalability should be solved in code, not with config
options.

My vote on the patch as proposed is NAK.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-14 14:59                   ` Johannes Weiner
@ 2023-06-14 20:50                     ` Yosry Ahmed
  2023-06-20 19:48                       ` Yosry Ahmed
  0 siblings, 1 reply; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-14 20:50 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Wed, Jun 14, 2023 at 7:59 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jun 13, 2023 at 01:13:59PM -0700, Yosry Ahmed wrote:
> > On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > Sorry, I should have been more precise.
> > > >
> > > > I'm saying that using NR_CPUS pools, and replacing the hash with
> > > > smp_processor_id(), would accomplish your goal of pool concurrency.
> > > > But it would do so with a broadly-used, well-understood scaling
> > > > factor. We might not need a config option at all.
> > > >
> > > > The lock would still be there, but contention would be reduced fairly
> > > > optimally (barring preemption) for store concurrency at least. Not
> > > > fully eliminated due to frees and compaction, though, yes.
> >
> > I thought about this again and had some internal discussions, and I am
> > more unsure about it. Beyond the memory overhead, having too many
> > zpools might result in higher fragmentation within the zpools. For
> > zsmalloc, we do not compact across multiple zpools for example.
> >
> > We have been using a specific number of zpools in our production for
> > years now, and we know it can be tuned to achieve performance gains.
> > OTOH, percpu zpools (or NR_CPUS pools) seem like too big of a hammer,
> > probably too many zpools in a lot of cases, and we wouldn't know how
> > many zpools actually fit our workloads.
>
> Is it the same number across your entire fleet and all workloads?

Yes.

>
> How large *is* the number in relation to CPUs?

The ratio differs based on the number of cpus on the machine; we use
32 zpools on all machines.

>
> > I see value in allowing the number of zpools to be directly
> > configurable (it can always be left as 1), and am worried that with
> > percpu we would be throwing away years of production testing for an
> > unknown.
> >
> > I am obviously biased, but I don't think this adds significant
> > complexity to the zswap code as-is (or as v2 is to be precise).
>
> I had typed out this long list of reasons why I hate this change, and
> then deleted it to suggest the per-cpu scaling factor.
>
> But to summarize my POV, I think a user-facing config option for this
> is completely inappropriate. There are no limits, no guidance, no sane
> default. And it's very selective about micro-optimizing this one lock
> when there are several locks and datastructures of the same scope in
> the swap path. This isn't a reasonable question to ask people building
> kernels. It's writing code through the Kconfig file.

It's not just the swap path; it's any contention that happens within the
zpool between its different operations (map, alloc, compaction, etc.).
My thought was that if a user observes high contention in any of the
zpool operations, they can increase the number of zpools -- basically
this should be empirically decided. If unsure, the user can just leave
it as a single zpool.

>
> Data structure scalability should be solved in code, not with config
> options.

I agree, but until we have a more fundamental architectural solution,
having multiple zpools to address scalability is a win. We can remove
the config option later if needed.

>
> My vote on the patch as proposed is NAK.

I hear the argument about the config option not being ideal here, but
NR_CPUS is also not ideal.

How about if we introduce it as a constant in the kernel? We have a
lot of other constants around the kernel that do not scale with the
machine size (e.g. SWAP_CLUSTER_MAX). We can start with 32, which is a
value that we have tested in our data centers for many years now and
know to work well. We can revisit later if needed.
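
For concreteness, a sketch of the constant-based variant (the names and
the helper are illustrative, not the final patch):

#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/zpool.h>

#define ZSWAP_NR_ZPOOLS 32      /* kept a power of 2 so the hash stays cheap */

struct zswap_entry;

struct zswap_pool {
        struct zpool *zpools[ZSWAP_NR_ZPOOLS];
        /* ...other members unchanged... */
};

/* deterministically map an entry to one of the pool's zpools */
static struct zpool *zswap_find_zpool(struct zswap_pool *pool,
                                      struct zswap_entry *entry)
{
        return pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
}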

WDYT?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: zswap: multiple zpool support
  2023-06-14 20:50                     ` Yosry Ahmed
@ 2023-06-20 19:48                       ` Yosry Ahmed
  0 siblings, 0 replies; 14+ messages in thread
From: Yosry Ahmed @ 2023-06-20 19:48 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Sergey Senozhatsky, Minchan Kim, Konrad Rzeszutek Wilk,
	Andrew Morton, Seth Jennings, Dan Streetman, Vitaly Wool,
	Nhat Pham, Domenico Cerasuolo, Yu Zhao, linux-mm, linux-kernel

On Wed, Jun 14, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Wed, Jun 14, 2023 at 7:59 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Tue, Jun 13, 2023 at 01:13:59PM -0700, Yosry Ahmed wrote:
> > > On Mon, Jun 5, 2023 at 6:56 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > On Fri, Jun 2, 2023 at 1:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > Sorry, I should have been more precise.
> > > > >
> > > > > I'm saying that using NR_CPUS pools, and replacing the hash with
> > > > > smp_processor_id(), would accomplish your goal of pool concurrency.
> > > > > But it would do so with a broadly-used, well-understood scaling
> > > > > factor. We might not need a config option at all.
> > > > >
> > > > > The lock would still be there, but contention would be reduced fairly
> > > > > optimally (barring preemption) for store concurrency at least. Not
> > > > > fully eliminated due to frees and compaction, though, yes.
> > >
> > > I thought about this again and had some internal discussions, and I am
> > > more unsure about it. Beyond the memory overhead, having too many
> > > zpools might result in higher fragmentation within the zpools. For
> > > zsmalloc, we do not compact across multiple zpools for example.
> > >
> > > We have been using a specific number of zpools in our production for
> > > years now, and we know it can be tuned to achieve performance gains.
> > > OTOH, percpu zpools (or NR_CPUS pools) seem like too big of a hammer,
> > > probably too many zpools in a lot of cases, and we wouldn't know how
> > > many zpools actually fit our workloads.
> >
> > Is it the same number across your entire fleet and all workloads?
>
> Yes.
>
> >
> > How large *is* the number in relation to CPUs?
>
> The ratio differs based on the number of cpus on the machine; we use
> 32 zpools on all machines.
>
> >
> > > I see value in allowing the number of zpools to be directly
> > > configurable (it can always be left as 1), and am worried that with
> > > percpu we would be throwing away years of production testing for an
> > > unknown.
> > >
> > > I am obviously biased, but I don't think this adds significant
> > > complexity to the zswap code as-is (or as v2 is to be precise).
> >
> > I had typed out this long list of reasons why I hate this change, and
> > then deleted it to suggest the per-cpu scaling factor.
> >
> > But to summarize my POV, I think a user-facing config option for this
> > is completely inappropriate. There are no limits, no guidance, no sane
> > default. And it's very selective about micro-optimizing this one lock
> > when there are several locks and datastructures of the same scope in
> > the swap path. This isn't a reasonable question to ask people building
> > kernels. It's writing code through the Kconfig file.
>
> It's not just the swap path; it's any contention that happens within the
> zpool between its different operations (map, alloc, compaction, etc.).
> My thought was that if a user observes high contention in any of the
> zpool operations, they can increase the number of zpools -- basically
> this should be empirically decided. If unsure, the user can just leave
> it as a single zpool.
>
> >
> > Data structure scalability should be solved in code, not with config
> > options.
>
> I agree, but until we have a more fundamental architectural solution,
> having multiple zpools to address scalability is a win. We can remove
> the config option later if needed.
>
> >
> > My vote on the patch as proposed is NAK.
>
> I hear the argument about the config option not being ideal here, but
> NR_CPUS is also not ideal.
>
> How about if we introduce it as a constant in the kernel? We have a
> lot of other constants around the kernel that do not scale with the
> machine size (e.g. SWAP_CLUSTER_MAX). We can start with 32, which is a
> value that we have tested in our data centers for many years now and
> know to work well. We can revisit later if needed.
>
> WDYT?

I sent v3 [1] with the proposed constant instead of a config option;
hopefully this is more acceptable.

[1]https://lore.kernel.org/lkml/20230620194644.3142384-1-yosryahmed@google.com/

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2023-06-20 19:49 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-31  2:29 [PATCH] mm: zswap: multiple zpool support Yosry Ahmed
2023-05-31  2:32 ` Yosry Ahmed
2023-06-01 15:58 ` Johannes Weiner
2023-06-01 17:51   ` Yosry Ahmed
2023-06-02 16:49     ` Johannes Weiner
2023-06-02 16:59       ` Yosry Ahmed
2023-06-02 18:34         ` Johannes Weiner
2023-06-02 19:14           ` Yosry Ahmed
2023-06-02 20:24             ` Johannes Weiner
2023-06-06  1:56               ` Yosry Ahmed
2023-06-13 20:13                 ` Yosry Ahmed
2023-06-14 14:59                   ` Johannes Weiner
2023-06-14 20:50                     ` Yosry Ahmed
2023-06-20 19:48                       ` Yosry Ahmed
