linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] mm: zswap: multiple zpools support
@ 2023-06-20 19:46 Yosry Ahmed
  2023-07-09 23:12 ` Nhat Pham
  0 siblings, 1 reply; 9+ messages in thread
From: Yosry Ahmed @ 2023-06-20 19:46 UTC (permalink / raw)
  To: Andrew Morton, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool
  Cc: Johannes Weiner, Nhat Pham, Domenico Cerasuolo, Yu Zhao,
	linux-mm, linux-kernel, Yosry Ahmed

Support using multiple zpools of the same type in zswap, for concurrency
purposes. A fixed number of 32 zpools is suggested by this commit, which
was determined empirically. It can be later changed or made into a
config option if needed.

On a setup with zswap and zsmalloc, comparing a single zpool to 32
zpools shows improvements in the zsmalloc lock contention, especially on
the swap out path.

The following shows the perf analysis of the swapout path when 10
workloads are simultaneously reclaiming and refaulting tmpfs pages.
There are some improvements on the swap in path as well, but less
significant.

1 zpool:

 |--28.99%--zswap_frontswap_store
       |
       <snip>
       |
       |--8.98%--zpool_map_handle
       |     |
       |      --8.98%--zs_zpool_map
       |           |
       |            --8.95%--zs_map_object
       |                 |
       |                  --8.38%--_raw_spin_lock
       |                       |
       |                        --7.39%--queued_spin_lock_slowpath
       |
       |--8.82%--zpool_malloc
       |     |
       |      --8.82%--zs_zpool_malloc
       |           |
       |            --8.80%--zs_malloc
       |                 |
       |                 |--7.21%--_raw_spin_lock
       |                 |     |
       |                 |      --6.81%--queued_spin_lock_slowpath
       <snip>

32 zpools:

 |--16.73%--zswap_frontswap_store
       |
       <snip>
       |
       |--1.81%--zpool_malloc
       |     |
       |      --1.81%--zs_zpool_malloc
       |           |
       |            --1.79%--zs_malloc
       |                 |
       |                  --0.73%--obj_malloc
       |
       |--1.06%--zswap_update_total_size
       |
       |--0.59%--zpool_map_handle
       |     |
       |      --0.59%--zs_zpool_map
       |           |
       |            --0.57%--zs_map_object
       |                 |
       |                  --0.51%--_raw_spin_lock
       <snip>

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---

v2 -> v3:
- Removed config option (Johannes Weiner). Now it's a constant.
- Fixed spelling typos (Yu Zhao).

v1 -> v2:
- Prettified perf graph in commit log.
- Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
  fixed size array instead of a flex array.
- Removed stale comment.

---
 mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 87b204233115..6ee7028497b8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
 		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
 module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
 
+/* Number of zpools in zswap_pool (empirically determined for scalability) */
+#define ZSWAP_NR_ZPOOLS 32
+
 /*********************************
 * data structures
 **********************************/
@@ -161,7 +164,7 @@ struct crypto_acomp_ctx {
  * needs to be verified that it's still valid in the tree.
  */
 struct zswap_pool {
-	struct zpool *zpool;
+	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
 	struct kref kref;
 	struct list_head list;
@@ -248,7 +251,7 @@ static bool zswap_has_pool;
 
 #define zswap_pool_debug(msg, p)				\
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
-		 zpool_get_type((p)->zpool))
+		 zpool_get_type((p)->zpools[0]))
 
 static int zswap_writeback_entry(struct zswap_entry *entry,
 				 struct zswap_tree *tree);
@@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
 {
 	struct zswap_pool *pool;
 	u64 total = 0;
+	int i;
 
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		total += zpool_get_total_size(pool->zpool);
+		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+			total += zpool_get_total_size(pool->zpools[i]);
 
 	rcu_read_unlock();
 
@@ -363,6 +368,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 	}
 }
 
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+	int i = 0;
+
+	if (ZSWAP_NR_ZPOOLS > 1)
+		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+	return entry->pool->zpools[i];
+}
+
 /*
  * Carries out the common pattern of freeing and entry's zpool allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
@@ -379,7 +394,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
 		spin_lock(&entry->pool->lru_lock);
 		list_del(&entry->lru);
 		spin_unlock(&entry->pool->lru_lock);
-		zpool_free(entry->pool->zpool, entry->handle);
+		zpool_free(zswap_find_zpool(entry), entry->handle);
 		zswap_pool_put(entry->pool);
 	}
 	zswap_entry_cache_free(entry);
@@ -588,7 +603,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
 		if (strcmp(pool->tfm_name, compressor))
 			continue;
-		if (strcmp(zpool_get_type(pool->zpool), type))
+		/* all zpools share the same type */
+		if (strcmp(zpool_get_type(pool->zpools[0]), type))
 			continue;
 		/* if we can't get it, it's about to be destroyed */
 		if (!zswap_pool_get(pool))
@@ -692,6 +708,7 @@ static void shrink_worker(struct work_struct *w)
 
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
+	int i;
 	struct zswap_pool *pool;
 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -712,15 +729,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	if (!pool)
 		return NULL;
 
-	/* unique name for each pool specifically required by zsmalloc */
-	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+		/* unique name for each pool specifically required by zsmalloc */
+		snprintf(name, 38, "zswap%x",
+			 atomic_inc_return(&zswap_pools_count));
 
-	pool->zpool = zpool_create_pool(type, name, gfp);
-	if (!pool->zpool) {
-		pr_err("%s zpool not available\n", type);
-		goto error;
+		pool->zpools[i] = zpool_create_pool(type, name, gfp);
+		if (!pool->zpools[i]) {
+			pr_err("%s zpool not available\n", type);
+			goto error;
+		}
 	}
-	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
 
 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
@@ -752,8 +772,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
-	if (pool->zpool)
-		zpool_destroy_pool(pool->zpool);
+	while (i--)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 	return NULL;
 }
@@ -802,11 +822,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
 
 static void zswap_pool_destroy(struct zswap_pool *pool)
 {
+	int i;
+
 	zswap_pool_debug("destroying", pool);
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
-	zpool_destroy_pool(pool->zpool);
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 }
 
@@ -1070,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct page *page;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	struct zpool *pool = entry->pool->zpool;
+	struct zpool *pool = zswap_find_zpool(entry);
 
 	u8 *src, *tmp = NULL;
 	unsigned int dlen;
@@ -1211,6 +1234,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
 	struct zswap_pool *pool;
+	struct zpool *zpool;
 	int ret;
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle, value;
@@ -1321,10 +1345,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* store */
+	zpool = zswap_find_zpool(entry);
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(entry->pool->zpool))
+	if (zpool_malloc_support_movable(zpool))
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
+	ret = zpool_malloc(zpool, dlen, gfp, &handle);
 	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
 		goto put_dstmem;
@@ -1333,9 +1358,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		zswap_reject_alloc_fail++;
 		goto put_dstmem;
 	}
-	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(entry->pool->zpool, handle);
+	zpool_unmap_handle(zpool, handle);
 	mutex_unlock(acomp_ctx->mutex);
 
 	/* populate entry */
@@ -1406,6 +1431,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src, *dst, *tmp;
+	struct zpool *zpool;
 	unsigned int dlen;
 	int ret;
 
@@ -1427,7 +1453,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 		goto stats;
 	}
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	zpool = zswap_find_zpool(entry);
+	if (!zpool_can_sleep_mapped(zpool)) {
 		tmp = kmalloc(entry->length, GFP_KERNEL);
 		if (!tmp) {
 			ret = -ENOMEM;
@@ -1437,12 +1464,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	if (!zpool_can_sleep_mapped(zpool)) {
 		memcpy(tmp, src, entry->length);
 		src = tmp;
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1454,8 +1481,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	mutex_unlock(acomp_ctx->mutex);
 
-	if (zpool_can_sleep_mapped(entry->pool->zpool))
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	if (zpool_can_sleep_mapped(zpool))
+		zpool_unmap_handle(zpool, entry->handle);
 	else
 		kfree(tmp);
 
@@ -1616,7 +1643,7 @@ static int zswap_setup(void)
 	pool = __zswap_pool_create_fallback();
 	if (pool) {
 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
-			zpool_get_type(pool->zpool));
+			zpool_get_type(pool->zpools[0]));
 		list_add(&pool->list, &zswap_pools);
 		zswap_has_pool = true;
 	} else {
-- 
2.41.0.162.gfafddb0af9-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-06-20 19:46 [PATCH v3] mm: zswap: multiple zpools support Yosry Ahmed
@ 2023-07-09 23:12 ` Nhat Pham
  2023-07-13 10:35   ` Yosry Ahmed
  0 siblings, 1 reply; 9+ messages in thread
From: Nhat Pham @ 2023-07-09 23:12 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Andrew Morton, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool, Johannes Weiner, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Tue, Jun 20, 2023 at 12:46 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> Support using multiple zpools of the same type in zswap, for concurrency
> purposes. A fixed number of 32 zpools is suggested by this commit, which
> was determined empirically. It can be later changed or made into a
> config option if needed.
>
> On a setup with zswap and zsmalloc, comparing a single zpool to 32
> zpools shows improvements in the zsmalloc lock contention, especially on
> the swap out path.
>
> The following shows the perf analysis of the swapout path when 10
> workloads are simultaneously reclaiming and refaulting tmpfs pages.
> There are some improvements on the swap in path as well, but less
> significant.
>
> 1 zpool:
>
>  |--28.99%--zswap_frontswap_store
>        |
>        <snip>
>        |
>        |--8.98%--zpool_map_handle
>        |     |
>        |      --8.98%--zs_zpool_map
>        |           |
>        |            --8.95%--zs_map_object
>        |                 |
>        |                  --8.38%--_raw_spin_lock
>        |                       |
>        |                        --7.39%--queued_spin_lock_slowpath
>        |
>        |--8.82%--zpool_malloc
>        |     |
>        |      --8.82%--zs_zpool_malloc
>        |           |
>        |            --8.80%--zs_malloc
>        |                 |
>        |                 |--7.21%--_raw_spin_lock
>        |                 |     |
>        |                 |      --6.81%--queued_spin_lock_slowpath
>        <snip>
>
> 32 zpools:
>
>  |--16.73%--zswap_frontswap_store
>        |
>        <snip>
>        |
>        |--1.81%--zpool_malloc
>        |     |
>        |      --1.81%--zs_zpool_malloc
>        |           |
>        |            --1.79%--zs_malloc
>        |                 |
>        |                  --0.73%--obj_malloc
>        |
>        |--1.06%--zswap_update_total_size
>        |
>        |--0.59%--zpool_map_handle
>        |     |
>        |      --0.59%--zs_zpool_map
>        |           |
>        |            --0.57%--zs_map_object
>        |                 |
>        |                  --0.51%--_raw_spin_lock
>        <snip>
>
> Suggested-by: Yu Zhao <yuzhao@google.com>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>
> v2 -> v3:
> - Removed config option (Johannes Weiner). Now it's a constant.
> - Fixed spelling typos (Yu Zhao).
>
> v1 -> v2:
> - Prettified perf graph in commit log.
> - Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
>   fixed size array instead of a flex array.
> - Removed stale comment.
>
> ---
>  mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 54 insertions(+), 27 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 87b204233115..6ee7028497b8 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
>                 CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
>  module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
>
> +/* Number of zpools in zswap_pool (empirically determined for scalability) */
> +#define ZSWAP_NR_ZPOOLS 32
> +
>  /*********************************
>  * data structures
>  **********************************/
> @@ -161,7 +164,7 @@ struct crypto_acomp_ctx {
>   * needs to be verified that it's still valid in the tree.
>   */
>  struct zswap_pool {
> -       struct zpool *zpool;
> +       struct zpool *zpools[ZSWAP_NR_ZPOOLS];
>         struct crypto_acomp_ctx __percpu *acomp_ctx;
>         struct kref kref;
>         struct list_head list;
> @@ -248,7 +251,7 @@ static bool zswap_has_pool;
>
>  #define zswap_pool_debug(msg, p)                               \
>         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> -                zpool_get_type((p)->zpool))
> +                zpool_get_type((p)->zpools[0]))
>
>  static int zswap_writeback_entry(struct zswap_entry *entry,
>                                  struct zswap_tree *tree);
> @@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
>  {
>         struct zswap_pool *pool;
>         u64 total = 0;
> +       int i;
>
>         rcu_read_lock();
>
>         list_for_each_entry_rcu(pool, &zswap_pools, list)
> -               total += zpool_get_total_size(pool->zpool);
> +               for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> +                       total += zpool_get_total_size(pool->zpools[i]);
>
>         rcu_read_unlock();
>
> @@ -363,6 +368,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
>         }
>  }
>
> +static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> +{
> +       int i = 0;
> +
> +       if (ZSWAP_NR_ZPOOLS > 1)
> +               i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
> +
> +       return entry->pool->zpools[i];
> +}
> +
>  /*
>   * Carries out the common pattern of freeing and entry's zpool allocation,
>   * freeing the entry itself, and decrementing the number of stored pages.
> @@ -379,7 +394,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
>                 spin_lock(&entry->pool->lru_lock);
>                 list_del(&entry->lru);
>                 spin_unlock(&entry->pool->lru_lock);
> -               zpool_free(entry->pool->zpool, entry->handle);
> +               zpool_free(zswap_find_zpool(entry), entry->handle);
>                 zswap_pool_put(entry->pool);
>         }
>         zswap_entry_cache_free(entry);
> @@ -588,7 +603,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>         list_for_each_entry_rcu(pool, &zswap_pools, list) {
>                 if (strcmp(pool->tfm_name, compressor))
>                         continue;
> -               if (strcmp(zpool_get_type(pool->zpool), type))
> +               /* all zpools share the same type */
> +               if (strcmp(zpool_get_type(pool->zpools[0]), type))
>                         continue;
>                 /* if we can't get it, it's about to be destroyed */
>                 if (!zswap_pool_get(pool))
> @@ -692,6 +708,7 @@ static void shrink_worker(struct work_struct *w)
>
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  {
> +       int i;
>         struct zswap_pool *pool;
>         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
>         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> @@ -712,15 +729,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>         if (!pool)
>                 return NULL;
>
> -       /* unique name for each pool specifically required by zsmalloc */
> -       snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> +               /* unique name for each pool specifically required by zsmalloc */
> +               snprintf(name, 38, "zswap%x",
> +                        atomic_inc_return(&zswap_pools_count));
>
> -       pool->zpool = zpool_create_pool(type, name, gfp);
> -       if (!pool->zpool) {
> -               pr_err("%s zpool not available\n", type);
> -               goto error;
> +               pool->zpools[i] = zpool_create_pool(type, name, gfp);
> +               if (!pool->zpools[i]) {
> +                       pr_err("%s zpool not available\n", type);
> +                       goto error;
> +               }
>         }
> -       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
> +       pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
>
>         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
>
> @@ -752,8 +772,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> -       if (pool->zpool)
> -               zpool_destroy_pool(pool->zpool);
> +       while (i--)
> +               zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
>         return NULL;
>  }
> @@ -802,11 +822,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
>
>  static void zswap_pool_destroy(struct zswap_pool *pool)
>  {
> +       int i;
> +
>         zswap_pool_debug("destroying", pool);
>
>         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>         free_percpu(pool->acomp_ctx);
> -       zpool_destroy_pool(pool->zpool);
> +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> +               zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
>  }
>
> @@ -1070,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>         struct page *page;
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> -       struct zpool *pool = entry->pool->zpool;
> +       struct zpool *pool = zswap_find_zpool(entry);
>
>         u8 *src, *tmp = NULL;
>         unsigned int dlen;
> @@ -1211,6 +1234,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         struct crypto_acomp_ctx *acomp_ctx;
>         struct obj_cgroup *objcg = NULL;
>         struct zswap_pool *pool;
> +       struct zpool *zpool;
>         int ret;
>         unsigned int dlen = PAGE_SIZE;
>         unsigned long handle, value;
> @@ -1321,10 +1345,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         }
>
>         /* store */
> +       zpool = zswap_find_zpool(entry);
>         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> -       if (zpool_malloc_support_movable(entry->pool->zpool))
> +       if (zpool_malloc_support_movable(zpool))
>                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> -       ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
> +       ret = zpool_malloc(zpool, dlen, gfp, &handle);
>         if (ret == -ENOSPC) {
>                 zswap_reject_compress_poor++;
>                 goto put_dstmem;
> @@ -1333,9 +1358,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>                 zswap_reject_alloc_fail++;
>                 goto put_dstmem;
>         }
> -       buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
> +       buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
>         memcpy(buf, dst, dlen);
> -       zpool_unmap_handle(entry->pool->zpool, handle);
> +       zpool_unmap_handle(zpool, handle);
>         mutex_unlock(acomp_ctx->mutex);
>
>         /* populate entry */
> @@ -1406,6 +1431,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
>         u8 *src, *dst, *tmp;
> +       struct zpool *zpool;
>         unsigned int dlen;
>         int ret;
>
> @@ -1427,7 +1453,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>                 goto stats;
>         }
>
> -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +       zpool = zswap_find_zpool(entry);
> +       if (!zpool_can_sleep_mapped(zpool)) {
>                 tmp = kmalloc(entry->length, GFP_KERNEL);
>                 if (!tmp) {
>                         ret = -ENOMEM;
> @@ -1437,12 +1464,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>
>         /* decompress */
>         dlen = PAGE_SIZE;
> -       src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>
> -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +       if (!zpool_can_sleep_mapped(zpool)) {
>                 memcpy(tmp, src, entry->length);
>                 src = tmp;
> -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +               zpool_unmap_handle(zpool, entry->handle);
>         }
>
>         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -1454,8 +1481,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
>         mutex_unlock(acomp_ctx->mutex);
>
> -       if (zpool_can_sleep_mapped(entry->pool->zpool))
> -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +       if (zpool_can_sleep_mapped(zpool))
> +               zpool_unmap_handle(zpool, entry->handle);
>         else
>                 kfree(tmp);
>
> @@ -1616,7 +1643,7 @@ static int zswap_setup(void)
>         pool = __zswap_pool_create_fallback();
>         if (pool) {
>                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> -                       zpool_get_type(pool->zpool));
> +                       zpool_get_type(pool->zpools[0]));
>                 list_add(&pool->list, &zswap_pools);
>                 zswap_has_pool = true;
>         } else {
> --
> 2.41.0.162.gfafddb0af9-goog
>

In terms of correctness, the code LGTM.
However, I do share Johannes' concern about this change.

May I ask how sensitive is the system performance to the number of pools?
i.e during the tuning process, did you see lots of performance
variability as you vary the number of pools? Is 32 a number that
works well across workloads, hardware, etc?

Personally, I prefer the per-CPU pool idea - or some way to automatically
determine the number of pools that are more generic than this 32 value
(what if we have new hardware with more CPUs? Would 32 still be valid,
or do we have to change it?).

I'm experimenting with some other zswap changes - if I have
extra cycles and resources I'll try to apply this patch and see how the
numbers play out.

I'll defer to Johannes and other reviewers for further comments.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-07-09 23:12 ` Nhat Pham
@ 2023-07-13 10:35   ` Yosry Ahmed
  2023-08-11 21:19     ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Yosry Ahmed @ 2023-07-13 10:35 UTC (permalink / raw)
  To: Nhat Pham
  Cc: Andrew Morton, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool, Johannes Weiner, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Sun, Jul 9, 2023 at 4:12 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Jun 20, 2023 at 12:46 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > Support using multiple zpools of the same type in zswap, for concurrency
> > purposes. A fixed number of 32 zpools is suggested by this commit, which
> > was determined empirically. It can be later changed or made into a
> > config option if needed.
> >
> > On a setup with zswap and zsmalloc, comparing a single zpool to 32
> > zpools shows improvements in the zsmalloc lock contention, especially on
> > the swap out path.
> >
> > The following shows the perf analysis of the swapout path when 10
> > workloads are simultaneously reclaiming and refaulting tmpfs pages.
> > There are some improvements on the swap in path as well, but less
> > significant.
> >
> > 1 zpool:
> >
> >  |--28.99%--zswap_frontswap_store
> >        |
> >        <snip>
> >        |
> >        |--8.98%--zpool_map_handle
> >        |     |
> >        |      --8.98%--zs_zpool_map
> >        |           |
> >        |            --8.95%--zs_map_object
> >        |                 |
> >        |                  --8.38%--_raw_spin_lock
> >        |                       |
> >        |                        --7.39%--queued_spin_lock_slowpath
> >        |
> >        |--8.82%--zpool_malloc
> >        |     |
> >        |      --8.82%--zs_zpool_malloc
> >        |           |
> >        |            --8.80%--zs_malloc
> >        |                 |
> >        |                 |--7.21%--_raw_spin_lock
> >        |                 |     |
> >        |                 |      --6.81%--queued_spin_lock_slowpath
> >        <snip>
> >
> > 32 zpools:
> >
> >  |--16.73%--zswap_frontswap_store
> >        |
> >        <snip>
> >        |
> >        |--1.81%--zpool_malloc
> >        |     |
> >        |      --1.81%--zs_zpool_malloc
> >        |           |
> >        |            --1.79%--zs_malloc
> >        |                 |
> >        |                  --0.73%--obj_malloc
> >        |
> >        |--1.06%--zswap_update_total_size
> >        |
> >        |--0.59%--zpool_map_handle
> >        |     |
> >        |      --0.59%--zs_zpool_map
> >        |           |
> >        |            --0.57%--zs_map_object
> >        |                 |
> >        |                  --0.51%--_raw_spin_lock
> >        <snip>
> >
> > Suggested-by: Yu Zhao <yuzhao@google.com>
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> >
> > v2 -> v3:
> > - Removed config option (Johannes Weiner). Now it's a constant.
> > - Fixed spelling typos (Yu Zhao).
> >
> > v1 -> v2:
> > - Prettified perf graph in commit log.
> > - Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
> >   fixed size array instead of a flex array.
> > - Removed stale comment.
> >
> > ---
> >  mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
> >  1 file changed, 54 insertions(+), 27 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 87b204233115..6ee7028497b8 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
> >                 CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
> >  module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
> >
> > +/* Number of zpools in zswap_pool (empirically determined for scalability) */
> > +#define ZSWAP_NR_ZPOOLS 32
> > +
> >  /*********************************
> >  * data structures
> >  **********************************/
> > @@ -161,7 +164,7 @@ struct crypto_acomp_ctx {
> >   * needs to be verified that it's still valid in the tree.
> >   */
> >  struct zswap_pool {
> > -       struct zpool *zpool;
> > +       struct zpool *zpools[ZSWAP_NR_ZPOOLS];
> >         struct crypto_acomp_ctx __percpu *acomp_ctx;
> >         struct kref kref;
> >         struct list_head list;
> > @@ -248,7 +251,7 @@ static bool zswap_has_pool;
> >
> >  #define zswap_pool_debug(msg, p)                               \
> >         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> > -                zpool_get_type((p)->zpool))
> > +                zpool_get_type((p)->zpools[0]))
> >
> >  static int zswap_writeback_entry(struct zswap_entry *entry,
> >                                  struct zswap_tree *tree);
> > @@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
> >  {
> >         struct zswap_pool *pool;
> >         u64 total = 0;
> > +       int i;
> >
> >         rcu_read_lock();
> >
> >         list_for_each_entry_rcu(pool, &zswap_pools, list)
> > -               total += zpool_get_total_size(pool->zpool);
> > +               for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > +                       total += zpool_get_total_size(pool->zpools[i]);
> >
> >         rcu_read_unlock();
> >
> > @@ -363,6 +368,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
> >         }
> >  }
> >
> > +static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> > +{
> > +       int i = 0;
> > +
> > +       if (ZSWAP_NR_ZPOOLS > 1)
> > +               i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
> > +
> > +       return entry->pool->zpools[i];
> > +}
> > +
> >  /*
> >   * Carries out the common pattern of freeing and entry's zpool allocation,
> >   * freeing the entry itself, and decrementing the number of stored pages.
> > @@ -379,7 +394,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
> >                 spin_lock(&entry->pool->lru_lock);
> >                 list_del(&entry->lru);
> >                 spin_unlock(&entry->pool->lru_lock);
> > -               zpool_free(entry->pool->zpool, entry->handle);
> > +               zpool_free(zswap_find_zpool(entry), entry->handle);
> >                 zswap_pool_put(entry->pool);
> >         }
> >         zswap_entry_cache_free(entry);
> > @@ -588,7 +603,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> >         list_for_each_entry_rcu(pool, &zswap_pools, list) {
> >                 if (strcmp(pool->tfm_name, compressor))
> >                         continue;
> > -               if (strcmp(zpool_get_type(pool->zpool), type))
> > +               /* all zpools share the same type */
> > +               if (strcmp(zpool_get_type(pool->zpools[0]), type))
> >                         continue;
> >                 /* if we can't get it, it's about to be destroyed */
> >                 if (!zswap_pool_get(pool))
> > @@ -692,6 +708,7 @@ static void shrink_worker(struct work_struct *w)
> >
> >  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >  {
> > +       int i;
> >         struct zswap_pool *pool;
> >         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
> >         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> > @@ -712,15 +729,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >         if (!pool)
> >                 return NULL;
> >
> > -       /* unique name for each pool specifically required by zsmalloc */
> > -       snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> > +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> > +               /* unique name for each pool specifically required by zsmalloc */
> > +               snprintf(name, 38, "zswap%x",
> > +                        atomic_inc_return(&zswap_pools_count));
> >
> > -       pool->zpool = zpool_create_pool(type, name, gfp);
> > -       if (!pool->zpool) {
> > -               pr_err("%s zpool not available\n", type);
> > -               goto error;
> > +               pool->zpools[i] = zpool_create_pool(type, name, gfp);
> > +               if (!pool->zpools[i]) {
> > +                       pr_err("%s zpool not available\n", type);
> > +                       goto error;
> > +               }
> >         }
> > -       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
> > +       pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
> >
> >         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
> >
> > @@ -752,8 +772,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >  error:
> >         if (pool->acomp_ctx)
> >                 free_percpu(pool->acomp_ctx);
> > -       if (pool->zpool)
> > -               zpool_destroy_pool(pool->zpool);
> > +       while (i--)
> > +               zpool_destroy_pool(pool->zpools[i]);
> >         kfree(pool);
> >         return NULL;
> >  }
> > @@ -802,11 +822,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
> >
> >  static void zswap_pool_destroy(struct zswap_pool *pool)
> >  {
> > +       int i;
> > +
> >         zswap_pool_debug("destroying", pool);
> >
> >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> >         free_percpu(pool->acomp_ctx);
> > -       zpool_destroy_pool(pool->zpool);
> > +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > +               zpool_destroy_pool(pool->zpools[i]);
> >         kfree(pool);
> >  }
> >
> > @@ -1070,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >         struct page *page;
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> > -       struct zpool *pool = entry->pool->zpool;
> > +       struct zpool *pool = zswap_find_zpool(entry);
> >
> >         u8 *src, *tmp = NULL;
> >         unsigned int dlen;
> > @@ -1211,6 +1234,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >         struct crypto_acomp_ctx *acomp_ctx;
> >         struct obj_cgroup *objcg = NULL;
> >         struct zswap_pool *pool;
> > +       struct zpool *zpool;
> >         int ret;
> >         unsigned int dlen = PAGE_SIZE;
> >         unsigned long handle, value;
> > @@ -1321,10 +1345,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >         }
> >
> >         /* store */
> > +       zpool = zswap_find_zpool(entry);
> >         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> > -       if (zpool_malloc_support_movable(entry->pool->zpool))
> > +       if (zpool_malloc_support_movable(zpool))
> >                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> > -       ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
> > +       ret = zpool_malloc(zpool, dlen, gfp, &handle);
> >         if (ret == -ENOSPC) {
> >                 zswap_reject_compress_poor++;
> >                 goto put_dstmem;
> > @@ -1333,9 +1358,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >                 zswap_reject_alloc_fail++;
> >                 goto put_dstmem;
> >         }
> > -       buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
> > +       buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> >         memcpy(buf, dst, dlen);
> > -       zpool_unmap_handle(entry->pool->zpool, handle);
> > +       zpool_unmap_handle(zpool, handle);
> >         mutex_unlock(acomp_ctx->mutex);
> >
> >         /* populate entry */
> > @@ -1406,6 +1431,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> >         u8 *src, *dst, *tmp;
> > +       struct zpool *zpool;
> >         unsigned int dlen;
> >         int ret;
> >
> > @@ -1427,7 +1453,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >                 goto stats;
> >         }
> >
> > -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> > +       zpool = zswap_find_zpool(entry);
> > +       if (!zpool_can_sleep_mapped(zpool)) {
> >                 tmp = kmalloc(entry->length, GFP_KERNEL);
> >                 if (!tmp) {
> >                         ret = -ENOMEM;
> > @@ -1437,12 +1464,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >
> >         /* decompress */
> >         dlen = PAGE_SIZE;
> > -       src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> > +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >
> > -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> > +       if (!zpool_can_sleep_mapped(zpool)) {
> >                 memcpy(tmp, src, entry->length);
> >                 src = tmp;
> > -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> > +               zpool_unmap_handle(zpool, entry->handle);
> >         }
> >
> >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > @@ -1454,8 +1481,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
> >         mutex_unlock(acomp_ctx->mutex);
> >
> > -       if (zpool_can_sleep_mapped(entry->pool->zpool))
> > -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> > +       if (zpool_can_sleep_mapped(zpool))
> > +               zpool_unmap_handle(zpool, entry->handle);
> >         else
> >                 kfree(tmp);
> >
> > @@ -1616,7 +1643,7 @@ static int zswap_setup(void)
> >         pool = __zswap_pool_create_fallback();
> >         if (pool) {
> >                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> > -                       zpool_get_type(pool->zpool));
> > +                       zpool_get_type(pool->zpools[0]));
> >                 list_add(&pool->list, &zswap_pools);
> >                 zswap_has_pool = true;
> >         } else {
> > --
> > 2.41.0.162.gfafddb0af9-goog
> >
>
> In terms of correctness, the code LGTM.
> However, I do share Johannes' concern about this change.
>
> May I ask how sensitive is the system performance to the number of pools?
> i.e during the tuning process, did you see lots of performance
> variability as you vary the number of pools? Is 32 a number that
> works well across workloads, hardware, etc?

I did not tune this myself, it has been used in our fleet for many
years now, and honestly I am not sure what range of values was tried
out. For us, 32 is a number that works well across our entire fleet
(that uses zswap ofc).

>
> Personally, I prefer the per-CPU pool idea - or some way to automatically
> determine the number of pools that are more generic than this 32 value
> (what if we have new hardware with more CPUs? Would 32 still be valid,
> or do we have to change it?).

Ideally, we would have the number of zpools be a function of the
system specs, or even autotune based on the workload. However, I don't
think we have a clear idea about what this should look like. While a
constant value is suboptimal, we have multiple constants in MM that
seem to be working relatively well across different machines and
workloads (e.g. SWAP_CLUSTER_MAX) -- so it's not unheard of.

We have been using 32 zpools across different machines and workloads
for years now. I would be hesitant to throw away years of production
testing right away, without data to back that something else is
better. I would prefer to start with something that (at least in our
fleet) is proven to be good, and we can easily extend it later to
replace 32 with a more sophisticated formula or something that is
calculated at boot or even tuned by userspace. Having the support in
the code to have multiple zpools is valuable as-is imo.

>
> I'm experimenting with some other zswap changes - if I have
> extra cycles and resources I'll try to apply this patch and see how the
> numbers play out.

That would be amazing. Looking forward to any numbers you can dig :)

>
> I'll defer to Johannes and other reviewers for further comments.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-07-13 10:35   ` Yosry Ahmed
@ 2023-08-11 21:19     ` Andrew Morton
  2023-08-11 23:20       ` Yosry Ahmed
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2023-08-11 21:19 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Nhat Pham, Konrad Rzeszutek Wilk, Seth Jennings, Dan Streetman,
	Vitaly Wool, Johannes Weiner, Domenico Cerasuolo, Yu Zhao,
	linux-mm, linux-kernel

On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:

> >
> > I'm experimenting with some other zswap changes - if I have
> > extra cycles and resources I'll try to apply this patch and see how the
> > numbers play out.
> 
> That would be amazing. Looking forward to any numbers you can dig :)

So this patch seems stuck.  I can keep it in mm.git until the fog
clears, but would prefer not to.  Can we please revisit and decide on a
way forward?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-08-11 21:19     ` Andrew Morton
@ 2023-08-11 23:20       ` Yosry Ahmed
  2023-08-15 22:22         ` Chris Li
  2023-08-17 22:53         ` Nhat Pham
  0 siblings, 2 replies; 9+ messages in thread
From: Yosry Ahmed @ 2023-08-11 23:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nhat Pham, Konrad Rzeszutek Wilk, Seth Jennings, Dan Streetman,
	Vitaly Wool, Johannes Weiner, Domenico Cerasuolo, Yu Zhao,
	linux-mm, linux-kernel

On Fri, Aug 11, 2023 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:
>
> > >
> > > I'm experimenting with some other zswap changes - if I have
> > > extra cycles and resources I'll try to apply this patch and see how the
> > > numbers play out.
> >
> > That would be amazing. Looking forward to any numbers you can dig :)
>
> So this patch seems stuck.  I can keep it in mm.git until the fog
> clears, but would prefer not to.  Can we please revisit and decide on a
> way forward?

Johannes did not like a config option so I proposed it here as a
constant (like SWAP_CLUSTER_MAX and others we have). This is a value
that we have been using in our data centers for almost a decade, so it
has seen a ton of testing. I was hoping Johannes would get time to
take a look, or Nhat would get time to test it out, but neither of
these things happen.

I obviously want it to be merged, but hopefully someone will chime in here :)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-08-11 23:20       ` Yosry Ahmed
@ 2023-08-15 22:22         ` Chris Li
  2023-08-15 22:29           ` Yosry Ahmed
  2023-08-17 22:53         ` Nhat Pham
  1 sibling, 1 reply; 9+ messages in thread
From: Chris Li @ 2023-08-15 22:22 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Andrew Morton, Nhat Pham, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool, Johannes Weiner, Domenico Cerasuolo,
	Yu Zhao, linux-mm, LKML

Hi Yosry,

On Fri, Aug 11, 2023 at 4:21 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Fri, Aug 11, 2023 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > > >
> > > > I'm experimenting with some other zswap changes - if I have
> > > > extra cycles and resources I'll try to apply this patch and see how the
> > > > numbers play out.
> > >
> > > That would be amazing. Looking forward to any numbers you can dig :)
> >
> > So this patch seems stuck.  I can keep it in mm.git until the fog
> > clears, but would prefer not to.  Can we please revisit and decide on a
> > way forward?
>
> Johannes did not like a config option so I proposed it here as a
> constant (like SWAP_CLUSTER_MAX and others we have). This is a value
> that we have been using in our data centers for almost a decade, so it

I dug up the previous V1 discussion and this V3 discussion thread.
It seems obvious having multiple pools having locking contention advantage.
The number does not lie.

However the number of pools is hard to decide at compile time.

Regarding the per CPU pool. That might work well for a small number of CPUs.
When the system has many CPUs e.g. a few hundreds of CPUs. It means having
hundreds of pools which is  a bad idea.

How about just setting it as a run time value(size/bits) and can only
change pool
(size/bits) when zswap does not have any active stores.

Chris

> has seen a ton of testing. I was hoping Johannes would get time to
> take a look, or Nhat would get time to test it out, but neither of
> these things happen.
>
> I obviously want it to be merged, but hopefully someone will chime in here :)
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-08-15 22:22         ` Chris Li
@ 2023-08-15 22:29           ` Yosry Ahmed
  2023-08-15 22:59             ` Chris Li
  0 siblings, 1 reply; 9+ messages in thread
From: Yosry Ahmed @ 2023-08-15 22:29 UTC (permalink / raw)
  To: Chris Li
  Cc: Andrew Morton, Nhat Pham, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool, Johannes Weiner, Domenico Cerasuolo,
	Yu Zhao, linux-mm, LKML

On Tue, Aug 15, 2023 at 3:22 PM Chris Li <chrisl@kernel.org> wrote:
>
> Hi Yosry,
>
> On Fri, Aug 11, 2023 at 4:21 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Fri, Aug 11, 2023 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > > >
> > > > > I'm experimenting with some other zswap changes - if I have
> > > > > extra cycles and resources I'll try to apply this patch and see how the
> > > > > numbers play out.
> > > >
> > > > That would be amazing. Looking forward to any numbers you can dig :)
> > >
> > > So this patch seems stuck.  I can keep it in mm.git until the fog
> > > clears, but would prefer not to.  Can we please revisit and decide on a
> > > way forward?
> >
> > Johannes did not like a config option so I proposed it here as a
> > constant (like SWAP_CLUSTER_MAX and others we have). This is a value
> > that we have been using in our data centers for almost a decade, so it
>
> I dug up the previous V1 discussion and this V3 discussion thread.
> It seems obvious having multiple pools having locking contention advantage.
> The number does not lie.
>
> However the number of pools is hard to decide at compile time.
>
> Regarding the per CPU pool. That might work well for a small number of CPUs.
> When the system has many CPUs e.g. a few hundreds of CPUs. It means having
> hundreds of pools which is  a bad idea.
>
> How about just setting it as a run time value(size/bits) and can only
> change pool
> (size/bits) when zswap does not have any active stores.

I was hoping we can add the basic support here for multiple zpools,
and then later, if needed, extend to support runtime dynamic tuning.
Adding this will introduce more complexity as we will need to lock all
trees and make sure there is no activity and alloc/free zpools. If a
limitation for compile-time constant is observed we can do that,
otherwise let's keep it simple and incremental for now.

FWIW, we have been running with 32 zpools in Google's fleet for ~a
decade now and it seems to work well for various workloads and machine
configurations.

>
> Chris
>
> > has seen a ton of testing. I was hoping Johannes would get time to
> > take a look, or Nhat would get time to test it out, but neither of
> > these things happen.
> >
> > I obviously want it to be merged, but hopefully someone will chime in here :)
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-08-15 22:29           ` Yosry Ahmed
@ 2023-08-15 22:59             ` Chris Li
  0 siblings, 0 replies; 9+ messages in thread
From: Chris Li @ 2023-08-15 22:59 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Andrew Morton, Nhat Pham, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool, Johannes Weiner, Domenico Cerasuolo,
	Yu Zhao, linux-mm, LKML

Hi Yosry,

Sure.

Ack-by: Chris Li (Google) <chrisl@kernel.org>

Chris

On Tue, Aug 15, 2023 at 3:30 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> I was hoping we can add the basic support here for multiple zpools,
> and then later, if needed, extend to support runtime dynamic tuning.
> Adding this will introduce more complexity as we will need to lock all
> trees and make sure there is no activity and alloc/free zpools. If a
> limitation for compile-time constant is observed we can do that,
> otherwise let's keep it simple and incremental for now.
>
> FWIW, we have been running with 32 zpools in Google's fleet for ~a
> decade now and it seems to work well for various workloads and machine
> configurations.
>
> >
> > Chris
> >
> > > has seen a ton of testing. I was hoping Johannes would get time to
> > > take a look, or Nhat would get time to test it out, but neither of
> > > these things happen.
> > >
> > > I obviously want it to be merged, but hopefully someone will chime in here :)
> > >
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] mm: zswap: multiple zpools support
  2023-08-11 23:20       ` Yosry Ahmed
  2023-08-15 22:22         ` Chris Li
@ 2023-08-17 22:53         ` Nhat Pham
  1 sibling, 0 replies; 9+ messages in thread
From: Nhat Pham @ 2023-08-17 22:53 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Andrew Morton, Konrad Rzeszutek Wilk, Seth Jennings,
	Dan Streetman, Vitaly Wool, Johannes Weiner, Domenico Cerasuolo,
	Yu Zhao, linux-mm, linux-kernel

On Fri, Aug 11, 2023 at 4:21 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Fri, Aug 11, 2023 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > > >
> > > > I'm experimenting with some other zswap changes - if I have
> > > > extra cycles and resources I'll try to apply this patch and see how the
> > > > numbers play out.
> > >
> > > That would be amazing. Looking forward to any numbers you can dig :)
> >
> > So this patch seems stuck.  I can keep it in mm.git until the fog
> > clears, but would prefer not to.  Can we please revisit and decide on a
> > way forward?
>
> Johannes did not like a config option so I proposed it here as a
> constant (like SWAP_CLUSTER_MAX and others we have). This is a value
> that we have been using in our data centers for almost a decade, so it
> has seen a ton of testing. I was hoping Johannes would get time to
> take a look, or Nhat would get time to test it out, but neither of
> these things happen.
Apologies - finally have some time + freed experiment machine cycles
to put in your patch :P And gotta wait a couple of days to obtain sufficient
data.

Result is quite unexciting - no tremendous gains or significant regression
in a bunch of internal metrics I was observing.

Of course, it's just one particular workload that I tested on - there could
be regression/gains in other workloads (or other metrics). But we can
always revisit this when it happens :)

With all that said, the code itself looks solid. And while I'm still not in
love with the change, I don't have any further objections, as of now.
I'll let you (and Johannes) continue from here.

Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Nhat Pham <nphamcs@gmail.com>

>
> I obviously want it to be merged, but hopefully someone will chime in here :)

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-08-17 22:54 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-20 19:46 [PATCH v3] mm: zswap: multiple zpools support Yosry Ahmed
2023-07-09 23:12 ` Nhat Pham
2023-07-13 10:35   ` Yosry Ahmed
2023-08-11 21:19     ` Andrew Morton
2023-08-11 23:20       ` Yosry Ahmed
2023-08-15 22:22         ` Chris Li
2023-08-15 22:29           ` Yosry Ahmed
2023-08-15 22:59             ` Chris Li
2023-08-17 22:53         ` Nhat Pham

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).