* [PATCH 2/2] mm/zsmalloc: add statistics support
@ 2014-12-13 13:45 Ganesh Mahendran
  2014-12-16  2:45 ` Minchan Kim
  0 siblings, 1 reply; 10+ messages in thread
From: Ganesh Mahendran @ 2014-12-13 13:45 UTC (permalink / raw)
  To: minchan, ngupta, akpm; +Cc: linux-mm, linux-kernel, Ganesh Mahendran

As a RAM based memory allocator, keeping fragmentation at a low level is
our target. But for now we still need to add debug code to zsmalloc to
get quantitative data.

After the RFC patch [1], Minchan Kim gave some suggestions.
  [1] https://patchwork.kernel.org/patch/5469301/

This patch adds a new configuration option, CONFIG_ZSMALLOC_STAT, to enable
statistics collection for developers. Currently only the per-class object
information is collected. Users can get the information via debugfs. For example:

After copying the file jdk-8u25-linux-x64.tar.gz to a zram device with an ext4 filesystem:
 class  size obj_allocated   obj_used pages_used
     0    32             0          0          0
     1    48           256         12          3
     2    64            64         14          1
     3    80            51          7          1
     4    96           128          5          3
     5   112            73          5          2
     6   128            32          4          1
     7   144             0          0          0
     8   160             0          0          0
     9   176             0          0          0
    10   192             0          0          0
    11   208             0          0          0
    12   224             0          0          0
    13   240             0          0          0
    14   256            16          1          1
    15   272            15          9          1
    16   288             0          0          0
    17   304             0          0          0
    18   320             0          0          0
    19   336             0          0          0
    20   352             0          0          0
    21   368             0          0          0
    22   384             0          0          0
    23   400             0          0          0
    24   416             0          0          0
    25   432             0          0          0
    26   448             0          0          0
    27   464             0          0          0
    28   480             0          0          0
    29   496            33          1          4
    30   512             0          0          0
    31   528             0          0          0
    32   544             0          0          0
    33   560             0          0          0
    34   576             0          0          0
    35   592             0          0          0
    36   608             0          0          0
    37   624             0          0          0
    38   640             0          0          0
    40   672             0          0          0
    42   704             0          0          0
    43   720            17          1          3
    44   736             0          0          0
    46   768             0          0          0
    49   816             0          0          0
    51   848             0          0          0
    52   864            14          1          3
    54   896             0          0          0
    57   944            13          1          3
    58   960             0          0          0
    62  1024             4          1          1
    66  1088            15          2          4
    67  1104             0          0          0
    71  1168             0          0          0
    74  1216             0          0          0
    76  1248             0          0          0
    83  1360             3          1          1
    91  1488            11          1          4
    94  1536             0          0          0
   100  1632             5          1          2
   107  1744             0          0          0
   111  1808             9          1          4
   126  2048             4          4          2
   144  2336             7          3          4
   151  2448             0          0          0
   168  2720            15         15         10
   190  3072            28         27         21
   202  3264             0          0          0
   254  4096         36209      36209      36209

 Total               37022      36326      36288

We can see that the overall fragmentation is:
    (37022 - 36326) / 37022 = 1.87%

Also, from the statistics we can see why the fragmentation is so low:
most of the objects are in class 254 with size 4096 bytes. A zspage in
that class consists of a single page, and there is only one object per
page, so no fragmentation is produced.
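
For reference, the fragmentation figure is just the share of allocated but
unused objects. An illustrative helper (not part of this patch; the name
zs_frag_bp is made up) that computes it in basis points from the totals:

/* fragmentation in basis points: (allocated - used) / allocated */
static unsigned long zs_frag_bp(unsigned long obj_allocated,
				unsigned long obj_used)
{
	if (!obj_allocated)
		return 0;
	/* e.g. (37022 - 36326) * 10000 / 37022 = 187, i.e. 1.87% */
	return (obj_allocated - obj_used) * 10000 / obj_allocated;
}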

We can also collect other information and show it to the user in the future.

Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
---
 mm/Kconfig    |   10 ++++
 mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b..95c5728 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -601,6 +601,16 @@ config PGTABLE_MAPPING
 	  You can check speed with zsmalloc benchmark:
 	  https://github.com/spartacus06/zsmapbench
 
+config ZSMALLOC_STAT
+	bool "Export zsmalloc statistics"
+	depends on ZSMALLOC
+	select DEBUG_FS
+	help
+	  This option enables code in zsmalloc to collect various
+	  statistics about what's happening in zsmalloc and exports that
+	  information to userspace via debugfs.
+	  If unsure, say N.
+
 config GENERIC_EARLY_IOREMAP
 	bool
 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b724039..a8d0020 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -168,6 +168,8 @@ enum fullness_group {
 	ZS_FULL
 };
 
+static int zs_pool_num;
+
 /*
  * number of size_classes
  */
@@ -200,6 +202,11 @@ struct size_class {
 	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
 	int pages_per_zspage;
 
+#ifdef CONFIG_ZSMALLOC_STAT
+	unsigned long obj_allocated;
+	unsigned long obj_used;
+#endif
+
 	spinlock_t lock;
 
 	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -221,6 +228,10 @@ struct zs_pool {
 
 	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+	struct dentry *stat_dentry;
+#endif
 };
 
 /*
@@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 	return true;
 }
 
+
+#ifdef CONFIG_ZSMALLOC_STAT
+#include <linux/debugfs.h>
+
+static struct dentry *zs_stat_root;
+
+static int __init zs_stat_init(void)
+{
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+	if (!zs_stat_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+	debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_show(struct seq_file *s, void *v)
+{
+	int i;
+	struct zs_pool *pool = (struct zs_pool *)s->private;
+	struct size_class *class;
+	int objs_per_zspage;
+	unsigned long obj_allocated, obj_used, pages_used;
+	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+	seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
+				"obj_allocated", "obj_used", "pages_used");
+
+	for (i = 0; i < zs_size_classes; i++) {
+		class = pool->size_class[i];
+
+		if (class->index != i)
+			continue;
+
+		spin_lock(&class->lock);
+
+		obj_allocated = class->obj_allocated;
+		obj_used = class->obj_used;
+		objs_per_zspage = get_maxobj_per_zspage(class->size,
+				class->pages_per_zspage);
+		pages_used = obj_allocated / objs_per_zspage *
+				class->pages_per_zspage;
+
+		spin_unlock(&class->lock);
+
+		seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
+					obj_allocated, obj_used, pages_used);
+
+		total_objs += class->obj_allocated;
+		total_used_objs += class->obj_used;
+		total_pages += pages_used;
+	}
+
+	seq_puts(s, "\n");
+	seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
+			total_objs, total_used_objs, total_pages);
+
+	return 0;
+}
+
+static int zs_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, zs_stats_show, inode->i_private);
+}
+
+static const struct file_operations zs_stats_operations = {
+	.open           = zs_stats_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
+
+static int zs_pool_stat_create(struct zs_pool *pool, int index)
+{
+	char name[10];
+	int ret = 0;
+
+	if (!zs_stat_root) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	snprintf(name, sizeof(name), "pool-%d", index);
+	pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
+	if (!pool->stat_dentry) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
+			pool->stat_dentry, pool, &zs_stats_operations);
+
+out:
+	return ret;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+	debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static int __init zs_stat_init(void)
+{
+	return 0;
+}
+
+static void __exit zs_stat_exit(void) { }
+
+static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
+{
+	return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
+
+#endif
+
 unsigned long zs_get_total_pages(struct zs_pool *pool)
 {
 	return atomic_long_read(&pool->pages_allocated);
@@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 		atomic_long_add(class->pages_per_zspage,
 					&pool->pages_allocated);
 		spin_lock(&class->lock);
+#ifdef CONFIG_ZSMALLOC_STAT
+		class->obj_allocated += get_maxobj_per_zspage(class->size,
+				class->pages_per_zspage);
+#endif
 	}
 
 	obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	kunmap_atomic(vaddr);
 
 	first_page->inuse++;
+#ifdef CONFIG_ZSMALLOC_STAT
+	class->obj_used++;
+#endif
 	/* Now move the zspage to another fullness group, if required */
 	fix_fullness_group(pool, first_page);
 	spin_unlock(&class->lock);
@@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
 	first_page->freelist = (void *)obj;
 
 	first_page->inuse--;
+#ifdef CONFIG_ZSMALLOC_STAT
+	class->obj_used--;
+#endif
 	fullness = fix_fullness_group(pool, first_page);
 	spin_unlock(&class->lock);
 
 	if (fullness == ZS_EMPTY) {
 		atomic_long_sub(class->pages_per_zspage,
 				&pool->pages_allocated);
+#ifdef CONFIG_ZSMALLOC_STAT
+		class->obj_allocated -= get_maxobj_per_zspage(class->size,
+				class->pages_per_zspage);
+#endif
 		free_zspage(first_page);
 	}
 }
@@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
 	}
 
 	pool->flags = flags;
+	zs_pool_num++;
+
+	if (zs_pool_stat_create(pool, zs_pool_num))
+		pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
 
 	return pool;
 
@@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
 		kfree(class);
 	}
 
+	zs_pool_stat_destroy(pool);
+	zs_pool_num--;
+
 	kfree(pool->size_class);
 	kfree(pool);
 }
@@ -1260,6 +1418,10 @@ static int __init zs_init(void)
 #ifdef CONFIG_ZPOOL
 	zpool_register_driver(&zs_zpool_driver);
 #endif
+
+	if (zs_stat_init())
+		pr_warn("zs stat initialization failed\n");
+
 	return 0;
 }
 
@@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
 	zpool_unregister_driver(&zs_zpool_driver);
 #endif
 	zs_unregister_cpu_notifier();
+
+	zs_stat_exit();
 }
 
 module_init(zs_init);
-- 
1.7.9.5



* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-13 13:45 [PATCH 2/2] mm/zsmalloc: add statistics support Ganesh Mahendran
@ 2014-12-16  2:45 ` Minchan Kim
  2014-12-17  7:05   ` Ganesh Mahendran
  0 siblings, 1 reply; 10+ messages in thread
From: Minchan Kim @ 2014-12-16  2:45 UTC (permalink / raw)
  To: Ganesh Mahendran; +Cc: ngupta, akpm, linux-mm, linux-kernel

On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
> As a ram based memory allocator, keep the fragmentation in a low level

Just say, zsmalloc.

> is our target. But now we still need to add the debug code in zsmalloc
> to get the quantitative data.
> 
> After the RFC patch [1], Minchan Kim gave some suggestions.
>   [1] https://patchwork.kernel.org/patch/5469301/
> 
> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
> collection for developers. Currently only the objects information in each class
> are collected. User can get the information via debugfs. For example:
> 
> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
>  class  size obj_allocated   obj_used pages_used
>      0    32             0          0          0
>      1    48           256         12          3
>      2    64            64         14          1
>      3    80            51          7          1
>      4    96           128          5          3
>      5   112            73          5          2
>      6   128            32          4          1
>      7   144             0          0          0
>      8   160             0          0          0
>      9   176             0          0          0
>     10   192             0          0          0
>     11   208             0          0          0
>     12   224             0          0          0
>     13   240             0          0          0
>     14   256            16          1          1
>     15   272            15          9          1
>     16   288             0          0          0
>     17   304             0          0          0
>     18   320             0          0          0
>     19   336             0          0          0
>     20   352             0          0          0
>     21   368             0          0          0
>     22   384             0          0          0
>     23   400             0          0          0
>     24   416             0          0          0
>     25   432             0          0          0
>     26   448             0          0          0
>     27   464             0          0          0
>     28   480             0          0          0
>     29   496            33          1          4
>     30   512             0          0          0
>     31   528             0          0          0
>     32   544             0          0          0
>     33   560             0          0          0
>     34   576             0          0          0
>     35   592             0          0          0
>     36   608             0          0          0
>     37   624             0          0          0
>     38   640             0          0          0
>     40   672             0          0          0
>     42   704             0          0          0
>     43   720            17          1          3
>     44   736             0          0          0
>     46   768             0          0          0
>     49   816             0          0          0
>     51   848             0          0          0
>     52   864            14          1          3
>     54   896             0          0          0
>     57   944            13          1          3
>     58   960             0          0          0
>     62  1024             4          1          1
>     66  1088            15          2          4
>     67  1104             0          0          0
>     71  1168             0          0          0
>     74  1216             0          0          0
>     76  1248             0          0          0
>     83  1360             3          1          1
>     91  1488            11          1          4
>     94  1536             0          0          0
>    100  1632             5          1          2
>    107  1744             0          0          0
>    111  1808             9          1          4
>    126  2048             4          4          2
>    144  2336             7          3          4
>    151  2448             0          0          0
>    168  2720            15         15         10
>    190  3072            28         27         21
>    202  3264             0          0          0
>    254  4096         36209      36209      36209
> 
>  Total               37022      36326      36288
> 
> We can see the overall fragentation is:
>     (37022 - 36326) / 37022 = 1.87%
> 
> Also from the statistics we know why we got so low fragmentation:
> Most of the objects is in class 254 with size 4096 Bytes. The pages in
> zspage is 1. And there is only one object in a page. So, No fragmentation
> will be produced.
> 
> Also we can collect other information and show it to user in the future.

So, could you make zs
> 
> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
> Suggested-by: Minchan Kim <minchan@kernel.org>
> ---
>  mm/Kconfig    |   10 ++++
>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 174 insertions(+)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 1d1ae6b..95c5728 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
>  	  You can check speed with zsmalloc benchmark:
>  	  https://github.com/spartacus06/zsmapbench
>  
> +config ZSMALLOC_STAT
> +	bool "Export zsmalloc statistics"
> +	depends on ZSMALLOC
> +	select DEBUG_FS
> +	help
> +	  This option enables code in the zsmalloc to collect various
> +	  statistics about whats happening in zsmalloc and exports that
> +	  information to userspace via debugfs.
> +	  If unsure, say N.
> +
>  config GENERIC_EARLY_IOREMAP
>  	bool
>  
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index b724039..a8d0020 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -168,6 +168,8 @@ enum fullness_group {
>  	ZS_FULL
>  };
>  
> +static int zs_pool_num;

What's this? What protects against the race?
I mean, it needs a description.

> +
>  /*
>   * number of size_classes
>   */
> @@ -200,6 +202,11 @@ struct size_class {
>  	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
>  	int pages_per_zspage;
>  
> +#ifdef CONFIG_ZSMALLOC_STAT
> +	unsigned long obj_allocated;
> +	unsigned long obj_used;
> +#endif

I prefer creating a new struct.

struct zs_size_stat {
	unsigned long obj_allocated;
	unsigned long obj_used;
};
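
e.g. embedded in struct size_class (just a sketch; the field name "stats"
is only a suggestion, and it would be updated under class->lock):

#ifdef CONFIG_ZSMALLOC_STAT
	struct zs_size_stat stats;
#endif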

> +
>  	spinlock_t lock;
>  
>  	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
> @@ -221,6 +228,10 @@ struct zs_pool {
>  
>  	gfp_t flags;	/* allocation flags used when growing pool */
>  	atomic_long_t pages_allocated;
> +
> +#ifdef CONFIG_ZSMALLOC_STAT
> +	struct dentry *stat_dentry;
> +#endif
>  };
>  
>  /*
> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
>  	return true;
>  }
>  
> +
> +#ifdef CONFIG_ZSMALLOC_STAT
> +#include <linux/debugfs.h>

A question:
Why "#include" is here instead of top on the source file?

> +
> +static struct dentry *zs_stat_root;
> +
> +static int __init zs_stat_init(void)
> +{
> +	if (!debugfs_initialized())
> +		return -ENODEV;

Do we need the above check?

When I read the comment of debugfs_create_dir, it says
"If debugfs is not enabled in the kernel, the value -%ENODEV will be
returned."

> +
> +	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
> +	if (!zs_stat_root)
> +		return -ENOMEM;

Does a NULL return from debugfs_create_dir always mean -ENOMEM?

> +
> +	return 0;
> +}
> +
> +static void __exit zs_stat_exit(void)
> +{
> +	debugfs_remove_recursive(zs_stat_root);
> +}
> +
> +static int zs_stats_show(struct seq_file *s, void *v)
> +{
> +	int i;
> +	struct zs_pool *pool = (struct zs_pool *)s->private;
> +	struct size_class *class;
> +	int objs_per_zspage;
> +	unsigned long obj_allocated, obj_used, pages_used;
> +	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
> +
> +	seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
> +				"obj_allocated", "obj_used", "pages_used");
> +
> +	for (i = 0; i < zs_size_classes; i++) {
> +		class = pool->size_class[i];
> +
> +		if (class->index != i)
> +			continue;
> +
> +		spin_lock(&class->lock);
> +
> +		obj_allocated = class->obj_allocated;
> +		obj_used = class->obj_used;
> +		objs_per_zspage = get_maxobj_per_zspage(class->size,
> +				class->pages_per_zspage);
> +		pages_used = obj_allocated / objs_per_zspage *
> +				class->pages_per_zspage;

I think we don't need to protect class->pages_per_zspage with class->lock.

> +
> +		spin_unlock(&class->lock);
> +
> +		seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
> +					obj_allocated, obj_used, pages_used);
> +
> +		total_objs += class->obj_allocated;
> +		total_used_objs += class->obj_used;

You can't access the class fields without the class lock.
Please assign them to local variables under the lock and sum them outside the lock.
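
i.e. just reuse the snapshots already taken under the lock (sketch):

		total_objs += obj_allocated;
		total_used_objs += obj_used;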

> +		total_pages += pages_used;
> +	}
> +
> +	seq_puts(s, "\n");
> +	seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
> +			total_objs, total_used_objs, total_pages);
> +
> +	return 0;
> +}
> +
> +static int zs_stats_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, zs_stats_show, inode->i_private);
> +}
> +
> +static const struct file_operations zs_stats_operations = {
> +	.open           = zs_stats_open,
> +	.read           = seq_read,
> +	.llseek         = seq_lseek,
> +	.release        = single_release,
> +};
> +
> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
> +{
> +	char name[10];
> +	int ret = 0;
> +
> +	if (!zs_stat_root) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	snprintf(name, sizeof(name), "pool-%d", index);

Hmm, how does an admin know which zsmalloc instance is associated with
which block device?
Maybe we need to export the zspool index to the client and print it
when the pool is populated.

> +	pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
> +	if (!pool->stat_dentry) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
> +			pool->stat_dentry, pool, &zs_stats_operations);

No need to check the return value?

> +
> +out:
> +	return ret;
> +}
> +
> +static void zs_pool_stat_destroy(struct zs_pool *pool)
> +{
> +	debugfs_remove_recursive(pool->stat_dentry);
> +}
> +
> +#else /* CONFIG_ZSMALLOC_STAT */
> +
> +static int __init zs_stat_init(void)
> +{
> +	return 0;
> +}
> +
> +static void __exit zs_stat_exit(void) { }
> +
> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
> +{
> +	return 0;
> +}
> +
> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
> +
> +#endif
> +
>  unsigned long zs_get_total_pages(struct zs_pool *pool)
>  {
>  	return atomic_long_read(&pool->pages_allocated);
> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>  		atomic_long_add(class->pages_per_zspage,
>  					&pool->pages_allocated);
>  		spin_lock(&class->lock);
> +#ifdef CONFIG_ZSMALLOC_STAT
> +		class->obj_allocated += get_maxobj_per_zspage(class->size,
> +				class->pages_per_zspage);
> +#endif

I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());
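
Something like the below (just a sketch -- it turns the zs_size_stat fields
above into an array indexed by a stat type so the helpers stay generic; all
names are only suggestions, and the counters still sit in each size_class,
say class->stats, updated under class->lock):

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	NR_ZS_STAT_TYPE,
};

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}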
 
>  	}
>  
>  	obj = (unsigned long)first_page->freelist;
> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>  	kunmap_atomic(vaddr);
>  
>  	first_page->inuse++;
> +#ifdef CONFIG_ZSMALLOC_STAT
> +	class->obj_used++;
> +#endif

zs_stat_inc(class, OBJ_USED, 1)


>  	/* Now move the zspage to another fullness group, if required */
>  	fix_fullness_group(pool, first_page);
>  	spin_unlock(&class->lock);
> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>  	first_page->freelist = (void *)obj;
>  
>  	first_page->inuse--;
> +#ifdef CONFIG_ZSMALLOC_STAT
> +	class->obj_used--;
> +#endif

zs_stat_dec(class, OBJ_USED, 1)

>  	fullness = fix_fullness_group(pool, first_page);
>  	spin_unlock(&class->lock);
>  
>  	if (fullness == ZS_EMPTY) {
>  		atomic_long_sub(class->pages_per_zspage,
>  				&pool->pages_allocated);
> +#ifdef CONFIG_ZSMALLOC_STAT
> +		class->obj_allocated -= get_maxobj_per_zspage(class->size,
> +				class->pages_per_zspage);
> +#endif

zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());

>  		free_zspage(first_page);
>  	}
>  }
> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
>  	}
>  
>  	pool->flags = flags;
> +	zs_pool_num++;

Who protects against the race?
And maybe we should keep the index in the zspool and export it to the user
to let them know which zsmalloc instance is theirs.

> +
> +	if (zs_pool_stat_create(pool, zs_pool_num))
> +		pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
>  
>  	return pool;
>  
> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
>  		kfree(class);
>  	}
>  
> +	zs_pool_stat_destroy(pool);
> +	zs_pool_num--;
> +
>  	kfree(pool->size_class);
>  	kfree(pool);
>  }
> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
>  #ifdef CONFIG_ZPOOL
>  	zpool_register_driver(&zs_zpool_driver);
>  #endif
> +
> +	if (zs_stat_init())
> +		pr_warn("zs stat initialization failed\n");
> +
>  	return 0;
>  }
>  
> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
>  	zpool_unregister_driver(&zs_zpool_driver);
>  #endif
>  	zs_unregister_cpu_notifier();
> +
> +	zs_stat_exit();
>  }
>  
>  module_init(zs_init);
> -- 
> 1.7.9.5
> 


* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-16  2:45 ` Minchan Kim
@ 2014-12-17  7:05   ` Ganesh Mahendran
  2014-12-18 23:44     ` Minchan Kim
  0 siblings, 1 reply; 10+ messages in thread
From: Ganesh Mahendran @ 2014-12-17  7:05 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Nitin Gupta, Andrew Morton, Linux-MM, linux-kernel

Hello, Minchan

Thanks for your review.

2014-12-16 10:45 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
>> As a ram based memory allocator, keep the fragmentation in a low level
>
> Just say, zsmalloc.

Ok.

>
>> is our target. But now we still need to add the debug code in zsmalloc
>> to get the quantitative data.
>>
>> After the RFC patch [1], Minchan Kim gave some suggestions.
>>   [1] https://patchwork.kernel.org/patch/5469301/
>>
>> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
>> collection for developers. Currently only the objects information in each class
>> are collected. User can get the information via debugfs. For example:
>>
>> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
>>  class  size obj_allocated   obj_used pages_used
>>      0    32             0          0          0
>>      1    48           256         12          3
>>      2    64            64         14          1
>>      3    80            51          7          1
>>      4    96           128          5          3
>>      5   112            73          5          2
>>      6   128            32          4          1
>>      7   144             0          0          0
>>      8   160             0          0          0
>>      9   176             0          0          0
>>     10   192             0          0          0
>>     11   208             0          0          0
>>     12   224             0          0          0
>>     13   240             0          0          0
>>     14   256            16          1          1
>>     15   272            15          9          1
>>     16   288             0          0          0
>>     17   304             0          0          0
>>     18   320             0          0          0
>>     19   336             0          0          0
>>     20   352             0          0          0
>>     21   368             0          0          0
>>     22   384             0          0          0
>>     23   400             0          0          0
>>     24   416             0          0          0
>>     25   432             0          0          0
>>     26   448             0          0          0
>>     27   464             0          0          0
>>     28   480             0          0          0
>>     29   496            33          1          4
>>     30   512             0          0          0
>>     31   528             0          0          0
>>     32   544             0          0          0
>>     33   560             0          0          0
>>     34   576             0          0          0
>>     35   592             0          0          0
>>     36   608             0          0          0
>>     37   624             0          0          0
>>     38   640             0          0          0
>>     40   672             0          0          0
>>     42   704             0          0          0
>>     43   720            17          1          3
>>     44   736             0          0          0
>>     46   768             0          0          0
>>     49   816             0          0          0
>>     51   848             0          0          0
>>     52   864            14          1          3
>>     54   896             0          0          0
>>     57   944            13          1          3
>>     58   960             0          0          0
>>     62  1024             4          1          1
>>     66  1088            15          2          4
>>     67  1104             0          0          0
>>     71  1168             0          0          0
>>     74  1216             0          0          0
>>     76  1248             0          0          0
>>     83  1360             3          1          1
>>     91  1488            11          1          4
>>     94  1536             0          0          0
>>    100  1632             5          1          2
>>    107  1744             0          0          0
>>    111  1808             9          1          4
>>    126  2048             4          4          2
>>    144  2336             7          3          4
>>    151  2448             0          0          0
>>    168  2720            15         15         10
>>    190  3072            28         27         21
>>    202  3264             0          0          0
>>    254  4096         36209      36209      36209
>>
>>  Total               37022      36326      36288
>>
>> We can see the overall fragentation is:
>>     (37022 - 36326) / 37022 = 1.87%
>>
>> Also from the statistics we know why we got so low fragmentation:
>> Most of the objects is in class 254 with size 4096 Bytes. The pages in
>> zspage is 1. And there is only one object in a page. So, No fragmentation
>> will be produced.
>>
>> Also we can collect other information and show it to user in the future.
>
> So, could you make zs
Ok
>>
>> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
>> Suggested-by: Minchan Kim <minchan@kernel.org>
>> ---
>>  mm/Kconfig    |   10 ++++
>>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 174 insertions(+)
>>
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 1d1ae6b..95c5728 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
>>         You can check speed with zsmalloc benchmark:
>>         https://github.com/spartacus06/zsmapbench
>>
>> +config ZSMALLOC_STAT
>> +     bool "Export zsmalloc statistics"
>> +     depends on ZSMALLOC
>> +     select DEBUG_FS
>> +     help
>> +       This option enables code in the zsmalloc to collect various
>> +       statistics about whats happening in zsmalloc and exports that
>> +       information to userspace via debugfs.
>> +       If unsure, say N.
>> +
>>  config GENERIC_EARLY_IOREMAP
>>       bool
>>
>> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> index b724039..a8d0020 100644
>> --- a/mm/zsmalloc.c
>> +++ b/mm/zsmalloc.c
>> @@ -168,6 +168,8 @@ enum fullness_group {
>>       ZS_FULL
>>  };
>>
>> +static int zs_pool_num;
>
> What's this? What protects the race?
It is the pool index. Yes, there is a problem here.
I will change it to an atomic that is incremented every time a new zs pool
is created, and then name the /sys/kernel/debug/pool-x directory using this index.

static atomic_t zs_pool_index = ATOMIC_INIT(0);
...
pool->index = atomic_inc_return(&zs_pool_index);
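
and then use it when creating the debugfs directory (sketch, reusing the
zs_pool_stat_create() call that is already in the patch):

	if (zs_pool_stat_create(pool, pool->index))
		pr_warn("zs pool %d stat initialization failed\n", pool->index);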

> It means description.
>
>> +
>>  /*
>>   * number of size_classes
>>   */
>> @@ -200,6 +202,11 @@ struct size_class {
>>       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
>>       int pages_per_zspage;
>>
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +     unsigned long obj_allocated;
>> +     unsigned long obj_used;
>> +#endif
>
> I perfer creating new struct.
>
> struct zs_size_stat {
>         unsigend long obj_allocated;
>         unsignged long obj_used;
> };

Got it, I will redo this.

>
>> +
>>       spinlock_t lock;
>>
>>       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
>> @@ -221,6 +228,10 @@ struct zs_pool {
>>
>>       gfp_t flags;    /* allocation flags used when growing pool */
>>       atomic_long_t pages_allocated;
>> +
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +     struct dentry *stat_dentry;
>> +#endif
>>  };
>>
>>  /*
>> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
>>       return true;
>>  }
>>
>> +
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +#include <linux/debugfs.h>
>
> A question:
> Why "#include" is here instead of top on the source file?

Yes, the "#include ..." should be on the top of the source file.
I will modify it.

>
>> +
>> +static struct dentry *zs_stat_root;
>> +
>> +static int __init zs_stat_init(void)
>> +{
>> +     if (!debugfs_initialized())
>> +             return -ENODEV;
>
> Do we need above check?
Yes, I think we need this check.
If debugfs initialization failed, we should not go ahead here.

>
> When I read comment of debugfs_create_dir, it says
> "If debugfs is not enabled in the kernel, the value -%ENODEV will be
> returned."

This check is not for the situation where debugfs is not enabled,
but for the case where debugfs_init() failed; then
we should not use any debugfs API.

And I think "-%ENODEV will be returned" refers to the code below in "debugfs.h":

static inline struct dentry *debugfs_create_dir(const char *name,
						struct dentry *parent)
{
	return ERR_PTR(-ENODEV);
}

>
>> +
>> +     zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
>> +     if (!zs_stat_root)
>> +             return -ENOMEM;
>
> On null return of debugfs_create_dir, it means always ENOMEM?

Yes, you are right: -ENOMEM is not the only possible reason for the failure,
but debugfs_create_dir does not pass back the errno.
And for zsmalloc, we do have permission (-EPERM) to create the entry, and
we will not create a duplicate (-EEXIST) entry in debugfs.

So I think -ENOMEM is suitable.

>
>> +
>> +     return 0;
>> +}
>> +
>> +static void __exit zs_stat_exit(void)
>> +{
>> +     debugfs_remove_recursive(zs_stat_root);
>> +}
>> +
>> +static int zs_stats_show(struct seq_file *s, void *v)
>> +{
>> +     int i;
>> +     struct zs_pool *pool = (struct zs_pool *)s->private;
>> +     struct size_class *class;
>> +     int objs_per_zspage;
>> +     unsigned long obj_allocated, obj_used, pages_used;
>> +     unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
>> +
>> +     seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
>> +                             "obj_allocated", "obj_used", "pages_used");
>> +
>> +     for (i = 0; i < zs_size_classes; i++) {
>> +             class = pool->size_class[i];
>> +
>> +             if (class->index != i)
>> +                     continue;
>> +
>> +             spin_lock(&class->lock);
>> +
>> +             obj_allocated = class->obj_allocated;
>> +             obj_used = class->obj_used;
>> +             objs_per_zspage = get_maxobj_per_zspage(class->size,
>> +                             class->pages_per_zspage);
>> +             pages_used = obj_allocated / objs_per_zspage *
>> +                             class->pages_per_zspage;
>
> I think We don't need to protect class->pages_per_zspage with class->lock.

Yes, you are right.

>
>> +
>> +             spin_unlock(&class->lock);
>> +
>> +             seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
>> +                                     obj_allocated, obj_used, pages_used);
>> +
>> +             total_objs += class->obj_allocated;
>> +             total_used_objs += class->obj_used;
>
> You couldn't access class->fields without class lock.
> Please, assign them into local variable under the lock and sum them without the lock.

Got it. I will redo this.

>
>> +             total_pages += pages_used;
>> +     }
>> +
>> +     seq_puts(s, "\n");
>> +     seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
>> +                     total_objs, total_used_objs, total_pages);
>> +
>> +     return 0;
>> +}
>> +
>> +static int zs_stats_open(struct inode *inode, struct file *file)
>> +{
>> +     return single_open(file, zs_stats_show, inode->i_private);
>> +}
>> +
>> +static const struct file_operations zs_stats_operations = {
>> +     .open           = zs_stats_open,
>> +     .read           = seq_read,
>> +     .llseek         = seq_lseek,
>> +     .release        = single_release,
>> +};
>> +
>> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
>> +{
>> +     char name[10];
>> +     int ret = 0;
>> +
>> +     if (!zs_stat_root) {
>> +             ret = -ENODEV;
>> +             goto out;
>> +     }
>> +
>> +     snprintf(name, sizeof(name), "pool-%d", index);
>
> Hmm, how does admin know any zsmalloc instance is associated with
> any block device?
> Maybe we need export zspool index to the client and print it
> when pool is populated.

Thanks for your suggestion.

>
>> +     pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
>> +     if (!pool->stat_dentry) {
>> +             ret = -ENOMEM;
>> +             goto out;
>> +     }
>> +
>> +     debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
>> +                     pool->stat_dentry, pool, &zs_stats_operations);
>
> No need to check return?

It is better to check the return value and give the user some information
about the failure.

>
>> +
>> +out:
>> +     return ret;
>> +}
>> +
>> +static void zs_pool_stat_destroy(struct zs_pool *pool)
>> +{
>> +     debugfs_remove_recursive(pool->stat_dentry);
>> +}
>> +
>> +#else /* CONFIG_ZSMALLOC_STAT */
>> +
>> +static int __init zs_stat_init(void)
>> +{
>> +     return 0;
>> +}
>> +
>> +static void __exit zs_stat_exit(void) { }
>> +
>> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
>> +{
>> +     return 0;
>> +}
>> +
>> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
>> +
>> +#endif
>> +
>>  unsigned long zs_get_total_pages(struct zs_pool *pool)
>>  {
>>       return atomic_long_read(&pool->pages_allocated);
>> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>>               atomic_long_add(class->pages_per_zspage,
>>                                       &pool->pages_allocated);
>>               spin_lock(&class->lock);
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +             class->obj_allocated += get_maxobj_per_zspage(class->size,
>> +                             class->pages_per_zspage);
>> +#endif
>
> I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());

Got it. thanks

>
>>       }
>>
>>       obj = (unsigned long)first_page->freelist;
>> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>>       kunmap_atomic(vaddr);
>>
>>       first_page->inuse++;
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +     class->obj_used++;
>> +#endif
>
> zs_stat_inc(class, OBJ_USED, 1)

OK

>
>
>>       /* Now move the zspage to another fullness group, if required */
>>       fix_fullness_group(pool, first_page);
>>       spin_unlock(&class->lock);
>> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>>       first_page->freelist = (void *)obj;
>>
>>       first_page->inuse--;
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +     class->obj_used--;
>> +#endif
>
> zs_stat_dec(class, OBJ_USED, 1)

OK

>
>>       fullness = fix_fullness_group(pool, first_page);
>>       spin_unlock(&class->lock);
>>
>>       if (fullness == ZS_EMPTY) {
>>               atomic_long_sub(class->pages_per_zspage,
>>                               &pool->pages_allocated);
>> +#ifdef CONFIG_ZSMALLOC_STAT
>> +             class->obj_allocated -= get_maxobj_per_zspage(class->size,
>> +                             class->pages_per_zspage);
>> +#endif
>
> zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());
>
>>               free_zspage(first_page);
>>       }
>>  }
>> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
>>       }
>>
>>       pool->flags = flags;
>> +     zs_pool_num++;
>
> Who protect the race?
> And manybe we should keep the index in zspool and export it to the user
> to let them know what zsmalloc instance is thiers.

OK

>
>> +
>> +     if (zs_pool_stat_create(pool, zs_pool_num))
>> +             pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
>>
>>       return pool;
>>
>> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
>>               kfree(class);
>>       }
>>
>> +     zs_pool_stat_destroy(pool);
>> +     zs_pool_num--;
>> +
>>       kfree(pool->size_class);
>>       kfree(pool);
>>  }
>> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
>>  #ifdef CONFIG_ZPOOL
>>       zpool_register_driver(&zs_zpool_driver);
>>  #endif
>> +
>> +     if (zs_stat_init())
>> +             pr_warn("zs stat initialization failed\n");
>> +
>>       return 0;
>>  }
>>
>> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
>>       zpool_unregister_driver(&zs_zpool_driver);
>>  #endif
>>       zs_unregister_cpu_notifier();
>> +
>> +     zs_stat_exit();
>>  }
>>
>>  module_init(zs_init);
>> --
>> 1.7.9.5
>>


* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-17  7:05   ` Ganesh Mahendran
@ 2014-12-18 23:44     ` Minchan Kim
  2014-12-19  1:21       ` Ganesh Mahendran
  0 siblings, 1 reply; 10+ messages in thread
From: Minchan Kim @ 2014-12-18 23:44 UTC (permalink / raw)
  To: Ganesh Mahendran; +Cc: Nitin Gupta, Andrew Morton, Linux-MM, linux-kernel

Hello Ganesh,

On Wed, Dec 17, 2014 at 03:05:19PM +0800, Ganesh Mahendran wrote:
> Hello, Minchan
> 
> Thanks for your review.
> 
> 2014-12-16 10:45 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> > On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
> >> As a ram based memory allocator, keep the fragmentation in a low level
> >
> > Just say, zsmalloc.
> 
> Ok.
> 
> >
> >> is our target. But now we still need to add the debug code in zsmalloc
> >> to get the quantitative data.
> >>
> >> After the RFC patch [1], Minchan Kim gave some suggestions.
> >>   [1] https://patchwork.kernel.org/patch/5469301/
> >>
> >> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
> >> collection for developers. Currently only the objects information in each class
> >> are collected. User can get the information via debugfs. For example:
> >>
> >> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
> >>  class  size obj_allocated   obj_used pages_used
> >>      0    32             0          0          0
> >>      1    48           256         12          3
> >>      2    64            64         14          1
> >>      3    80            51          7          1
> >>      4    96           128          5          3
> >>      5   112            73          5          2
> >>      6   128            32          4          1
> >>      7   144             0          0          0
> >>      8   160             0          0          0
> >>      9   176             0          0          0
> >>     10   192             0          0          0
> >>     11   208             0          0          0
> >>     12   224             0          0          0
> >>     13   240             0          0          0
> >>     14   256            16          1          1
> >>     15   272            15          9          1
> >>     16   288             0          0          0
> >>     17   304             0          0          0
> >>     18   320             0          0          0
> >>     19   336             0          0          0
> >>     20   352             0          0          0
> >>     21   368             0          0          0
> >>     22   384             0          0          0
> >>     23   400             0          0          0
> >>     24   416             0          0          0
> >>     25   432             0          0          0
> >>     26   448             0          0          0
> >>     27   464             0          0          0
> >>     28   480             0          0          0
> >>     29   496            33          1          4
> >>     30   512             0          0          0
> >>     31   528             0          0          0
> >>     32   544             0          0          0
> >>     33   560             0          0          0
> >>     34   576             0          0          0
> >>     35   592             0          0          0
> >>     36   608             0          0          0
> >>     37   624             0          0          0
> >>     38   640             0          0          0
> >>     40   672             0          0          0
> >>     42   704             0          0          0
> >>     43   720            17          1          3
> >>     44   736             0          0          0
> >>     46   768             0          0          0
> >>     49   816             0          0          0
> >>     51   848             0          0          0
> >>     52   864            14          1          3
> >>     54   896             0          0          0
> >>     57   944            13          1          3
> >>     58   960             0          0          0
> >>     62  1024             4          1          1
> >>     66  1088            15          2          4
> >>     67  1104             0          0          0
> >>     71  1168             0          0          0
> >>     74  1216             0          0          0
> >>     76  1248             0          0          0
> >>     83  1360             3          1          1
> >>     91  1488            11          1          4
> >>     94  1536             0          0          0
> >>    100  1632             5          1          2
> >>    107  1744             0          0          0
> >>    111  1808             9          1          4
> >>    126  2048             4          4          2
> >>    144  2336             7          3          4
> >>    151  2448             0          0          0
> >>    168  2720            15         15         10
> >>    190  3072            28         27         21
> >>    202  3264             0          0          0
> >>    254  4096         36209      36209      36209
> >>
> >>  Total               37022      36326      36288
> >>
> >> We can see the overall fragentation is:
> >>     (37022 - 36326) / 37022 = 1.87%
> >>
> >> Also from the statistics we know why we got so low fragmentation:
> >> Most of the objects is in class 254 with size 4096 Bytes. The pages in
> >> zspage is 1. And there is only one object in a page. So, No fragmentation
> >> will be produced.
> >>
> >> Also we can collect other information and show it to user in the future.
> >
> > So, could you make zs
> Ok
> >>
> >> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
> >> Suggested-by: Minchan Kim <minchan@kernel.org>
> >> ---
> >>  mm/Kconfig    |   10 ++++
> >>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  2 files changed, 174 insertions(+)
> >>
> >> diff --git a/mm/Kconfig b/mm/Kconfig
> >> index 1d1ae6b..95c5728 100644
> >> --- a/mm/Kconfig
> >> +++ b/mm/Kconfig
> >> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
> >>         You can check speed with zsmalloc benchmark:
> >>         https://github.com/spartacus06/zsmapbench
> >>
> >> +config ZSMALLOC_STAT
> >> +     bool "Export zsmalloc statistics"
> >> +     depends on ZSMALLOC
> >> +     select DEBUG_FS
> >> +     help
> >> +       This option enables code in the zsmalloc to collect various
> >> +       statistics about whats happening in zsmalloc and exports that
> >> +       information to userspace via debugfs.
> >> +       If unsure, say N.
> >> +
> >>  config GENERIC_EARLY_IOREMAP
> >>       bool
> >>
> >> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >> index b724039..a8d0020 100644
> >> --- a/mm/zsmalloc.c
> >> +++ b/mm/zsmalloc.c
> >> @@ -168,6 +168,8 @@ enum fullness_group {
> >>       ZS_FULL
> >>  };
> >>
> >> +static int zs_pool_num;
> >
> > What's this? What protects the race?
> It is the pool index. Yes, there is problem here.
> I will change it to atomic and increased every time a new zs pool created.
> And then name the /sys/kernel/debug/pool-x using this index.
> 
> static atomic_t zs_pool_index = ATOMIC_INIT(0);
> ...
> pool->index = atomic_inc_return(&zs_pool_index);
> 
> > It means description.
> >
> >> +
> >>  /*
> >>   * number of size_classes
> >>   */
> >> @@ -200,6 +202,11 @@ struct size_class {
> >>       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
> >>       int pages_per_zspage;
> >>
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +     unsigned long obj_allocated;
> >> +     unsigned long obj_used;
> >> +#endif
> >
> > I perfer creating new struct.
> >
> > struct zs_size_stat {
> >         unsigend long obj_allocated;
> >         unsignged long obj_used;
> > };
> 
> Got it, I will redo this.
> 
> >
> >> +
> >>       spinlock_t lock;
> >>
> >>       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
> >> @@ -221,6 +228,10 @@ struct zs_pool {
> >>
> >>       gfp_t flags;    /* allocation flags used when growing pool */
> >>       atomic_long_t pages_allocated;
> >> +
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +     struct dentry *stat_dentry;
> >> +#endif
> >>  };
> >>
> >>  /*
> >> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
> >>       return true;
> >>  }
> >>
> >> +
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +#include <linux/debugfs.h>
> >
> > A question:
> > Why "#include" is here instead of top on the source file?
> 
> Yes, the "#include ..." should be on the top of the source file.
> I will modify it.
> 
> >
> >> +
> >> +static struct dentry *zs_stat_root;
> >> +
> >> +static int __init zs_stat_init(void)
> >> +{
> >> +     if (!debugfs_initialized())
> >> +             return -ENODEV;
> >
> > Do we need above check?
> Yes, I think we need this check.
> When debugfs module init failed, we should not go ahead here.
> 
> >
> > When I read comment of debugfs_create_dir, it says
> > "If debugfs is not enabled in the kernel, the value -%ENODEV will be
> > returned."
> 
> This check is not for the situation when the debugfs is not enabled.
> But for if we failed in debugfs_init(), then
> we should not use any API of debugfs.
> 
> And I think "-%ENODEV will be returned" means below code in "debugfs.h"
> 
> static inline struct dentry *debugfs_create_dir(const char *name,
> struct dentry *parent)
> {
>     return ERR_PTR(-ENODEV);
> }
> 
> >
> >> +
> >> +     zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
> >> +     if (!zs_stat_root)
> >> +             return -ENOMEM;
> >
> > On null return of debugfs_create_dir, it means always ENOMEM?
> 
> Yes, you are right.  -ENOMEM is not the only reason for the failure.
> But debugfs_create_dir does not bring back the errno.
> And for zsmalloc, we indeed have the permission(-EPERM) to create the entry and
> also we will not create duplicate(-EEXIST) entry in debufs.
> 
> So, I think -ENOMEM is suitable.

It seems you are saying that debugfs_create_dir can fail for those reasons but
zsmalloc will not fail because of them.
I don't know the internals of debugfs_create_dir, but the description just says
"If an error occurs, %NULL will be returned", and then everyone blindly returns
-ENOMEM. Hmm, I don't think that's good, but I'm okay with it too because it's
not our fault and others have done the same. :(

When I look at zs_init, you don't propagate the error to the user. Is that just missing?
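
e.g. something like (sketch only; the ret variable and the unwind label are
placeholders, and how far to unwind is up to you):

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto out_unregister;
	}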

> 
> >
> >> +
> >> +     return 0;
> >> +}
> >> +
> >> +static void __exit zs_stat_exit(void)
> >> +{
> >> +     debugfs_remove_recursive(zs_stat_root);
> >> +}
> >> +
> >> +static int zs_stats_show(struct seq_file *s, void *v)
> >> +{
> >> +     int i;
> >> +     struct zs_pool *pool = (struct zs_pool *)s->private;
> >> +     struct size_class *class;
> >> +     int objs_per_zspage;
> >> +     unsigned long obj_allocated, obj_used, pages_used;
> >> +     unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
> >> +
> >> +     seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
> >> +                             "obj_allocated", "obj_used", "pages_used");
> >> +
> >> +     for (i = 0; i < zs_size_classes; i++) {
> >> +             class = pool->size_class[i];
> >> +
> >> +             if (class->index != i)
> >> +                     continue;
> >> +
> >> +             spin_lock(&class->lock);
> >> +
> >> +             obj_allocated = class->obj_allocated;
> >> +             obj_used = class->obj_used;
> >> +             objs_per_zspage = get_maxobj_per_zspage(class->size,
> >> +                             class->pages_per_zspage);
> >> +             pages_used = obj_allocated / objs_per_zspage *
> >> +                             class->pages_per_zspage;
> >
> > I think We don't need to protect class->pages_per_zspage with class->lock.
> 
> Yes, you are right.
> 
> >
> >> +
> >> +             spin_unlock(&class->lock);
> >> +
> >> +             seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
> >> +                                     obj_allocated, obj_used, pages_used);
> >> +
> >> +             total_objs += class->obj_allocated;
> >> +             total_used_objs += class->obj_used;
> >
> > You couldn't access class->fields without class lock.
> > Please, assign them into local variable under the lock and sum them without the lock.
> 
> Got it. I will redo this.
> 
> >
> >> +             total_pages += pages_used;
> >> +     }
> >> +
> >> +     seq_puts(s, "\n");
> >> +     seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
> >> +                     total_objs, total_used_objs, total_pages);
> >> +
> >> +     return 0;
> >> +}
> >> +
> >> +static int zs_stats_open(struct inode *inode, struct file *file)
> >> +{
> >> +     return single_open(file, zs_stats_show, inode->i_private);
> >> +}
> >> +
> >> +static const struct file_operations zs_stats_operations = {
> >> +     .open           = zs_stats_open,
> >> +     .read           = seq_read,
> >> +     .llseek         = seq_lseek,
> >> +     .release        = single_release,
> >> +};
> >> +
> >> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
> >> +{
> >> +     char name[10];
> >> +     int ret = 0;
> >> +
> >> +     if (!zs_stat_root) {
> >> +             ret = -ENODEV;
> >> +             goto out;
> >> +     }
> >> +
> >> +     snprintf(name, sizeof(name), "pool-%d", index);
> >
> > Hmm, how does admin know any zsmalloc instance is associated with
> > any block device?
> > Maybe we need export zspool index to the client and print it
> > when pool is populated.
> 
> Thanks for your suggestion.
> 
> >
> >> +     pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
> >> +     if (!pool->stat_dentry) {
> >> +             ret = -ENOMEM;
> >> +             goto out;
> >> +     }
> >> +
> >> +     debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
> >> +                     pool->stat_dentry, pool, &zs_stats_operations);
> >
> > No need to check return?
> 
> It is better to check the return value and give user some information
> about the failure.
> 
> >
> >> +
> >> +out:
> >> +     return ret;
> >> +}
> >> +
> >> +static void zs_pool_stat_destroy(struct zs_pool *pool)
> >> +{
> >> +     debugfs_remove_recursive(pool->stat_dentry);
> >> +}
> >> +
> >> +#else /* CONFIG_ZSMALLOC_STAT */
> >> +
> >> +static int __init zs_stat_init(void)
> >> +{
> >> +     return 0;
> >> +}
> >> +
> >> +static void __exit zs_stat_exit(void) { }
> >> +
> >> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
> >> +{
> >> +     return 0;
> >> +}
> >> +
> >> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
> >> +
> >> +#endif
> >> +
> >>  unsigned long zs_get_total_pages(struct zs_pool *pool)
> >>  {
> >>       return atomic_long_read(&pool->pages_allocated);
> >> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >>               atomic_long_add(class->pages_per_zspage,
> >>                                       &pool->pages_allocated);
> >>               spin_lock(&class->lock);
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +             class->obj_allocated += get_maxobj_per_zspage(class->size,
> >> +                             class->pages_per_zspage);
> >> +#endif
> >
> > I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());
> 
> Got it. thanks
> 
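A sketch of what such helpers could look like, assuming the counters are
grouped into the zs_size_stat container suggested elsewhere in this review
and indexed by a small enum (the exact layout and names here are
assumptions, not the final code); the callers already hold class->lock, so
plain additions are enough:

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	NR_ZS_STAT_TYPE,
};

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

/* assumes a 'struct zs_size_stat stats;' member in struct size_class */
#ifdef CONFIG_ZSMALLOC_STAT
static inline void zs_stat_inc(struct size_class *class,
			enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
			enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}
#else
static inline void zs_stat_inc(struct size_class *class,
			enum zs_stat_type type, unsigned long cnt) { }
static inline void zs_stat_dec(struct size_class *class,
			enum zs_stat_type type, unsigned long cnt) { }
#endif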
> >
> >>       }
> >>
> >>       obj = (unsigned long)first_page->freelist;
> >> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >>       kunmap_atomic(vaddr);
> >>
> >>       first_page->inuse++;
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +     class->obj_used++;
> >> +#endif
> >
> > zs_stat_inc(class, OBJ_USED, 1)
> 
> OK
> 
> >
> >
> >>       /* Now move the zspage to another fullness group, if required */
> >>       fix_fullness_group(pool, first_page);
> >>       spin_unlock(&class->lock);
> >> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >>       first_page->freelist = (void *)obj;
> >>
> >>       first_page->inuse--;
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +     class->obj_used--;
> >> +#endif
> >
> > zs_stat_dec(class, OBJ_USED, 1)
> 
> OK
> 
> >
> >>       fullness = fix_fullness_group(pool, first_page);
> >>       spin_unlock(&class->lock);
> >>
> >>       if (fullness == ZS_EMPTY) {
> >>               atomic_long_sub(class->pages_per_zspage,
> >>                               &pool->pages_allocated);
> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> +             class->obj_allocated -= get_maxobj_per_zspage(class->size,
> >> +                             class->pages_per_zspage);
> >> +#endif
> >
> > zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());
> >
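With helpers like the ones sketched above, the #ifdef wrappers at these
call sites collapse into one-liners; the two zs_free() changes in this hunk
would become, mechanically:

	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);

and, where the now-empty zspage is torn down:

	zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(class->size,
			class->pages_per_zspage));
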
> >>               free_zspage(first_page);
> >>       }
> >>  }
> >> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
> >>       }
> >>
> >>       pool->flags = flags;
> >> +     zs_pool_num++;
> >
> > Who protect the race?
> > And manybe we should keep the index in zspool and export it to the user
> > to let them know what zsmalloc instance is thiers.
> 
> OK
> 
> >
> >> +
> >> +     if (zs_pool_stat_create(pool, zs_pool_num))
> >> +             pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
> >>
> >>       return pool;
> >>
> >> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
> >>               kfree(class);
> >>       }
> >>
> >> +     zs_pool_stat_destroy(pool);
> >> +     zs_pool_num--;
> >> +
> >>       kfree(pool->size_class);
> >>       kfree(pool);
> >>  }
> >> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
> >>  #ifdef CONFIG_ZPOOL
> >>       zpool_register_driver(&zs_zpool_driver);
> >>  #endif
> >> +
> >> +     if (zs_stat_init())
> >> +             pr_warn("zs stat initialization failed\n");
> >> +
> >>       return 0;
> >>  }
> >>
> >> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
> >>       zpool_unregister_driver(&zs_zpool_driver);
> >>  #endif
> >>       zs_unregister_cpu_notifier();
> >> +
> >> +     zs_stat_exit();
> >>  }
> >>
> >>  module_init(zs_init);
> >> --
> >> 1.7.9.5
> >>
> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-18 23:44     ` Minchan Kim
@ 2014-12-19  1:21       ` Ganesh Mahendran
  2014-12-19  1:30         ` Minchan Kim
  0 siblings, 1 reply; 10+ messages in thread
From: Ganesh Mahendran @ 2014-12-19  1:21 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Nitin Gupta, Andrew Morton, Linux-MM, linux-kernel

Hello, Minchan

2014-12-19 7:44 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> Hello Ganesh,
>
> On Wed, Dec 17, 2014 at 03:05:19PM +0800, Ganesh Mahendran wrote:
>> Hello, Minchan
>>
>> Thanks for your review.
>>
>> 2014-12-16 10:45 GMT+08:00 Minchan Kim <minchan@kernel.org>:
>> > On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
>> >> As a ram based memory allocator, keep the fragmentation in a low level
>> >
>> > Just say, zsmalloc.
>>
>> Ok.
>>
>> >
>> >> is our target. But now we still need to add the debug code in zsmalloc
>> >> to get the quantitative data.
>> >>
>> >> After the RFC patch [1], Minchan Kim gave some suggestions.
>> >>   [1] https://patchwork.kernel.org/patch/5469301/
>> >>
>> >> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
>> >> collection for developers. Currently only the objects information in each class
>> >> are collected. User can get the information via debugfs. For example:
>> >>
>> >> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
>> >>  class  size obj_allocated   obj_used pages_used
>> >>      0    32             0          0          0
>> >>      1    48           256         12          3
>> >>      2    64            64         14          1
>> >>      3    80            51          7          1
>> >>      4    96           128          5          3
>> >>      5   112            73          5          2
>> >>      6   128            32          4          1
>> >>      7   144             0          0          0
>> >>      8   160             0          0          0
>> >>      9   176             0          0          0
>> >>     10   192             0          0          0
>> >>     11   208             0          0          0
>> >>     12   224             0          0          0
>> >>     13   240             0          0          0
>> >>     14   256            16          1          1
>> >>     15   272            15          9          1
>> >>     16   288             0          0          0
>> >>     17   304             0          0          0
>> >>     18   320             0          0          0
>> >>     19   336             0          0          0
>> >>     20   352             0          0          0
>> >>     21   368             0          0          0
>> >>     22   384             0          0          0
>> >>     23   400             0          0          0
>> >>     24   416             0          0          0
>> >>     25   432             0          0          0
>> >>     26   448             0          0          0
>> >>     27   464             0          0          0
>> >>     28   480             0          0          0
>> >>     29   496            33          1          4
>> >>     30   512             0          0          0
>> >>     31   528             0          0          0
>> >>     32   544             0          0          0
>> >>     33   560             0          0          0
>> >>     34   576             0          0          0
>> >>     35   592             0          0          0
>> >>     36   608             0          0          0
>> >>     37   624             0          0          0
>> >>     38   640             0          0          0
>> >>     40   672             0          0          0
>> >>     42   704             0          0          0
>> >>     43   720            17          1          3
>> >>     44   736             0          0          0
>> >>     46   768             0          0          0
>> >>     49   816             0          0          0
>> >>     51   848             0          0          0
>> >>     52   864            14          1          3
>> >>     54   896             0          0          0
>> >>     57   944            13          1          3
>> >>     58   960             0          0          0
>> >>     62  1024             4          1          1
>> >>     66  1088            15          2          4
>> >>     67  1104             0          0          0
>> >>     71  1168             0          0          0
>> >>     74  1216             0          0          0
>> >>     76  1248             0          0          0
>> >>     83  1360             3          1          1
>> >>     91  1488            11          1          4
>> >>     94  1536             0          0          0
>> >>    100  1632             5          1          2
>> >>    107  1744             0          0          0
>> >>    111  1808             9          1          4
>> >>    126  2048             4          4          2
>> >>    144  2336             7          3          4
>> >>    151  2448             0          0          0
>> >>    168  2720            15         15         10
>> >>    190  3072            28         27         21
>> >>    202  3264             0          0          0
>> >>    254  4096         36209      36209      36209
>> >>
>> >>  Total               37022      36326      36288
>> >>
>> >> We can see the overall fragentation is:
>> >>     (37022 - 36326) / 37022 = 1.87%
>> >>
>> >> Also from the statistics we know why we got so low fragmentation:
>> >> Most of the objects is in class 254 with size 4096 Bytes. The pages in
>> >> zspage is 1. And there is only one object in a page. So, No fragmentation
>> >> will be produced.
>> >>
>> >> Also we can collect other information and show it to user in the future.
>> >
>> > So, could you make zs
>> Ok
>> >>
>> >> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
>> >> Suggested-by: Minchan Kim <minchan@kernel.org>
>> >> ---
>> >>  mm/Kconfig    |   10 ++++
>> >>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >>  2 files changed, 174 insertions(+)
>> >>
>> >> diff --git a/mm/Kconfig b/mm/Kconfig
>> >> index 1d1ae6b..95c5728 100644
>> >> --- a/mm/Kconfig
>> >> +++ b/mm/Kconfig
>> >> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
>> >>         You can check speed with zsmalloc benchmark:
>> >>         https://github.com/spartacus06/zsmapbench
>> >>
>> >> +config ZSMALLOC_STAT
>> >> +     bool "Export zsmalloc statistics"
>> >> +     depends on ZSMALLOC
>> >> +     select DEBUG_FS
>> >> +     help
>> >> +       This option enables code in the zsmalloc to collect various
>> >> +       statistics about whats happening in zsmalloc and exports that
>> >> +       information to userspace via debugfs.
>> >> +       If unsure, say N.
>> >> +
>> >>  config GENERIC_EARLY_IOREMAP
>> >>       bool
>> >>
>> >> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> >> index b724039..a8d0020 100644
>> >> --- a/mm/zsmalloc.c
>> >> +++ b/mm/zsmalloc.c
>> >> @@ -168,6 +168,8 @@ enum fullness_group {
>> >>       ZS_FULL
>> >>  };
>> >>
>> >> +static int zs_pool_num;
>> >
>> > What's this? What protects the race?
>> It is the pool index. Yes, there is problem here.
>> I will change it to atomic and increased every time a new zs pool created.
>> And then name the /sys/kernel/debug/pool-x using this index.
>>
>> static atomic_t zs_pool_index = ATOMIC_INIT(0);
>> ...
>> pool->index = atomic_inc_return(&zs_pool_index);
>>
>> > It means description.
>> >
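A short sketch of how that per-pool index could be wired in; the 'index'
member of struct zs_pool is an assumption of this sketch, and keeping the
id around is also what would let an admin match a zram device with its
pool-N directory, as discussed further down for zs_pool_stat_create():

static atomic_t zs_pool_index = ATOMIC_INIT(0);

	/* in zs_create_pool(), after pool->flags = flags; */
	pool->index = atomic_inc_return(&zs_pool_index);
	if (zs_pool_stat_create(pool, pool->index))
		pr_warn("zs pool %d stat initialization failed\n", pool->index);

	/*
	 * nothing to undo in zs_destroy_pool(): the id only names the debugfs
	 * directory, so it never needs the kind of decrement zs_pool_num gets
	 */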
>> >> +
>> >>  /*
>> >>   * number of size_classes
>> >>   */
>> >> @@ -200,6 +202,11 @@ struct size_class {
>> >>       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
>> >>       int pages_per_zspage;
>> >>
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +     unsigned long obj_allocated;
>> >> +     unsigned long obj_used;
>> >> +#endif
>> >
>> > I perfer creating new struct.
>> >
>> > struct zs_size_stat {
>> >         unsigend long obj_allocated;
>> >         unsignged long obj_used;
>> > };
>>
>> Got it, I will redo this.
>>
>> >
>> >> +
>> >>       spinlock_t lock;
>> >>
>> >>       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
>> >> @@ -221,6 +228,10 @@ struct zs_pool {
>> >>
>> >>       gfp_t flags;    /* allocation flags used when growing pool */
>> >>       atomic_long_t pages_allocated;
>> >> +
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +     struct dentry *stat_dentry;
>> >> +#endif
>> >>  };
>> >>
>> >>  /*
>> >> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
>> >>       return true;
>> >>  }
>> >>
>> >> +
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +#include <linux/debugfs.h>
>> >
>> > A question:
>> > Why "#include" is here instead of top on the source file?
>>
>> Yes, the "#include ..." should be on the top of the source file.
>> I will modify it.
>>
>> >
>> >> +
>> >> +static struct dentry *zs_stat_root;
>> >> +
>> >> +static int __init zs_stat_init(void)
>> >> +{
>> >> +     if (!debugfs_initialized())
>> >> +             return -ENODEV;
>> >
>> > Do we need above check?
>> Yes, I think we need this check.
>> When debugfs module init failed, we should not go ahead here.
>>
>> >
>> > When I read comment of debugfs_create_dir, it says
>> > "If debugfs is not enabled in the kernel, the value -%ENODEV will be
>> > returned."
>>
>> This check is not for the situation when the debugfs is not enabled.
>> But for if we failed in debugfs_init(), then
>> we should not use any API of debugfs.
>>
>> And I think "-%ENODEV will be returned" means below code in "debugfs.h"
>>
>> static inline struct dentry *debugfs_create_dir(const char *name,
>> struct dentry *parent)
>> {
>>     return ERR_PTR(-ENODEV);
>> }
>>
>> >
>> >> +
>> >> +     zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
>> >> +     if (!zs_stat_root)
>> >> +             return -ENOMEM;
>> >
>> > On null return of debugfs_create_dir, it means always ENOMEM?
>>
>> Yes, you are right.  -ENOMEM is not the only reason for the failure.
>> But debugfs_create_dir does not bring back the errno.
>> And for zsmalloc, we indeed have the permission(-EPERM) to create the entry and
>> also we will not create duplicate(-EEXIST) entry in debufs.
>>
>> So, I think -ENOMEM is suitable.
>
> It seems you are saying why debugfs_create_dir can fail but zsmalloc will not
> fail by such reasons.
> I don't know the internal of debugfs_create_dir but description just says
> "If an error occurs, %NULL will be returned" but everyone returns ENOMEM
> blindly then. Hmm, I don't think it's good but I'm okay too because it's not
> our fault and others have been in there. :(
>
> When I look at zs_init, you don't propagate the error to user. Just missing?

The statistics collection is not essential to the user, so I think we
do not need to break zsmalloc module loading when an error happens in
zs_stat_init()/debugfs_create_dir(). That is why I do not propagate the
error to the user and only print the message:
       pr_warn("zs stat initialization failed\n");

Thanks.

>
>>
>> >
>> >> +
>> >> +     return 0;
>> >> +}
>> >> +
>> >> +static void __exit zs_stat_exit(void)
>> >> +{
>> >> +     debugfs_remove_recursive(zs_stat_root);
>> >> +}
>> >> +
>> >> +static int zs_stats_show(struct seq_file *s, void *v)
>> >> +{
>> >> +     int i;
>> >> +     struct zs_pool *pool = (struct zs_pool *)s->private;
>> >> +     struct size_class *class;
>> >> +     int objs_per_zspage;
>> >> +     unsigned long obj_allocated, obj_used, pages_used;
>> >> +     unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
>> >> +
>> >> +     seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
>> >> +                             "obj_allocated", "obj_used", "pages_used");
>> >> +
>> >> +     for (i = 0; i < zs_size_classes; i++) {
>> >> +             class = pool->size_class[i];
>> >> +
>> >> +             if (class->index != i)
>> >> +                     continue;
>> >> +
>> >> +             spin_lock(&class->lock);
>> >> +
>> >> +             obj_allocated = class->obj_allocated;
>> >> +             obj_used = class->obj_used;
>> >> +             objs_per_zspage = get_maxobj_per_zspage(class->size,
>> >> +                             class->pages_per_zspage);
>> >> +             pages_used = obj_allocated / objs_per_zspage *
>> >> +                             class->pages_per_zspage;
>> >
>> > I think We don't need to protect class->pages_per_zspage with class->lock.
>>
>> Yes, you are right.
>>
>> >
>> >> +
>> >> +             spin_unlock(&class->lock);
>> >> +
>> >> +             seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
>> >> +                                     obj_allocated, obj_used, pages_used);
>> >> +
>> >> +             total_objs += class->obj_allocated;
>> >> +             total_used_objs += class->obj_used;
>> >
>> > You couldn't access class->fields without class lock.
>> > Please, assign them into local variable under the lock and sum them without the lock.
>>
>> Got it. I will redo this.
>>
>> >
>> >> +             total_pages += pages_used;
>> >> +     }
>> >> +
>> >> +     seq_puts(s, "\n");
>> >> +     seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
>> >> +                     total_objs, total_used_objs, total_pages);
>> >> +
>> >> +     return 0;
>> >> +}
>> >> +
>> >> +static int zs_stats_open(struct inode *inode, struct file *file)
>> >> +{
>> >> +     return single_open(file, zs_stats_show, inode->i_private);
>> >> +}
>> >> +
>> >> +static const struct file_operations zs_stats_operations = {
>> >> +     .open           = zs_stats_open,
>> >> +     .read           = seq_read,
>> >> +     .llseek         = seq_lseek,
>> >> +     .release        = single_release,
>> >> +};
>> >> +
>> >> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
>> >> +{
>> >> +     char name[10];
>> >> +     int ret = 0;
>> >> +
>> >> +     if (!zs_stat_root) {
>> >> +             ret = -ENODEV;
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     snprintf(name, sizeof(name), "pool-%d", index);
>> >
>> > Hmm, how does admin know any zsmalloc instance is associated with
>> > any block device?
>> > Maybe we need export zspool index to the client and print it
>> > when pool is populated.
>>
>> Thanks for your suggestion.
>>
>> >
>> >> +     pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
>> >> +     if (!pool->stat_dentry) {
>> >> +             ret = -ENOMEM;
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
>> >> +                     pool->stat_dentry, pool, &zs_stats_operations);
>> >
>> > No need to check return?
>>
>> It is better to check the return value and give user some information
>> about the failure.
>>
>> >
>> >> +
>> >> +out:
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +static void zs_pool_stat_destroy(struct zs_pool *pool)
>> >> +{
>> >> +     debugfs_remove_recursive(pool->stat_dentry);
>> >> +}
>> >> +
>> >> +#else /* CONFIG_ZSMALLOC_STAT */
>> >> +
>> >> +static int __init zs_stat_init(void)
>> >> +{
>> >> +     return 0;
>> >> +}
>> >> +
>> >> +static void __exit zs_stat_exit(void) { }
>> >> +
>> >> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
>> >> +{
>> >> +     return 0;
>> >> +}
>> >> +
>> >> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
>> >> +
>> >> +#endif
>> >> +
>> >>  unsigned long zs_get_total_pages(struct zs_pool *pool)
>> >>  {
>> >>       return atomic_long_read(&pool->pages_allocated);
>> >> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >>               atomic_long_add(class->pages_per_zspage,
>> >>                                       &pool->pages_allocated);
>> >>               spin_lock(&class->lock);
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +             class->obj_allocated += get_maxobj_per_zspage(class->size,
>> >> +                             class->pages_per_zspage);
>> >> +#endif
>> >
>> > I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());
>>
>> Got it. thanks
>>
>> >
>> >>       }
>> >>
>> >>       obj = (unsigned long)first_page->freelist;
>> >> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >>       kunmap_atomic(vaddr);
>> >>
>> >>       first_page->inuse++;
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +     class->obj_used++;
>> >> +#endif
>> >
>> > zs_stat_inc(class, OBJ_USED, 1)
>>
>> OK
>>
>> >
>> >
>> >>       /* Now move the zspage to another fullness group, if required */
>> >>       fix_fullness_group(pool, first_page);
>> >>       spin_unlock(&class->lock);
>> >> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> >>       first_page->freelist = (void *)obj;
>> >>
>> >>       first_page->inuse--;
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +     class->obj_used--;
>> >> +#endif
>> >
>> > zs_stat_dec(class, OBJ_USED, 1)
>>
>> OK
>>
>> >
>> >>       fullness = fix_fullness_group(pool, first_page);
>> >>       spin_unlock(&class->lock);
>> >>
>> >>       if (fullness == ZS_EMPTY) {
>> >>               atomic_long_sub(class->pages_per_zspage,
>> >>                               &pool->pages_allocated);
>> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> +             class->obj_allocated -= get_maxobj_per_zspage(class->size,
>> >> +                             class->pages_per_zspage);
>> >> +#endif
>> >
>> > zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());
>> >
>> >>               free_zspage(first_page);
>> >>       }
>> >>  }
>> >> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
>> >>       }
>> >>
>> >>       pool->flags = flags;
>> >> +     zs_pool_num++;
>> >
>> > Who protect the race?
>> > And manybe we should keep the index in zspool and export it to the user
>> > to let them know what zsmalloc instance is thiers.
>>
>> OK
>>
>> >
>> >> +
>> >> +     if (zs_pool_stat_create(pool, zs_pool_num))
>> >> +             pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
>> >>
>> >>       return pool;
>> >>
>> >> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
>> >>               kfree(class);
>> >>       }
>> >>
>> >> +     zs_pool_stat_destroy(pool);
>> >> +     zs_pool_num--;
>> >> +
>> >>       kfree(pool->size_class);
>> >>       kfree(pool);
>> >>  }
>> >> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
>> >>  #ifdef CONFIG_ZPOOL
>> >>       zpool_register_driver(&zs_zpool_driver);
>> >>  #endif
>> >> +
>> >> +     if (zs_stat_init())
>> >> +             pr_warn("zs stat initialization failed\n");
>> >> +
>> >>       return 0;
>> >>  }
>> >>
>> >> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
>> >>       zpool_unregister_driver(&zs_zpool_driver);
>> >>  #endif
>> >>       zs_unregister_cpu_notifier();
>> >> +
>> >> +     zs_stat_exit();
>> >>  }
>> >>
>> >>  module_init(zs_init);
>> >> --
>> >> 1.7.9.5
>> >>
>>
>
> --
> Kind regards,
> Minchan Kim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-19  1:21       ` Ganesh Mahendran
@ 2014-12-19  1:30         ` Minchan Kim
  2014-12-19  1:49           ` Ganesh Mahendran
  0 siblings, 1 reply; 10+ messages in thread
From: Minchan Kim @ 2014-12-19  1:30 UTC (permalink / raw)
  To: Ganesh Mahendran; +Cc: Nitin Gupta, Andrew Morton, Linux-MM, linux-kernel

On Fri, Dec 19, 2014 at 09:21:52AM +0800, Ganesh Mahendran wrote:
> Hello, Minchan
> 
> 2014-12-19 7:44 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> > Hello Ganesh,
> >
> > On Wed, Dec 17, 2014 at 03:05:19PM +0800, Ganesh Mahendran wrote:
> >> Hello, Minchan
> >>
> >> Thanks for your review.
> >>
> >> 2014-12-16 10:45 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> >> > On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
> >> >> As a ram based memory allocator, keep the fragmentation in a low level
> >> >
> >> > Just say, zsmalloc.
> >>
> >> Ok.
> >>
> >> >
> >> >> is our target. But now we still need to add the debug code in zsmalloc
> >> >> to get the quantitative data.
> >> >>
> >> >> After the RFC patch [1], Minchan Kim gave some suggestions.
> >> >>   [1] https://patchwork.kernel.org/patch/5469301/
> >> >>
> >> >> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
> >> >> collection for developers. Currently only the objects information in each class
> >> >> are collected. User can get the information via debugfs. For example:
> >> >>
> >> >> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
> >> >>  class  size obj_allocated   obj_used pages_used
> >> >>      0    32             0          0          0
> >> >>      1    48           256         12          3
> >> >>      2    64            64         14          1
> >> >>      3    80            51          7          1
> >> >>      4    96           128          5          3
> >> >>      5   112            73          5          2
> >> >>      6   128            32          4          1
> >> >>      7   144             0          0          0
> >> >>      8   160             0          0          0
> >> >>      9   176             0          0          0
> >> >>     10   192             0          0          0
> >> >>     11   208             0          0          0
> >> >>     12   224             0          0          0
> >> >>     13   240             0          0          0
> >> >>     14   256            16          1          1
> >> >>     15   272            15          9          1
> >> >>     16   288             0          0          0
> >> >>     17   304             0          0          0
> >> >>     18   320             0          0          0
> >> >>     19   336             0          0          0
> >> >>     20   352             0          0          0
> >> >>     21   368             0          0          0
> >> >>     22   384             0          0          0
> >> >>     23   400             0          0          0
> >> >>     24   416             0          0          0
> >> >>     25   432             0          0          0
> >> >>     26   448             0          0          0
> >> >>     27   464             0          0          0
> >> >>     28   480             0          0          0
> >> >>     29   496            33          1          4
> >> >>     30   512             0          0          0
> >> >>     31   528             0          0          0
> >> >>     32   544             0          0          0
> >> >>     33   560             0          0          0
> >> >>     34   576             0          0          0
> >> >>     35   592             0          0          0
> >> >>     36   608             0          0          0
> >> >>     37   624             0          0          0
> >> >>     38   640             0          0          0
> >> >>     40   672             0          0          0
> >> >>     42   704             0          0          0
> >> >>     43   720            17          1          3
> >> >>     44   736             0          0          0
> >> >>     46   768             0          0          0
> >> >>     49   816             0          0          0
> >> >>     51   848             0          0          0
> >> >>     52   864            14          1          3
> >> >>     54   896             0          0          0
> >> >>     57   944            13          1          3
> >> >>     58   960             0          0          0
> >> >>     62  1024             4          1          1
> >> >>     66  1088            15          2          4
> >> >>     67  1104             0          0          0
> >> >>     71  1168             0          0          0
> >> >>     74  1216             0          0          0
> >> >>     76  1248             0          0          0
> >> >>     83  1360             3          1          1
> >> >>     91  1488            11          1          4
> >> >>     94  1536             0          0          0
> >> >>    100  1632             5          1          2
> >> >>    107  1744             0          0          0
> >> >>    111  1808             9          1          4
> >> >>    126  2048             4          4          2
> >> >>    144  2336             7          3          4
> >> >>    151  2448             0          0          0
> >> >>    168  2720            15         15         10
> >> >>    190  3072            28         27         21
> >> >>    202  3264             0          0          0
> >> >>    254  4096         36209      36209      36209
> >> >>
> >> >>  Total               37022      36326      36288
> >> >>
> >> >> We can see the overall fragentation is:
> >> >>     (37022 - 36326) / 37022 = 1.87%
> >> >>
> >> >> Also from the statistics we know why we got so low fragmentation:
> >> >> Most of the objects is in class 254 with size 4096 Bytes. The pages in
> >> >> zspage is 1. And there is only one object in a page. So, No fragmentation
> >> >> will be produced.
> >> >>
> >> >> Also we can collect other information and show it to user in the future.
> >> >
> >> > So, could you make zs
> >> Ok
> >> >>
> >> >> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
> >> >> Suggested-by: Minchan Kim <minchan@kernel.org>
> >> >> ---
> >> >>  mm/Kconfig    |   10 ++++
> >> >>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >>  2 files changed, 174 insertions(+)
> >> >>
> >> >> diff --git a/mm/Kconfig b/mm/Kconfig
> >> >> index 1d1ae6b..95c5728 100644
> >> >> --- a/mm/Kconfig
> >> >> +++ b/mm/Kconfig
> >> >> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
> >> >>         You can check speed with zsmalloc benchmark:
> >> >>         https://github.com/spartacus06/zsmapbench
> >> >>
> >> >> +config ZSMALLOC_STAT
> >> >> +     bool "Export zsmalloc statistics"
> >> >> +     depends on ZSMALLOC
> >> >> +     select DEBUG_FS
> >> >> +     help
> >> >> +       This option enables code in the zsmalloc to collect various
> >> >> +       statistics about whats happening in zsmalloc and exports that
> >> >> +       information to userspace via debugfs.
> >> >> +       If unsure, say N.
> >> >> +
> >> >>  config GENERIC_EARLY_IOREMAP
> >> >>       bool
> >> >>
> >> >> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >> >> index b724039..a8d0020 100644
> >> >> --- a/mm/zsmalloc.c
> >> >> +++ b/mm/zsmalloc.c
> >> >> @@ -168,6 +168,8 @@ enum fullness_group {
> >> >>       ZS_FULL
> >> >>  };
> >> >>
> >> >> +static int zs_pool_num;
> >> >
> >> > What's this? What protects the race?
> >> It is the pool index. Yes, there is problem here.
> >> I will change it to atomic and increased every time a new zs pool created.
> >> And then name the /sys/kernel/debug/pool-x using this index.
> >>
> >> static atomic_t zs_pool_index = ATOMIC_INIT(0);
> >> ...
> >> pool->index = atomic_inc_return(&zs_pool_index);
> >>
> >> > It means description.
> >> >
> >> >> +
> >> >>  /*
> >> >>   * number of size_classes
> >> >>   */
> >> >> @@ -200,6 +202,11 @@ struct size_class {
> >> >>       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
> >> >>       int pages_per_zspage;
> >> >>
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +     unsigned long obj_allocated;
> >> >> +     unsigned long obj_used;
> >> >> +#endif
> >> >
> >> > I perfer creating new struct.
> >> >
> >> > struct zs_size_stat {
> >> >         unsigend long obj_allocated;
> >> >         unsignged long obj_used;
> >> > };
> >>
> >> Got it, I will redo this.
> >>
> >> >
> >> >> +
> >> >>       spinlock_t lock;
> >> >>
> >> >>       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
> >> >> @@ -221,6 +228,10 @@ struct zs_pool {
> >> >>
> >> >>       gfp_t flags;    /* allocation flags used when growing pool */
> >> >>       atomic_long_t pages_allocated;
> >> >> +
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +     struct dentry *stat_dentry;
> >> >> +#endif
> >> >>  };
> >> >>
> >> >>  /*
> >> >> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
> >> >>       return true;
> >> >>  }
> >> >>
> >> >> +
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +#include <linux/debugfs.h>
> >> >
> >> > A question:
> >> > Why "#include" is here instead of top on the source file?
> >>
> >> Yes, the "#include ..." should be on the top of the source file.
> >> I will modify it.
> >>
> >> >
> >> >> +
> >> >> +static struct dentry *zs_stat_root;
> >> >> +
> >> >> +static int __init zs_stat_init(void)
> >> >> +{
> >> >> +     if (!debugfs_initialized())
> >> >> +             return -ENODEV;
> >> >
> >> > Do we need above check?
> >> Yes, I think we need this check.
> >> When debugfs module init failed, we should not go ahead here.
> >>
> >> >
> >> > When I read comment of debugfs_create_dir, it says
> >> > "If debugfs is not enabled in the kernel, the value -%ENODEV will be
> >> > returned."
> >>
> >> This check is not for the situation when the debugfs is not enabled.
> >> But for if we failed in debugfs_init(), then
> >> we should not use any API of debugfs.
> >>
> >> And I think "-%ENODEV will be returned" means below code in "debugfs.h"
> >>
> >> static inline struct dentry *debugfs_create_dir(const char *name,
> >> struct dentry *parent)
> >> {
> >>     return ERR_PTR(-ENODEV);
> >> }
> >>
> >> >
> >> >> +
> >> >> +     zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
> >> >> +     if (!zs_stat_root)
> >> >> +             return -ENOMEM;
> >> >
> >> > On null return of debugfs_create_dir, it means always ENOMEM?
> >>
> >> Yes, you are right.  -ENOMEM is not the only reason for the failure.
> >> But debugfs_create_dir does not bring back the errno.
> >> And for zsmalloc, we indeed have the permission(-EPERM) to create the entry and
> >> also we will not create duplicate(-EEXIST) entry in debufs.
> >>
> >> So, I think -ENOMEM is suitable.
> >
> > It seems you are saying why debugfs_create_dir can fail but zsmalloc will not
> > fail by such reasons.
> > I don't know the internal of debugfs_create_dir but description just says
> > "If an error occurs, %NULL will be returned" but everyone returns ENOMEM
> > blindly then. Hmm, I don't think it's good but I'm okay too because it's not
> > our fault and others have been in there. :(
> >
> > When I look at zs_init, you don't propagate the error to user. Just missing?
> 
> The statistics collection is not essential to the user, so I think we
> do not need to break zsmalloc module loading when an error happens in
> zs_stat_init()/debugfs_create_dir(). That is why I do not propagate the
> error to the user and only print the message:
>        pr_warn("zs stat initialization failed\n");

Hmm? So you want it to work without stats even though the user asked for them?
Does that mean every zs_stat_[inc|dec] call should check whether the stats
were initialized properly?
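
If the failure were propagated instead of only warned about, zs_init()
would also have to unwind what it had already set up; a rough sketch,
reusing the teardown calls visible in the zs_exit() hunk further down:

	int rc = zs_stat_init();

	if (rc) {
		pr_err("zs stat initialization failed\n");
#ifdef CONFIG_ZPOOL
		zpool_unregister_driver(&zs_zpool_driver);
#endif
		zs_unregister_cpu_notifier();
		return rc;
	}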

> 
> Thanks.
> 
> >
> >>
> >> >
> >> >> +
> >> >> +     return 0;
> >> >> +}
> >> >> +
> >> >> +static void __exit zs_stat_exit(void)
> >> >> +{
> >> >> +     debugfs_remove_recursive(zs_stat_root);
> >> >> +}
> >> >> +
> >> >> +static int zs_stats_show(struct seq_file *s, void *v)
> >> >> +{
> >> >> +     int i;
> >> >> +     struct zs_pool *pool = (struct zs_pool *)s->private;
> >> >> +     struct size_class *class;
> >> >> +     int objs_per_zspage;
> >> >> +     unsigned long obj_allocated, obj_used, pages_used;
> >> >> +     unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
> >> >> +
> >> >> +     seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
> >> >> +                             "obj_allocated", "obj_used", "pages_used");
> >> >> +
> >> >> +     for (i = 0; i < zs_size_classes; i++) {
> >> >> +             class = pool->size_class[i];
> >> >> +
> >> >> +             if (class->index != i)
> >> >> +                     continue;
> >> >> +
> >> >> +             spin_lock(&class->lock);
> >> >> +
> >> >> +             obj_allocated = class->obj_allocated;
> >> >> +             obj_used = class->obj_used;
> >> >> +             objs_per_zspage = get_maxobj_per_zspage(class->size,
> >> >> +                             class->pages_per_zspage);
> >> >> +             pages_used = obj_allocated / objs_per_zspage *
> >> >> +                             class->pages_per_zspage;
> >> >
> >> > I think We don't need to protect class->pages_per_zspage with class->lock.
> >>
> >> Yes, you are right.
> >>
> >> >
> >> >> +
> >> >> +             spin_unlock(&class->lock);
> >> >> +
> >> >> +             seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
> >> >> +                                     obj_allocated, obj_used, pages_used);
> >> >> +
> >> >> +             total_objs += class->obj_allocated;
> >> >> +             total_used_objs += class->obj_used;
> >> >
> >> > You couldn't access class->fields without class lock.
> >> > Please, assign them into local variable under the lock and sum them without the lock.
> >>
> >> Got it. I will redo this.
> >>
> >> >
> >> >> +             total_pages += pages_used;
> >> >> +     }
> >> >> +
> >> >> +     seq_puts(s, "\n");
> >> >> +     seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
> >> >> +                     total_objs, total_used_objs, total_pages);
> >> >> +
> >> >> +     return 0;
> >> >> +}
> >> >> +
> >> >> +static int zs_stats_open(struct inode *inode, struct file *file)
> >> >> +{
> >> >> +     return single_open(file, zs_stats_show, inode->i_private);
> >> >> +}
> >> >> +
> >> >> +static const struct file_operations zs_stats_operations = {
> >> >> +     .open           = zs_stats_open,
> >> >> +     .read           = seq_read,
> >> >> +     .llseek         = seq_lseek,
> >> >> +     .release        = single_release,
> >> >> +};
> >> >> +
> >> >> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
> >> >> +{
> >> >> +     char name[10];
> >> >> +     int ret = 0;
> >> >> +
> >> >> +     if (!zs_stat_root) {
> >> >> +             ret = -ENODEV;
> >> >> +             goto out;
> >> >> +     }
> >> >> +
> >> >> +     snprintf(name, sizeof(name), "pool-%d", index);
> >> >
> >> > Hmm, how does admin know any zsmalloc instance is associated with
> >> > any block device?
> >> > Maybe we need export zspool index to the client and print it
> >> > when pool is populated.
> >>
> >> Thanks for your suggestion.
> >>
> >> >
> >> >> +     pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
> >> >> +     if (!pool->stat_dentry) {
> >> >> +             ret = -ENOMEM;
> >> >> +             goto out;
> >> >> +     }
> >> >> +
> >> >> +     debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
> >> >> +                     pool->stat_dentry, pool, &zs_stats_operations);
> >> >
> >> > No need to check return?
> >>
> >> It is better to check the return value and give user some information
> >> about the failure.
> >>
> >> >
> >> >> +
> >> >> +out:
> >> >> +     return ret;
> >> >> +}
> >> >> +
> >> >> +static void zs_pool_stat_destroy(struct zs_pool *pool)
> >> >> +{
> >> >> +     debugfs_remove_recursive(pool->stat_dentry);
> >> >> +}
> >> >> +
> >> >> +#else /* CONFIG_ZSMALLOC_STAT */
> >> >> +
> >> >> +static int __init zs_stat_init(void)
> >> >> +{
> >> >> +     return 0;
> >> >> +}
> >> >> +
> >> >> +static void __exit zs_stat_exit(void) { }
> >> >> +
> >> >> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
> >> >> +{
> >> >> +     return 0;
> >> >> +}
> >> >> +
> >> >> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
> >> >> +
> >> >> +#endif
> >> >> +
> >> >>  unsigned long zs_get_total_pages(struct zs_pool *pool)
> >> >>  {
> >> >>       return atomic_long_read(&pool->pages_allocated);
> >> >> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >>               atomic_long_add(class->pages_per_zspage,
> >> >>                                       &pool->pages_allocated);
> >> >>               spin_lock(&class->lock);
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +             class->obj_allocated += get_maxobj_per_zspage(class->size,
> >> >> +                             class->pages_per_zspage);
> >> >> +#endif
> >> >
> >> > I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());
> >>
> >> Got it. thanks
> >>
> >> >
> >> >>       }
> >> >>
> >> >>       obj = (unsigned long)first_page->freelist;
> >> >> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >>       kunmap_atomic(vaddr);
> >> >>
> >> >>       first_page->inuse++;
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +     class->obj_used++;
> >> >> +#endif
> >> >
> >> > zs_stat_inc(class, OBJ_USED, 1)
> >>
> >> OK
> >>
> >> >
> >> >
> >> >>       /* Now move the zspage to another fullness group, if required */
> >> >>       fix_fullness_group(pool, first_page);
> >> >>       spin_unlock(&class->lock);
> >> >> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> >>       first_page->freelist = (void *)obj;
> >> >>
> >> >>       first_page->inuse--;
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +     class->obj_used--;
> >> >> +#endif
> >> >
> >> > zs_stat_dec(class, OBJ_USED, 1)
> >>
> >> OK
> >>
> >> >
> >> >>       fullness = fix_fullness_group(pool, first_page);
> >> >>       spin_unlock(&class->lock);
> >> >>
> >> >>       if (fullness == ZS_EMPTY) {
> >> >>               atomic_long_sub(class->pages_per_zspage,
> >> >>                               &pool->pages_allocated);
> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> +             class->obj_allocated -= get_maxobj_per_zspage(class->size,
> >> >> +                             class->pages_per_zspage);
> >> >> +#endif
> >> >
> >> > zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());
> >> >
> >> >>               free_zspage(first_page);
> >> >>       }
> >> >>  }
> >> >> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
> >> >>       }
> >> >>
> >> >>       pool->flags = flags;
> >> >> +     zs_pool_num++;
> >> >
> >> > Who protect the race?
> >> > And manybe we should keep the index in zspool and export it to the user
> >> > to let them know what zsmalloc instance is thiers.
> >>
> >> OK
> >>
> >> >
> >> >> +
> >> >> +     if (zs_pool_stat_create(pool, zs_pool_num))
> >> >> +             pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
> >> >>
> >> >>       return pool;
> >> >>
> >> >> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
> >> >>               kfree(class);
> >> >>       }
> >> >>
> >> >> +     zs_pool_stat_destroy(pool);
> >> >> +     zs_pool_num--;
> >> >> +
> >> >>       kfree(pool->size_class);
> >> >>       kfree(pool);
> >> >>  }
> >> >> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
> >> >>  #ifdef CONFIG_ZPOOL
> >> >>       zpool_register_driver(&zs_zpool_driver);
> >> >>  #endif
> >> >> +
> >> >> +     if (zs_stat_init())
> >> >> +             pr_warn("zs stat initialization failed\n");
> >> >> +
> >> >>       return 0;
> >> >>  }
> >> >>
> >> >> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
> >> >>       zpool_unregister_driver(&zs_zpool_driver);
> >> >>  #endif
> >> >>       zs_unregister_cpu_notifier();
> >> >> +
> >> >> +     zs_stat_exit();
> >> >>  }
> >> >>
> >> >>  module_init(zs_init);
> >> >> --
> >> >> 1.7.9.5
> >> >>
> >>
> >
> > --
> > Kind regards,
> > Minchan Kim
> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-19  1:30         ` Minchan Kim
@ 2014-12-19  1:49           ` Ganesh Mahendran
  2014-12-19  1:59             ` Minchan Kim
  0 siblings, 1 reply; 10+ messages in thread
From: Ganesh Mahendran @ 2014-12-19  1:49 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Nitin Gupta, Andrew Morton, Linux-MM, linux-kernel

2014-12-19 9:30 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> On Fri, Dec 19, 2014 at 09:21:52AM +0800, Ganesh Mahendran wrote:
>> Hello, Minchan
>>
>> 2014-12-19 7:44 GMT+08:00 Minchan Kim <minchan@kernel.org>:
>> > Hello Ganesh,
>> >
>> > On Wed, Dec 17, 2014 at 03:05:19PM +0800, Ganesh Mahendran wrote:
>> >> Hello, Minchan
>> >>
>> >> Thanks for your review.
>> >>
>> >> 2014-12-16 10:45 GMT+08:00 Minchan Kim <minchan@kernel.org>:
>> >> > On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
>> >> >> As a ram based memory allocator, keep the fragmentation in a low level
>> >> >
>> >> > Just say, zsmalloc.
>> >>
>> >> Ok.
>> >>
>> >> >
>> >> >> is our target. But now we still need to add the debug code in zsmalloc
>> >> >> to get the quantitative data.
>> >> >>
>> >> >> After the RFC patch [1], Minchan Kim gave some suggestions.
>> >> >>   [1] https://patchwork.kernel.org/patch/5469301/
>> >> >>
>> >> >> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
>> >> >> collection for developers. Currently only the objects information in each class
>> >> >> are collected. User can get the information via debugfs. For example:
>> >> >>
>> >> >> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
>> >> >>  class  size obj_allocated   obj_used pages_used
>> >> >>      0    32             0          0          0
>> >> >>      1    48           256         12          3
>> >> >>      2    64            64         14          1
>> >> >>      3    80            51          7          1
>> >> >>      4    96           128          5          3
>> >> >>      5   112            73          5          2
>> >> >>      6   128            32          4          1
>> >> >>      7   144             0          0          0
>> >> >>      8   160             0          0          0
>> >> >>      9   176             0          0          0
>> >> >>     10   192             0          0          0
>> >> >>     11   208             0          0          0
>> >> >>     12   224             0          0          0
>> >> >>     13   240             0          0          0
>> >> >>     14   256            16          1          1
>> >> >>     15   272            15          9          1
>> >> >>     16   288             0          0          0
>> >> >>     17   304             0          0          0
>> >> >>     18   320             0          0          0
>> >> >>     19   336             0          0          0
>> >> >>     20   352             0          0          0
>> >> >>     21   368             0          0          0
>> >> >>     22   384             0          0          0
>> >> >>     23   400             0          0          0
>> >> >>     24   416             0          0          0
>> >> >>     25   432             0          0          0
>> >> >>     26   448             0          0          0
>> >> >>     27   464             0          0          0
>> >> >>     28   480             0          0          0
>> >> >>     29   496            33          1          4
>> >> >>     30   512             0          0          0
>> >> >>     31   528             0          0          0
>> >> >>     32   544             0          0          0
>> >> >>     33   560             0          0          0
>> >> >>     34   576             0          0          0
>> >> >>     35   592             0          0          0
>> >> >>     36   608             0          0          0
>> >> >>     37   624             0          0          0
>> >> >>     38   640             0          0          0
>> >> >>     40   672             0          0          0
>> >> >>     42   704             0          0          0
>> >> >>     43   720            17          1          3
>> >> >>     44   736             0          0          0
>> >> >>     46   768             0          0          0
>> >> >>     49   816             0          0          0
>> >> >>     51   848             0          0          0
>> >> >>     52   864            14          1          3
>> >> >>     54   896             0          0          0
>> >> >>     57   944            13          1          3
>> >> >>     58   960             0          0          0
>> >> >>     62  1024             4          1          1
>> >> >>     66  1088            15          2          4
>> >> >>     67  1104             0          0          0
>> >> >>     71  1168             0          0          0
>> >> >>     74  1216             0          0          0
>> >> >>     76  1248             0          0          0
>> >> >>     83  1360             3          1          1
>> >> >>     91  1488            11          1          4
>> >> >>     94  1536             0          0          0
>> >> >>    100  1632             5          1          2
>> >> >>    107  1744             0          0          0
>> >> >>    111  1808             9          1          4
>> >> >>    126  2048             4          4          2
>> >> >>    144  2336             7          3          4
>> >> >>    151  2448             0          0          0
>> >> >>    168  2720            15         15         10
>> >> >>    190  3072            28         27         21
>> >> >>    202  3264             0          0          0
>> >> >>    254  4096         36209      36209      36209
>> >> >>
>> >> >>  Total               37022      36326      36288
>> >> >>
>> >> >> We can see the overall fragentation is:
>> >> >>     (37022 - 36326) / 37022 = 1.87%
>> >> >>
>> >> >> Also from the statistics we know why we got so low fragmentation:
>> >> >> Most of the objects is in class 254 with size 4096 Bytes. The pages in
>> >> >> zspage is 1. And there is only one object in a page. So, No fragmentation
>> >> >> will be produced.
>> >> >>
>> >> >> Also we can collect other information and show it to user in the future.
>> >> >
>> >> > So, could you make zs
>> >> Ok
>> >> >>
>> >> >> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
>> >> >> Suggested-by: Minchan Kim <minchan@kernel.org>
>> >> >> ---
>> >> >>  mm/Kconfig    |   10 ++++
>> >> >>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >> >>  2 files changed, 174 insertions(+)
>> >> >>
>> >> >> diff --git a/mm/Kconfig b/mm/Kconfig
>> >> >> index 1d1ae6b..95c5728 100644
>> >> >> --- a/mm/Kconfig
>> >> >> +++ b/mm/Kconfig
>> >> >> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
>> >> >>         You can check speed with zsmalloc benchmark:
>> >> >>         https://github.com/spartacus06/zsmapbench
>> >> >>
>> >> >> +config ZSMALLOC_STAT
>> >> >> +     bool "Export zsmalloc statistics"
>> >> >> +     depends on ZSMALLOC
>> >> >> +     select DEBUG_FS
>> >> >> +     help
>> >> >> +       This option enables code in the zsmalloc to collect various
>> >> >> +       statistics about whats happening in zsmalloc and exports that
>> >> >> +       information to userspace via debugfs.
>> >> >> +       If unsure, say N.
>> >> >> +
>> >> >>  config GENERIC_EARLY_IOREMAP
>> >> >>       bool
>> >> >>
>> >> >> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> >> >> index b724039..a8d0020 100644
>> >> >> --- a/mm/zsmalloc.c
>> >> >> +++ b/mm/zsmalloc.c
>> >> >> @@ -168,6 +168,8 @@ enum fullness_group {
>> >> >>       ZS_FULL
>> >> >>  };
>> >> >>
>> >> >> +static int zs_pool_num;
>> >> >
>> >> > What's this? What protects the race?
>> >> It is the pool index. Yes, there is problem here.
>> >> I will change it to atomic and increased every time a new zs pool created.
>> >> And then name the /sys/kernel/debug/pool-x using this index.
>> >>
>> >> static atomic_t zs_pool_index = ATOMIC_INIT(0);
>> >> ...
>> >> pool->index = atomic_inc_return(&zs_pool_index);
>> >>
>> >> > It means description.
>> >> >
>> >> >> +
>> >> >>  /*
>> >> >>   * number of size_classes
>> >> >>   */
>> >> >> @@ -200,6 +202,11 @@ struct size_class {
>> >> >>       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
>> >> >>       int pages_per_zspage;
>> >> >>
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +     unsigned long obj_allocated;
>> >> >> +     unsigned long obj_used;
>> >> >> +#endif
>> >> >
>> >> > I perfer creating new struct.
>> >> >
>> >> > struct zs_size_stat {
>> >> >         unsigend long obj_allocated;
>> >> >         unsignged long obj_used;
>> >> > };
>> >>
>> >> Got it, I will redo this.
>> >>
>> >> >
>> >> >> +
>> >> >>       spinlock_t lock;
>> >> >>
>> >> >>       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
>> >> >> @@ -221,6 +228,10 @@ struct zs_pool {
>> >> >>
>> >> >>       gfp_t flags;    /* allocation flags used when growing pool */
>> >> >>       atomic_long_t pages_allocated;
>> >> >> +
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +     struct dentry *stat_dentry;
>> >> >> +#endif
>> >> >>  };
>> >> >>
>> >> >>  /*
>> >> >> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
>> >> >>       return true;
>> >> >>  }
>> >> >>
>> >> >> +
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +#include <linux/debugfs.h>
>> >> >
>> >> > A question:
>> >> > Why "#include" is here instead of top on the source file?
>> >>
>> >> Yes, the "#include ..." should be on the top of the source file.
>> >> I will modify it.
>> >>
>> >> >
>> >> >> +
>> >> >> +static struct dentry *zs_stat_root;
>> >> >> +
>> >> >> +static int __init zs_stat_init(void)
>> >> >> +{
>> >> >> +     if (!debugfs_initialized())
>> >> >> +             return -ENODEV;
>> >> >
>> >> > Do we need above check?
>> >> Yes, I think we need this check.
>> >> When debugfs module init failed, we should not go ahead here.
>> >>
>> >> >
>> >> > When I read comment of debugfs_create_dir, it says
>> >> > "If debugfs is not enabled in the kernel, the value -%ENODEV will be
>> >> > returned."
>> >>
>> >> This check is not for the situation when the debugfs is not enabled.
>> >> But for if we failed in debugfs_init(), then
>> >> we should not use any API of debugfs.
>> >>
>> >> And I think "-%ENODEV will be returned" means below code in "debugfs.h"
>> >>
>> >> static inline struct dentry *debugfs_create_dir(const char *name,
>> >> struct dentry *parent)
>> >> {
>> >>     return ERR_PTR(-ENODEV);
>> >> }
>> >>
>> >> >
>> >> >> +
>> >> >> +     zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
>> >> >> +     if (!zs_stat_root)
>> >> >> +             return -ENOMEM;
>> >> >
>> >> > On null return of debugfs_create_dir, it means always ENOMEM?
>> >>
>> >> Yes, you are right.  -ENOMEM is not the only reason for the failure.
>> >> But debugfs_create_dir does not bring back the errno.
>> >> And for zsmalloc, we indeed have the permission(-EPERM) to create the entry and
>> >> also we will not create duplicate(-EEXIST) entry in debufs.
>> >>
>> >> So, I think -ENOMEM is suitable.
>> >
>> > It seems you are saying why debugfs_create_dir can fail but zsmalloc will not
>> > fail by such reasons.
>> > I don't know the internal of debugfs_create_dir but description just says
>> > "If an error occurs, %NULL will be returned" but everyone returns ENOMEM
>> > blindly then. Hmm, I don't think it's good but I'm okay too because it's not
>> > our fault and others have been in there. :(
>> >
>> > When I look at zs_init, you don't propagate the error to user. Just missing?
>>
>> The statistics collection is not essential to the user, so I think we
>> do not need to break zsmalloc module loading when an error happens in
>> zs_stat_init()/debugfs_create_dir(). That is why I do not propagate the
>> error to the user and only print the message:
>>        pr_warn("zs stat initialization failed\n");
>
> Hmm? So you want it to work without stats even though the user asked for them?

It seems we should let the user know that something went wrong during
module loading.

In the beginning, I just considered this to be a debug option. When
developers enable this option on an end user's machine, it should not
affect zsmalloc's basic function.

> Does that mean every zs_stat_[inc|dec] call should check whether the stats
> were initialized properly?

The zs_stat_[inc|dec] functions only touch the fields in struct
size_class{}, so they do not need to check whether the stats were
initialized properly. But when zs_stat_init() fails, there is no way to
show these statistics to the user.
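
Put differently, with the helper scheme only the export side ever looks at
debugfs state; the bookkeeping side is unconditional. A rough illustration,
assuming the zs_stat_inc()/zs_stat_dec() helpers sketched earlier in the
thread:

	/*
	 * bookkeeping: touches only counters in struct size_class, under the
	 * class->lock the caller already holds, so it works even when debugfs
	 * never came up
	 */
	zs_stat_inc(class, OBJ_USED, 1);

	/* export: the only part that needs zs_stat_init() to have succeeded */
	if (!zs_stat_root) {
		ret = -ENODEV;	/* zs_pool_stat_create() simply bails out */
		goto out;
	}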

>
>>
>> Thanks.
>>
>> >
>> >>
>> >> >
>> >> >> +
>> >> >> +     return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static void __exit zs_stat_exit(void)
>> >> >> +{
>> >> >> +     debugfs_remove_recursive(zs_stat_root);
>> >> >> +}
>> >> >> +
>> >> >> +static int zs_stats_show(struct seq_file *s, void *v)
>> >> >> +{
>> >> >> +     int i;
>> >> >> +     struct zs_pool *pool = (struct zs_pool *)s->private;
>> >> >> +     struct size_class *class;
>> >> >> +     int objs_per_zspage;
>> >> >> +     unsigned long obj_allocated, obj_used, pages_used;
>> >> >> +     unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
>> >> >> +
>> >> >> +     seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
>> >> >> +                             "obj_allocated", "obj_used", "pages_used");
>> >> >> +
>> >> >> +     for (i = 0; i < zs_size_classes; i++) {
>> >> >> +             class = pool->size_class[i];
>> >> >> +
>> >> >> +             if (class->index != i)
>> >> >> +                     continue;
>> >> >> +
>> >> >> +             spin_lock(&class->lock);
>> >> >> +
>> >> >> +             obj_allocated = class->obj_allocated;
>> >> >> +             obj_used = class->obj_used;
>> >> >> +             objs_per_zspage = get_maxobj_per_zspage(class->size,
>> >> >> +                             class->pages_per_zspage);
>> >> >> +             pages_used = obj_allocated / objs_per_zspage *
>> >> >> +                             class->pages_per_zspage;
>> >> >
>> >> > I think we don't need to protect class->pages_per_zspage with class->lock.
>> >>
>> >> Yes, you are right.
>> >>
>> >> >
>> >> >> +
>> >> >> +             spin_unlock(&class->lock);
>> >> >> +
>> >> >> +             seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
>> >> >> +                                     obj_allocated, obj_used, pages_used);
>> >> >> +
>> >> >> +             total_objs += class->obj_allocated;
>> >> >> +             total_used_objs += class->obj_used;
>> >> >
>> >> > You couldn't access class->fields without class lock.
>> >> > Please, assign them into local variable under the lock and sum them without the lock.
>> >>
>> >> Got it. I will redo this.
>> >>
>> >> >
>> >> >> +             total_pages += pages_used;
>> >> >> +     }
>> >> >> +
>> >> >> +     seq_puts(s, "\n");
>> >> >> +     seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
>> >> >> +                     total_objs, total_used_objs, total_pages);
>> >> >> +
>> >> >> +     return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static int zs_stats_open(struct inode *inode, struct file *file)
>> >> >> +{
>> >> >> +     return single_open(file, zs_stats_show, inode->i_private);
>> >> >> +}
>> >> >> +
>> >> >> +static const struct file_operations zs_stats_operations = {
>> >> >> +     .open           = zs_stats_open,
>> >> >> +     .read           = seq_read,
>> >> >> +     .llseek         = seq_lseek,
>> >> >> +     .release        = single_release,
>> >> >> +};
>> >> >> +
>> >> >> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
>> >> >> +{
>> >> >> +     char name[10];
>> >> >> +     int ret = 0;
>> >> >> +
>> >> >> +     if (!zs_stat_root) {
>> >> >> +             ret = -ENODEV;
>> >> >> +             goto out;
>> >> >> +     }
>> >> >> +
>> >> >> +     snprintf(name, sizeof(name), "pool-%d", index);
>> >> >
>> >> > Hmm, how does the admin know which zsmalloc instance is associated with
>> >> > which block device?
>> >> > Maybe we need to export the zspool index to the client and print it
>> >> > when the pool is populated.
>> >>
>> >> Thanks for your suggestion.
>> >>
>> >> >
>> >> >> +     pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
>> >> >> +     if (!pool->stat_dentry) {
>> >> >> +             ret = -ENOMEM;
>> >> >> +             goto out;
>> >> >> +     }
>> >> >> +
>> >> >> +     debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
>> >> >> +                     pool->stat_dentry, pool, &zs_stats_operations);
>> >> >
>> >> > No need to check return?
>> >>
>> >> It is better to check the return value and give user some information
>> >> about the failure.
>> >>
>> >> >
>> >> >> +
>> >> >> +out:
>> >> >> +     return ret;
>> >> >> +}
>> >> >> +
>> >> >> +static void zs_pool_stat_destroy(struct zs_pool *pool)
>> >> >> +{
>> >> >> +     debugfs_remove_recursive(pool->stat_dentry);
>> >> >> +}
>> >> >> +
>> >> >> +#else /* CONFIG_ZSMALLOC_STAT */
>> >> >> +
>> >> >> +static int __init zs_stat_init(void)
>> >> >> +{
>> >> >> +     return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static void __exit zs_stat_exit(void) { }
>> >> >> +
>> >> >> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
>> >> >> +{
>> >> >> +     return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
>> >> >> +
>> >> >> +#endif
>> >> >> +
>> >> >>  unsigned long zs_get_total_pages(struct zs_pool *pool)
>> >> >>  {
>> >> >>       return atomic_long_read(&pool->pages_allocated);
>> >> >> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >> >>               atomic_long_add(class->pages_per_zspage,
>> >> >>                                       &pool->pages_allocated);
>> >> >>               spin_lock(&class->lock);
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +             class->obj_allocated += get_maxobj_per_zspage(class->size,
>> >> >> +                             class->pages_per_zspage);
>> >> >> +#endif
>> >> >
>> >> > I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());
>> >>
>> >> Got it. thanks
>> >>
>> >> >
>> >> >>       }
>> >> >>
>> >> >>       obj = (unsigned long)first_page->freelist;
>> >> >> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >> >>       kunmap_atomic(vaddr);
>> >> >>
>> >> >>       first_page->inuse++;
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +     class->obj_used++;
>> >> >> +#endif
>> >> >
>> >> > zs_stat_inc(class, OBJ_USED, 1)
>> >>
>> >> OK
>> >>
>> >> >
>> >> >
>> >> >>       /* Now move the zspage to another fullness group, if required */
>> >> >>       fix_fullness_group(pool, first_page);
>> >> >>       spin_unlock(&class->lock);
>> >> >> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> >> >>       first_page->freelist = (void *)obj;
>> >> >>
>> >> >>       first_page->inuse--;
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +     class->obj_used--;
>> >> >> +#endif
>> >> >
>> >> > zs_stat_dec(class, OBJ_USED, 1)
>> >>
>> >> OK
>> >>
>> >> >
>> >> >>       fullness = fix_fullness_group(pool, first_page);
>> >> >>       spin_unlock(&class->lock);
>> >> >>
>> >> >>       if (fullness == ZS_EMPTY) {
>> >> >>               atomic_long_sub(class->pages_per_zspage,
>> >> >>                               &pool->pages_allocated);
>> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
>> >> >> +             class->obj_allocated -= get_maxobj_per_zspage(class->size,
>> >> >> +                             class->pages_per_zspage);
>> >> >> +#endif
>> >> >
>> >> > zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());
>> >> >
>> >> >>               free_zspage(first_page);
>> >> >>       }
>> >> >>  }
>> >> >> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
>> >> >>       }
>> >> >>
>> >> >>       pool->flags = flags;
>> >> >> +     zs_pool_num++;
>> >> >
>> >> > Who protects the race?
>> >> > And maybe we should keep the index in the zspool and export it to the user
>> >> > to let them know which zsmalloc instance is theirs.
>> >>
>> >> OK
>> >>
>> >> >
>> >> >> +
>> >> >> +     if (zs_pool_stat_create(pool, zs_pool_num))
>> >> >> +             pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
>> >> >>
>> >> >>       return pool;
>> >> >>
>> >> >> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
>> >> >>               kfree(class);
>> >> >>       }
>> >> >>
>> >> >> +     zs_pool_stat_destroy(pool);
>> >> >> +     zs_pool_num--;
>> >> >> +
>> >> >>       kfree(pool->size_class);
>> >> >>       kfree(pool);
>> >> >>  }
>> >> >> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
>> >> >>  #ifdef CONFIG_ZPOOL
>> >> >>       zpool_register_driver(&zs_zpool_driver);
>> >> >>  #endif
>> >> >> +
>> >> >> +     if (zs_stat_init())
>> >> >> +             pr_warn("zs stat initialization failed\n");
>> >> >> +
>> >> >>       return 0;
>> >> >>  }
>> >> >>
>> >> >> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
>> >> >>       zpool_unregister_driver(&zs_zpool_driver);
>> >> >>  #endif
>> >> >>       zs_unregister_cpu_notifier();
>> >> >> +
>> >> >> +     zs_stat_exit();
>> >> >>  }
>> >> >>
>> >> >>  module_init(zs_init);
>> >> >> --
>> >> >> 1.7.9.5
>> >> >>
>> >>
>> >
>> > --
>> > Kind regards,
>> > Minchan Kim
>>
>
> --
> Kind regards,
> Minchan Kim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-19  1:49           ` Ganesh Mahendran
@ 2014-12-19  1:59             ` Minchan Kim
  0 siblings, 0 replies; 10+ messages in thread
From: Minchan Kim @ 2014-12-19  1:59 UTC (permalink / raw)
  To: Ganesh Mahendran; +Cc: Nitin Gupta, Andrew Morton, Linux-MM, linux-kernel

On Fri, Dec 19, 2014 at 09:49:29AM +0800, Ganesh Mahendran wrote:
> 2014-12-19 9:30 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> > On Fri, Dec 19, 2014 at 09:21:52AM +0800, Ganesh Mahendran wrote:
> >> Hello, Minchan
> >>
> >> 2014-12-19 7:44 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> >> > Hello Ganesh,
> >> >
> >> > On Wed, Dec 17, 2014 at 03:05:19PM +0800, Ganesh Mahendran wrote:
> >> >> Hello, Minchan
> >> >>
> >> >> Thanks for your review.
> >> >>
> >> >> 2014-12-16 10:45 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> >> >> > On Sat, Dec 13, 2014 at 09:45:14PM +0800, Ganesh Mahendran wrote:
> >> >> >> As a ram based memory allocator, keep the fragmentation in a low level
> >> >> >
> >> >> > Just say, zsmalloc.
> >> >>
> >> >> Ok.
> >> >>
> >> >> >
> >> >> >> is our target. But now we still need to add the debug code in zsmalloc
> >> >> >> to get the quantitative data.
> >> >> >>
> >> >> >> After the RFC patch [1], Minchan Kim gave some suggestions.
> >> >> >>   [1] https://patchwork.kernel.org/patch/5469301/
> >> >> >>
> >> >> >> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the statistics
> >> >> >> collection for developers. Currently only the objects information in each class
> >> >> >> are collected. User can get the information via debugfs. For example:
> >> >> >>
> >> >> >> After I copy file jdk-8u25-linux-x64.tar.gz to zram with ext4 filesystem.
> >> >> >>  class  size obj_allocated   obj_used pages_used
> >> >> >>      0    32             0          0          0
> >> >> >>      1    48           256         12          3
> >> >> >>      2    64            64         14          1
> >> >> >>      3    80            51          7          1
> >> >> >>      4    96           128          5          3
> >> >> >>      5   112            73          5          2
> >> >> >>      6   128            32          4          1
> >> >> >>      7   144             0          0          0
> >> >> >>      8   160             0          0          0
> >> >> >>      9   176             0          0          0
> >> >> >>     10   192             0          0          0
> >> >> >>     11   208             0          0          0
> >> >> >>     12   224             0          0          0
> >> >> >>     13   240             0          0          0
> >> >> >>     14   256            16          1          1
> >> >> >>     15   272            15          9          1
> >> >> >>     16   288             0          0          0
> >> >> >>     17   304             0          0          0
> >> >> >>     18   320             0          0          0
> >> >> >>     19   336             0          0          0
> >> >> >>     20   352             0          0          0
> >> >> >>     21   368             0          0          0
> >> >> >>     22   384             0          0          0
> >> >> >>     23   400             0          0          0
> >> >> >>     24   416             0          0          0
> >> >> >>     25   432             0          0          0
> >> >> >>     26   448             0          0          0
> >> >> >>     27   464             0          0          0
> >> >> >>     28   480             0          0          0
> >> >> >>     29   496            33          1          4
> >> >> >>     30   512             0          0          0
> >> >> >>     31   528             0          0          0
> >> >> >>     32   544             0          0          0
> >> >> >>     33   560             0          0          0
> >> >> >>     34   576             0          0          0
> >> >> >>     35   592             0          0          0
> >> >> >>     36   608             0          0          0
> >> >> >>     37   624             0          0          0
> >> >> >>     38   640             0          0          0
> >> >> >>     40   672             0          0          0
> >> >> >>     42   704             0          0          0
> >> >> >>     43   720            17          1          3
> >> >> >>     44   736             0          0          0
> >> >> >>     46   768             0          0          0
> >> >> >>     49   816             0          0          0
> >> >> >>     51   848             0          0          0
> >> >> >>     52   864            14          1          3
> >> >> >>     54   896             0          0          0
> >> >> >>     57   944            13          1          3
> >> >> >>     58   960             0          0          0
> >> >> >>     62  1024             4          1          1
> >> >> >>     66  1088            15          2          4
> >> >> >>     67  1104             0          0          0
> >> >> >>     71  1168             0          0          0
> >> >> >>     74  1216             0          0          0
> >> >> >>     76  1248             0          0          0
> >> >> >>     83  1360             3          1          1
> >> >> >>     91  1488            11          1          4
> >> >> >>     94  1536             0          0          0
> >> >> >>    100  1632             5          1          2
> >> >> >>    107  1744             0          0          0
> >> >> >>    111  1808             9          1          4
> >> >> >>    126  2048             4          4          2
> >> >> >>    144  2336             7          3          4
> >> >> >>    151  2448             0          0          0
> >> >> >>    168  2720            15         15         10
> >> >> >>    190  3072            28         27         21
> >> >> >>    202  3264             0          0          0
> >> >> >>    254  4096         36209      36209      36209
> >> >> >>
> >> >> >>  Total               37022      36326      36288
> >> >> >>
> >> >> >> We can see the overall fragmentation is:
> >> >> >>     (37022 - 36326) / 37022 = 1.87%
> >> >> >>
> >> >> >> Also from the statistics we know why we got such low fragmentation:
> >> >> >> Most of the objects are in class 254 with size 4096 bytes. The number of pages per
> >> >> >> zspage is 1, and there is only one object in a page. So, no fragmentation
> >> >> >> will be produced.
> >> >> >>
> >> >> >> Also we can collect other information and show it to user in the future.
> >> >> >
> >> >> > So, could you make zs
> >> >> Ok
> >> >> >>
> >> >> >> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
> >> >> >> Suggested-by: Minchan Kim <minchan@kernel.org>
> >> >> >> ---
> >> >> >>  mm/Kconfig    |   10 ++++
> >> >> >>  mm/zsmalloc.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >> >>  2 files changed, 174 insertions(+)
> >> >> >>
> >> >> >> diff --git a/mm/Kconfig b/mm/Kconfig
> >> >> >> index 1d1ae6b..95c5728 100644
> >> >> >> --- a/mm/Kconfig
> >> >> >> +++ b/mm/Kconfig
> >> >> >> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
> >> >> >>         You can check speed with zsmalloc benchmark:
> >> >> >>         https://github.com/spartacus06/zsmapbench
> >> >> >>
> >> >> >> +config ZSMALLOC_STAT
> >> >> >> +     bool "Export zsmalloc statistics"
> >> >> >> +     depends on ZSMALLOC
> >> >> >> +     select DEBUG_FS
> >> >> >> +     help
> >> >> >> +       This option enables code in the zsmalloc to collect various
> >> >> >> +       statistics about whats happening in zsmalloc and exports that
> >> >> >> +       information to userspace via debugfs.
> >> >> >> +       If unsure, say N.
> >> >> >> +
> >> >> >>  config GENERIC_EARLY_IOREMAP
> >> >> >>       bool
> >> >> >>
> >> >> >> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >> >> >> index b724039..a8d0020 100644
> >> >> >> --- a/mm/zsmalloc.c
> >> >> >> +++ b/mm/zsmalloc.c
> >> >> >> @@ -168,6 +168,8 @@ enum fullness_group {
> >> >> >>       ZS_FULL
> >> >> >>  };
> >> >> >>
> >> >> >> +static int zs_pool_num;
> >> >> >
> >> >> > What's this? What protects the race?
> >> >> It is the pool index. Yes, there is problem here.
> >> >> I will change it to atomic and increased every time a new zs pool created.
> >> >> And then name the /sys/kernel/debug/pool-x using this index.
> >> >>
> >> >> static atomic_t zs_pool_index = ATOMIC_INIT(0);
> >> >> ...
> >> >> pool->index = atomic_inc_return(&zs_pool_index);
> >> >>
> >> >> > It means description.
> >> >> >
> >> >> >> +
> >> >> >>  /*
> >> >> >>   * number of size_classes
> >> >> >>   */
> >> >> >> @@ -200,6 +202,11 @@ struct size_class {
> >> >> >>       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
> >> >> >>       int pages_per_zspage;
> >> >> >>
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +     unsigned long obj_allocated;
> >> >> >> +     unsigned long obj_used;
> >> >> >> +#endif
> >> >> >
> >> >> > I perfer creating new struct.
> >> >> >
> >> >> > struct zs_size_stat {
> >> >> >         unsigend long obj_allocated;
> >> >> >         unsignged long obj_used;
> >> >> > };
> >> >>
> >> >> Got it, I will redo this.
> >> >>
> >> >> >
> >> >> >> +
> >> >> >>       spinlock_t lock;
> >> >> >>
> >> >> >>       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
> >> >> >> @@ -221,6 +228,10 @@ struct zs_pool {
> >> >> >>
> >> >> >>       gfp_t flags;    /* allocation flags used when growing pool */
> >> >> >>       atomic_long_t pages_allocated;
> >> >> >> +
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +     struct dentry *stat_dentry;
> >> >> >> +#endif
> >> >> >>  };
> >> >> >>
> >> >> >>  /*
> >> >> >> @@ -942,6 +953,132 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
> >> >> >>       return true;
> >> >> >>  }
> >> >> >>
> >> >> >> +
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +#include <linux/debugfs.h>
> >> >> >
> >> >> > A question:
> >> >> > Why "#include" is here instead of top on the source file?
> >> >>
> >> >> Yes, the "#include ..." should be on the top of the source file.
> >> >> I will modify it.
> >> >>
> >> >> >
> >> >> >> +
> >> >> >> +static struct dentry *zs_stat_root;
> >> >> >> +
> >> >> >> +static int __init zs_stat_init(void)
> >> >> >> +{
> >> >> >> +     if (!debugfs_initialized())
> >> >> >> +             return -ENODEV;
> >> >> >
> >> >> > Do we need above check?
> >> >> Yes, I think we need this check.
> >> >> When debugfs module init failed, we should not go ahead here.
> >> >>
> >> >> >
> >> >> > When I read comment of debugfs_create_dir, it says
> >> >> > "If debugfs is not enabled in the kernel, the value -%ENODEV will be
> >> >> > returned."
> >> >>
> >> >> This check is not for the situation when debugfs is not enabled,
> >> >> but for the case where debugfs_init() failed; then
> >> >> we should not use any debugfs API.
> >> >>
> >> >> And I think "-%ENODEV will be returned" means below code in "debugfs.h"
> >> >>
> >> >> static inline struct dentry *debugfs_create_dir(const char *name,
> >> >> struct dentry *parent)
> >> >> {
> >> >>     return ERR_PTR(-ENODEV);
> >> >> }
> >> >>
> >> >> >
> >> >> >> +
> >> >> >> +     zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
> >> >> >> +     if (!zs_stat_root)
> >> >> >> +             return -ENOMEM;
> >> >> >
> >> >> > On null return of debugfs_create_dir, it means always ENOMEM?
> >> >>
> >> >> Yes, you are right.  -ENOMEM is not the only reason for the failure.
> >> >> But debugfs_create_dir does not return the errno.
> >> >> And for zsmalloc, we indeed have the permission (-EPERM) to create the entry and
> >> >> also we will not create a duplicate (-EEXIST) entry in debugfs.
> >> >>
> >> >> So, I think -ENOMEM is suitable.
> >> >
> >> > It seems you are saying that debugfs_create_dir can fail but zsmalloc will not
> >> > fail for such reasons.
> >> > I don't know the internals of debugfs_create_dir, but the description just says
> >> > "If an error occurs, %NULL will be returned", yet everyone returns ENOMEM
> >> > blindly. Hmm, I don't think it's good, but I'm okay with it because it's not
> >> > our fault and others have done the same. :(
> >> >
> >> > When I look at zs_init, you don't propagate the error to user. Just missing?
> >>
> >> The statistics collection is not essential to the user, so I think we
> >> do not need to break the zsmalloc module loading when an error happens in
> >> zs_stat_init()/debugfs_create_dir().
> >> So I do not propagate the error to the user and just give the message:
> >>        pr_warn("zs stat initialization failed\n");
> >
> > Hmm? So you want it to work without stats even though the user wants them?
> 
> It seems we should let the user know that something went wrong
> during module loading.

True.

> 
> In the beginning, I considered it only a debug option.
> When developers enable
> this option on an end-user's machine, it should not affect zsmalloc's
> basic function.
> 
> > It means every zs_stat_[inc|dec] should check if it was initialized properly?
> 
> The zs_stat_[inc|dec] functions only handle the fields in struct size_class{},
> so we do not need to check whether it was initialized properly. But when
> zs_stat_init() fails, there is no way to show these statistics to the user.

Indeed.

Thanks for the quick response!

> 
> >
> >>
> >> Thanks.
> >>
> >> >
> >> >>
> >> >> >
> >> >> >> +
> >> >> >> +     return 0;
> >> >> >> +}
> >> >> >> +
> >> >> >> +static void __exit zs_stat_exit(void)
> >> >> >> +{
> >> >> >> +     debugfs_remove_recursive(zs_stat_root);
> >> >> >> +}
> >> >> >> +
> >> >> >> +static int zs_stats_show(struct seq_file *s, void *v)
> >> >> >> +{
> >> >> >> +     int i;
> >> >> >> +     struct zs_pool *pool = (struct zs_pool *)s->private;
> >> >> >> +     struct size_class *class;
> >> >> >> +     int objs_per_zspage;
> >> >> >> +     unsigned long obj_allocated, obj_used, pages_used;
> >> >> >> +     unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
> >> >> >> +
> >> >> >> +     seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
> >> >> >> +                             "obj_allocated", "obj_used", "pages_used");
> >> >> >> +
> >> >> >> +     for (i = 0; i < zs_size_classes; i++) {
> >> >> >> +             class = pool->size_class[i];
> >> >> >> +
> >> >> >> +             if (class->index != i)
> >> >> >> +                     continue;
> >> >> >> +
> >> >> >> +             spin_lock(&class->lock);
> >> >> >> +
> >> >> >> +             obj_allocated = class->obj_allocated;
> >> >> >> +             obj_used = class->obj_used;
> >> >> >> +             objs_per_zspage = get_maxobj_per_zspage(class->size,
> >> >> >> +                             class->pages_per_zspage);
> >> >> >> +             pages_used = obj_allocated / objs_per_zspage *
> >> >> >> +                             class->pages_per_zspage;
> >> >> >
> >> >> > I think we don't need to protect class->pages_per_zspage with class->lock.
> >> >>
> >> >> Yes, you are right.
> >> >>
> >> >> >
> >> >> >> +
> >> >> >> +             spin_unlock(&class->lock);
> >> >> >> +
> >> >> >> +             seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i, class->size,
> >> >> >> +                                     obj_allocated, obj_used, pages_used);
> >> >> >> +
> >> >> >> +             total_objs += class->obj_allocated;
> >> >> >> +             total_used_objs += class->obj_used;
> >> >> >
> >> >> > You couldn't access class->fields without class lock.
> >> >> > Please, assign them into local variable under the lock and sum them without the lock.
> >> >>
> >> >> Got it. I will redo this.
> >> >>
> >> >> >
> >> >> >> +             total_pages += pages_used;
> >> >> >> +     }
> >> >> >> +
> >> >> >> +     seq_puts(s, "\n");
> >> >> >> +     seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
> >> >> >> +                     total_objs, total_used_objs, total_pages);
> >> >> >> +
> >> >> >> +     return 0;
> >> >> >> +}
> >> >> >> +
> >> >> >> +static int zs_stats_open(struct inode *inode, struct file *file)
> >> >> >> +{
> >> >> >> +     return single_open(file, zs_stats_show, inode->i_private);
> >> >> >> +}
> >> >> >> +
> >> >> >> +static const struct file_operations zs_stats_operations = {
> >> >> >> +     .open           = zs_stats_open,
> >> >> >> +     .read           = seq_read,
> >> >> >> +     .llseek         = seq_lseek,
> >> >> >> +     .release        = single_release,
> >> >> >> +};
> >> >> >> +
> >> >> >> +static int zs_pool_stat_create(struct zs_pool *pool, int index)
> >> >> >> +{
> >> >> >> +     char name[10];
> >> >> >> +     int ret = 0;
> >> >> >> +
> >> >> >> +     if (!zs_stat_root) {
> >> >> >> +             ret = -ENODEV;
> >> >> >> +             goto out;
> >> >> >> +     }
> >> >> >> +
> >> >> >> +     snprintf(name, sizeof(name), "pool-%d", index);
> >> >> >
> >> >> > Hmm, how does the admin know which zsmalloc instance is associated with
> >> >> > which block device?
> >> >> > Maybe we need to export the zspool index to the client and print it
> >> >> > when the pool is populated.
> >> >>
> >> >> Thanks for your suggestion.
> >> >>
> >> >> >
> >> >> >> +     pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
> >> >> >> +     if (!pool->stat_dentry) {
> >> >> >> +             ret = -ENOMEM;
> >> >> >> +             goto out;
> >> >> >> +     }
> >> >> >> +
> >> >> >> +     debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
> >> >> >> +                     pool->stat_dentry, pool, &zs_stats_operations);
> >> >> >
> >> >> > No need to check return?
> >> >>
> >> >> It is better to check the return value and give user some information
> >> >> about the failure.
> >> >>
> >> >> >
> >> >> >> +
> >> >> >> +out:
> >> >> >> +     return ret;
> >> >> >> +}
> >> >> >> +
> >> >> >> +static void zs_pool_stat_destroy(struct zs_pool *pool)
> >> >> >> +{
> >> >> >> +     debugfs_remove_recursive(pool->stat_dentry);
> >> >> >> +}
> >> >> >> +
> >> >> >> +#else /* CONFIG_ZSMALLOC_STAT */
> >> >> >> +
> >> >> >> +static int __init zs_stat_init(void)
> >> >> >> +{
> >> >> >> +     return 0;
> >> >> >> +}
> >> >> >> +
> >> >> >> +static void __exit zs_stat_exit(void) { }
> >> >> >> +
> >> >> >> +static inline int zs_pool_stat_create(struct zs_pool *pool, int index)
> >> >> >> +{
> >> >> >> +     return 0;
> >> >> >> +}
> >> >> >> +
> >> >> >> +static inline void zs_pool_stat_destroy(struct zs_pool *pool) { }
> >> >> >> +
> >> >> >> +#endif
> >> >> >> +
> >> >> >>  unsigned long zs_get_total_pages(struct zs_pool *pool)
> >> >> >>  {
> >> >> >>       return atomic_long_read(&pool->pages_allocated);
> >> >> >> @@ -1075,6 +1212,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >> >>               atomic_long_add(class->pages_per_zspage,
> >> >> >>                                       &pool->pages_allocated);
> >> >> >>               spin_lock(&class->lock);
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +             class->obj_allocated += get_maxobj_per_zspage(class->size,
> >> >> >> +                             class->pages_per_zspage);
> >> >> >> +#endif
> >> >> >
> >> >> > I prefer zs_stat_inc(class, OBJ_ALLOCATED, get_max_obj());
> >> >>
> >> >> Got it. thanks
> >> >>
> >> >> >
> >> >> >>       }
> >> >> >>
> >> >> >>       obj = (unsigned long)first_page->freelist;
> >> >> >> @@ -1088,6 +1229,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >> >>       kunmap_atomic(vaddr);
> >> >> >>
> >> >> >>       first_page->inuse++;
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +     class->obj_used++;
> >> >> >> +#endif
> >> >> >
> >> >> > zs_stat_inc(class, OBJ_USED, 1)
> >> >>
> >> >> OK
> >> >>
> >> >> >
> >> >> >
> >> >> >>       /* Now move the zspage to another fullness group, if required */
> >> >> >>       fix_fullness_group(pool, first_page);
> >> >> >>       spin_unlock(&class->lock);
> >> >> >> @@ -1127,12 +1271,19 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> >> >>       first_page->freelist = (void *)obj;
> >> >> >>
> >> >> >>       first_page->inuse--;
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +     class->obj_used--;
> >> >> >> +#endif
> >> >> >
> >> >> > zs_stat_dec(class, OBJ_USED, 1)
> >> >>
> >> >> OK
> >> >>
> >> >> >
> >> >> >>       fullness = fix_fullness_group(pool, first_page);
> >> >> >>       spin_unlock(&class->lock);
> >> >> >>
> >> >> >>       if (fullness == ZS_EMPTY) {
> >> >> >>               atomic_long_sub(class->pages_per_zspage,
> >> >> >>                               &pool->pages_allocated);
> >> >> >> +#ifdef CONFIG_ZSMALLOC_STAT
> >> >> >> +             class->obj_allocated -= get_maxobj_per_zspage(class->size,
> >> >> >> +                             class->pages_per_zspage);
> >> >> >> +#endif
> >> >> >
> >> >> > zs_stat_dec(class, OBJ_ALLOCATED, get_max_obj());
> >> >> >
> >> >> >>               free_zspage(first_page);
> >> >> >>       }
> >> >> >>  }
> >> >> >> @@ -1209,6 +1360,10 @@ struct zs_pool *zs_create_pool(gfp_t flags)
> >> >> >>       }
> >> >> >>
> >> >> >>       pool->flags = flags;
> >> >> >> +     zs_pool_num++;
> >> >> >
> >> >> > Who protects the race?
> >> >> > And maybe we should keep the index in the zspool and export it to the user
> >> >> > to let them know which zsmalloc instance is theirs.
> >> >>
> >> >> OK
> >> >>
> >> >> >
> >> >> >> +
> >> >> >> +     if (zs_pool_stat_create(pool, zs_pool_num))
> >> >> >> +             pr_warn("zs pool %d stat initialization failed\n", zs_pool_num);
> >> >> >>
> >> >> >>       return pool;
> >> >> >>
> >> >> >> @@ -1241,6 +1396,9 @@ void zs_destroy_pool(struct zs_pool *pool)
> >> >> >>               kfree(class);
> >> >> >>       }
> >> >> >>
> >> >> >> +     zs_pool_stat_destroy(pool);
> >> >> >> +     zs_pool_num--;
> >> >> >> +
> >> >> >>       kfree(pool->size_class);
> >> >> >>       kfree(pool);
> >> >> >>  }
> >> >> >> @@ -1260,6 +1418,10 @@ static int __init zs_init(void)
> >> >> >>  #ifdef CONFIG_ZPOOL
> >> >> >>       zpool_register_driver(&zs_zpool_driver);
> >> >> >>  #endif
> >> >> >> +
> >> >> >> +     if (zs_stat_init())
> >> >> >> +             pr_warn("zs stat initialization failed\n");
> >> >> >> +
> >> >> >>       return 0;
> >> >> >>  }
> >> >> >>
> >> >> >> @@ -1269,6 +1431,8 @@ static void __exit zs_exit(void)
> >> >> >>       zpool_unregister_driver(&zs_zpool_driver);
> >> >> >>  #endif
> >> >> >>       zs_unregister_cpu_notifier();
> >> >> >> +
> >> >> >> +     zs_stat_exit();
> >> >> >>  }
> >> >> >>
> >> >> >>  module_init(zs_init);
> >> >> >> --
> >> >> >> 1.7.9.5
> >> >> >>
> >> >>
> >> >
> >> > --
> >> > Kind regards,
> >> > Minchan Kim
> >>
> >
> > --
> > Kind regards,
> > Minchan Kim
> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] mm/zsmalloc: add statistics support
  2014-12-26 13:06 Ganesh Mahendran
@ 2014-12-29  0:10 ` Minchan Kim
  0 siblings, 0 replies; 10+ messages in thread
From: Minchan Kim @ 2014-12-29  0:10 UTC (permalink / raw)
  To: Ganesh Mahendran; +Cc: ngupta, akpm, linux-mm, linux-kernel

On Fri, Dec 26, 2014 at 09:06:20PM +0800, Ganesh Mahendran wrote:
> Keeping fragmentation of zsmalloc at a low level is our target. But for now
> we still need to add debug code in zsmalloc to get quantitative data.
> 
> This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the
> statistics collection for developers. Currently only the object statistics
> in each class are collected. Users can get the information via debugfs.
>      cat /sys/kernel/debug/zsmalloc/zram0/...
> 
> For example:
> 
> After I copied "jdk-8u25-linux-x64.tar.gz" to zram with ext4 filesystem:
>  class  size obj_allocated   obj_used pages_used
>      0    32             0          0          0
>      1    48           256         12          3
>      2    64            64         14          1
>      3    80            51          7          1
>      4    96           128          5          3
>      5   112            73          5          2
>      6   128            32          4          1
>      7   144             0          0          0
>      8   160             0          0          0
>      9   176             0          0          0
>     10   192             0          0          0
>     11   208             0          0          0
>     12   224             0          0          0
>     13   240             0          0          0
>     14   256            16          1          1
>     15   272            15          9          1
>     16   288             0          0          0
>     17   304             0          0          0
>     18   320             0          0          0
>     19   336             0          0          0
>     20   352             0          0          0
>     21   368             0          0          0
>     22   384             0          0          0
>     23   400             0          0          0
>     24   416             0          0          0
>     25   432             0          0          0
>     26   448             0          0          0
>     27   464             0          0          0
>     28   480             0          0          0
>     29   496            33          1          4
>     30   512             0          0          0
>     31   528             0          0          0
>     32   544             0          0          0
>     33   560             0          0          0
>     34   576             0          0          0
>     35   592             0          0          0
>     36   608             0          0          0
>     37   624             0          0          0
>     38   640             0          0          0
>     40   672             0          0          0
>     42   704             0          0          0
>     43   720            17          1          3
>     44   736             0          0          0
>     46   768             0          0          0
>     49   816             0          0          0
>     51   848             0          0          0
>     52   864            14          1          3
>     54   896             0          0          0
>     57   944            13          1          3
>     58   960             0          0          0
>     62  1024             4          1          1
>     66  1088            15          2          4
>     67  1104             0          0          0
>     71  1168             0          0          0
>     74  1216             0          0          0
>     76  1248             0          0          0
>     83  1360             3          1          1
>     91  1488            11          1          4
>     94  1536             0          0          0
>    100  1632             5          1          2
>    107  1744             0          0          0
>    111  1808             9          1          4
>    126  2048             4          4          2
>    144  2336             7          3          4
>    151  2448             0          0          0
>    168  2720            15         15         10
>    190  3072            28         27         21
>    202  3264             0          0          0
>    254  4096         36209      36209      36209
> 
>  Total               37022      36326      36288
> 
> We can calculate the overall fragmentation from the last line:
>     Total               37022      36326      36288
>     (37022 - 36326) / 37022 = 1.87%
> 
> Also, by analysing the objects allocated in every class, we know why we got such low fragmentation:
>     Most of the allocated objects are in <class 254>, and there is only 1 page in a class
>     254 zspage.  So, no fragmentation will be introduced by allocating objs in class 254.
> 
> And in the future, we can collect other zsmalloc statistics as we need and analyse them.
> 
> Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
> Suggested-by: Minchan Kim <minchan@kernel.org>
> Cc: Nitin Gupta <ngupta@vflare.org>

If you fix the minor issues below,
Acked-by: Minchan Kim <minchan@kernel.org>

> ---
>  mm/Kconfig    |   10 +++
>  mm/zsmalloc.c |  244 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 250 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 1d1ae6b..95c5728 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -601,6 +601,16 @@ config PGTABLE_MAPPING
>  	  You can check speed with zsmalloc benchmark:
>  	  https://github.com/spartacus06/zsmapbench
>  
> +config ZSMALLOC_STAT
> +	bool "Export zsmalloc statistics"
> +	depends on ZSMALLOC
> +	select DEBUG_FS
> +	help
> +	  This option enables code in the zsmalloc to collect various
> +	  statistics about whats happening in zsmalloc and exports that
> +	  information to userspace via debugfs.
> +	  If unsure, say N.
> +
>  config GENERIC_EARLY_IOREMAP
>  	bool
>  
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 2359e61..f5ae7c6 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -91,6 +91,7 @@
>  #include <linux/hardirq.h>
>  #include <linux/spinlock.h>
>  #include <linux/types.h>
> +#include <linux/debugfs.h>
>  #include <linux/zsmalloc.h>
>  #include <linux/zpool.h>
>  
> @@ -168,6 +169,22 @@ enum fullness_group {
>  	ZS_FULL
>  };
>  
> +enum zs_stat_type {
> +	OBJ_ALLOCATED,
> +	OBJ_USED,
> +};
> +
> +#ifdef CONFIG_ZSMALLOC_STAT
> +
> +static struct dentry *zs_stat_root;
> +
> +struct zs_size_stat {
> +	unsigned long obj_allocated;
> +	unsigned long obj_used;
> +};

I hope you make it an array so that we can use zs_stat_type as the index.
That would remove the if-conditions in zs_stat_xxx.
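
Something like this (an untested sketch; NR_ZS_STAT_TYPE is just a name I made
up for the array bound):

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	NR_ZS_STAT_TYPE,
};

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	return class->stats.objs[type];
}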

> +
> +#endif
> +
>  /*
>   * number of size_classes
>   */
> @@ -200,6 +217,10 @@ struct size_class {
>  	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
>  	int pages_per_zspage;
>  
> +#ifdef CONFIG_ZSMALLOC_STAT
> +	struct zs_size_stat stats;
> +#endif
> +
>  	spinlock_t lock;
>  
>  	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
> @@ -217,10 +238,16 @@ struct link_free {
>  };
>  
>  struct zs_pool {
> +	char *name;
> +
>  	struct size_class **size_class;
>  
>  	gfp_t flags;	/* allocation flags used when growing pool */
>  	atomic_long_t pages_allocated;
> +
> +#ifdef CONFIG_ZSMALLOC_STAT
> +	struct dentry *stat_dentry;
> +#endif
>  };
>  
>  /*
> @@ -942,6 +969,177 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
>  	return true;
>  }
>  
> +#ifdef CONFIG_ZSMALLOC_STAT
> +
> +static inline void zs_stat_inc(struct size_class *class,
> +				enum zs_stat_type type, unsigned long cnt)
> +{
> +	if (type == OBJ_ALLOCATED)
> +		class->stats.obj_allocated += cnt;
> +	else if (type == OBJ_USED)
> +		class->stats.obj_used += cnt;
> +}
> +
> +static inline void zs_stat_dec(struct size_class *class,
> +				enum zs_stat_type type, unsigned long cnt)
> +{
> +	if (type == OBJ_ALLOCATED)
> +		class->stats.obj_allocated -= cnt;
> +	else if (type == OBJ_USED)
> +		class->stats.obj_used -= cnt;
> +}
> +
> +static inline unsigned long zs_stat_get(struct size_class *class,
> +				enum zs_stat_type type)
> +{
> +	if (type == OBJ_ALLOCATED)
> +		return class->stats.obj_allocated;
> +	else if (type == OBJ_USED)
> +		return class->stats.obj_used;
> +
> +	return 0;
> +}
> +
> +static int __init zs_stat_init(void)
> +{
> +	if (!debugfs_initialized())
> +		return -ENODEV;
> +
> +	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
> +	if (!zs_stat_root)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void __exit zs_stat_exit(void)
> +{
> +	debugfs_remove_recursive(zs_stat_root);
> +}
> +
> +static int zs_stats_size_show(struct seq_file *s, void *v)
> +{
> +	int i;
> +	struct zs_pool *pool = s->private;
> +	struct size_class *class;
> +	int objs_per_zspage;
> +	unsigned long obj_allocated, obj_used, pages_used;
> +	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
> +
> +	seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
> +				"obj_allocated", "obj_used", "pages_used");
> +
> +	for (i = 0; i < zs_size_classes; i++) {
> +		class = pool->size_class[i];
> +
> +		if (class->index != i)
> +			continue;
> +
> +		spin_lock(&class->lock);
> +		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
> +		obj_used = zs_stat_get(class, OBJ_USED);
> +		spin_unlock(&class->lock);
> +
> +		objs_per_zspage = get_maxobj_per_zspage(class->size,
> +				class->pages_per_zspage);
> +		pages_used = obj_allocated / objs_per_zspage *
> +				class->pages_per_zspage;
> +
> +		seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i,
> +			class->size, obj_allocated, obj_used, pages_used);
> +
> +		total_objs += obj_allocated;
> +		total_used_objs += obj_used;
> +		total_pages += pages_used;
> +	}
> +
> +	seq_puts(s, "\n");
> +	seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
> +			total_objs, total_used_objs, total_pages);
> +
> +	return 0;
> +}
> +
> +static int zs_stats_size_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, zs_stats_size_show, inode->i_private);
> +}
> +
> +static const struct file_operations zs_stat_size_ops = {
> +	.open           = zs_stats_size_open,
> +	.read           = seq_read,
> +	.llseek         = seq_lseek,
> +	.release        = single_release,
> +};
> +
> +static int zs_pool_stat_create(char *name, struct zs_pool *pool)
> +{
> +	struct dentry *entry;
> +
> +	if (!zs_stat_root)
> +		return -ENODEV;
> +
> +	entry = debugfs_create_dir(name, zs_stat_root);
> +	if (!entry) {
> +		pr_warn("debugfs dir <%s> creation failed\n", name);
> +		return -ENOMEM;
> +	}
> +	pool->stat_dentry = entry;
> +
> +	entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
> +			pool->stat_dentry, pool, &zs_stat_size_ops);
> +	if (!entry) {
> +		pr_warn("%s: debugfs file entry <%s> creation failed\n",
> +				name, "obj_in_classes");
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +static void zs_pool_stat_destroy(struct zs_pool *pool)
> +{
> +	debugfs_remove_recursive(pool->stat_dentry);
> +}
> +
> +#else /* CONFIG_ZSMALLOC_STAT */
> +
> +static inline void zs_stat_inc(struct size_class *class,
> +				enum zs_stat_type type, unsigned long cnt)
> +{
> +}
> +
> +static inline void zs_stat_dec(struct size_class *class,
> +				enum zs_stat_type type, unsigned long cnt)
> +{
> +}
> +
> +static inline unsigned long zs_stat_get(struct size_class *class,
> +				enum zs_stat_type type)
> +{
> +	return 0;
> +}
> +
> +static int __init zs_stat_init(void)
> +{
> +	return 0;
> +}
> +
> +static void __exit zs_stat_exit(void)
> +{
> +}
> +
> +static inline int zs_pool_stat_create(struct zs_pool *pool)

                                         char *name, struct zs_pool *pool
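
i.e. the !CONFIG_ZSMALLOC_STAT stub should take the same arguments as the real
function, roughly:

static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
{
	return 0;
}

Otherwise the build breaks with CONFIG_ZSMALLOC_STAT disabled, because
zs_create_pool() calls zs_pool_stat_create(name, pool).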

Thanks for the nice work, Ganesh!

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/2] mm/zsmalloc: add statistics support
@ 2014-12-26 13:06 Ganesh Mahendran
  2014-12-29  0:10 ` Minchan Kim
  0 siblings, 1 reply; 10+ messages in thread
From: Ganesh Mahendran @ 2014-12-26 13:06 UTC (permalink / raw)
  To: minchan, ngupta; +Cc: akpm, linux-mm, linux-kernel, Ganesh Mahendran

Keeping fragmentation of zsmalloc at a low level is our target. But for now
we still need to add debug code in zsmalloc to get quantitative data.

This patch adds a new configuration CONFIG_ZSMALLOC_STAT to enable the
statistics collection for developers. Currently only the object statistics
in each class are collected. Users can get the information via debugfs.
     cat /sys/kernel/debug/zsmalloc/zram0/...

For example:

After I copied "jdk-8u25-linux-x64.tar.gz" to zram with ext4 filesystem:
 class  size obj_allocated   obj_used pages_used
     0    32             0          0          0
     1    48           256         12          3
     2    64            64         14          1
     3    80            51          7          1
     4    96           128          5          3
     5   112            73          5          2
     6   128            32          4          1
     7   144             0          0          0
     8   160             0          0          0
     9   176             0          0          0
    10   192             0          0          0
    11   208             0          0          0
    12   224             0          0          0
    13   240             0          0          0
    14   256            16          1          1
    15   272            15          9          1
    16   288             0          0          0
    17   304             0          0          0
    18   320             0          0          0
    19   336             0          0          0
    20   352             0          0          0
    21   368             0          0          0
    22   384             0          0          0
    23   400             0          0          0
    24   416             0          0          0
    25   432             0          0          0
    26   448             0          0          0
    27   464             0          0          0
    28   480             0          0          0
    29   496            33          1          4
    30   512             0          0          0
    31   528             0          0          0
    32   544             0          0          0
    33   560             0          0          0
    34   576             0          0          0
    35   592             0          0          0
    36   608             0          0          0
    37   624             0          0          0
    38   640             0          0          0
    40   672             0          0          0
    42   704             0          0          0
    43   720            17          1          3
    44   736             0          0          0
    46   768             0          0          0
    49   816             0          0          0
    51   848             0          0          0
    52   864            14          1          3
    54   896             0          0          0
    57   944            13          1          3
    58   960             0          0          0
    62  1024             4          1          1
    66  1088            15          2          4
    67  1104             0          0          0
    71  1168             0          0          0
    74  1216             0          0          0
    76  1248             0          0          0
    83  1360             3          1          1
    91  1488            11          1          4
    94  1536             0          0          0
   100  1632             5          1          2
   107  1744             0          0          0
   111  1808             9          1          4
   126  2048             4          4          2
   144  2336             7          3          4
   151  2448             0          0          0
   168  2720            15         15         10
   190  3072            28         27         21
   202  3264             0          0          0
   254  4096         36209      36209      36209

 Total               37022      36326      36288

We can calculate the overall fragmentation from the last line:
    Total               37022      36326      36288
    (37022 - 36326) / 37022 = 1.87%

Also, by analysing the objects allocated in every class, we know why we got such low fragmentation:
    Most of the allocated objects are in <class 254>, and there is only 1 page in a class
    254 zspage.  So, no fragmentation will be introduced by allocating objs in class 254.
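
As a rough cross-check of the pages_used column (assuming 4 KB pages, and that
class 190 packs 4 objects of size 3072 into each 3-page zspage): the 28
allocated objects above account for 28 / 4 * 3 = 21 pages, matching the table.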

And in the future, we can collect other zsmalloc statistics as we need and analyse them.

Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
---
 mm/Kconfig    |   10 +++
 mm/zsmalloc.c |  244 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 250 insertions(+), 4 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b..95c5728 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -601,6 +601,16 @@ config PGTABLE_MAPPING
 	  You can check speed with zsmalloc benchmark:
 	  https://github.com/spartacus06/zsmapbench
 
+config ZSMALLOC_STAT
+	bool "Export zsmalloc statistics"
+	depends on ZSMALLOC
+	select DEBUG_FS
+	help
+	  This option enables code in the zsmalloc to collect various
+	  statistics about whats happening in zsmalloc and exports that
+	  information to userspace via debugfs.
+	  If unsure, say N.
+
 config GENERIC_EARLY_IOREMAP
 	bool
 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2359e61..f5ae7c6 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
 #include <linux/hardirq.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/debugfs.h>
 #include <linux/zsmalloc.h>
 #include <linux/zpool.h>
 
@@ -168,6 +169,22 @@ enum fullness_group {
 	ZS_FULL
 };
 
+enum zs_stat_type {
+	OBJ_ALLOCATED,
+	OBJ_USED,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static struct dentry *zs_stat_root;
+
+struct zs_size_stat {
+	unsigned long obj_allocated;
+	unsigned long obj_used;
+};
+
+#endif
+
 /*
  * number of size_classes
  */
@@ -200,6 +217,10 @@ struct size_class {
 	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
 	int pages_per_zspage;
 
+#ifdef CONFIG_ZSMALLOC_STAT
+	struct zs_size_stat stats;
+#endif
+
 	spinlock_t lock;
 
 	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
 };
 
 struct zs_pool {
+	char *name;
+
 	struct size_class **size_class;
 
 	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+	struct dentry *stat_dentry;
+#endif
 };
 
 /*
@@ -942,6 +969,177 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 	return true;
 }
 
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+				enum zs_stat_type type, unsigned long cnt)
+{
+	if (type == OBJ_ALLOCATED)
+		class->stats.obj_allocated += cnt;
+	else if (type == OBJ_USED)
+		class->stats.obj_used += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+				enum zs_stat_type type, unsigned long cnt)
+{
+	if (type == OBJ_ALLOCATED)
+		class->stats.obj_allocated -= cnt;
+	else if (type == OBJ_USED)
+		class->stats.obj_used -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+				enum zs_stat_type type)
+{
+	if (type == OBJ_ALLOCATED)
+		return class->stats.obj_allocated;
+	else if (type == OBJ_USED)
+		return class->stats.obj_used;
+
+	return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+	if (!zs_stat_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+	debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+	int i;
+	struct zs_pool *pool = s->private;
+	struct size_class *class;
+	int objs_per_zspage;
+	unsigned long obj_allocated, obj_used, pages_used;
+	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+	seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
+				"obj_allocated", "obj_used", "pages_used");
+
+	for (i = 0; i < zs_size_classes; i++) {
+		class = pool->size_class[i];
+
+		if (class->index != i)
+			continue;
+
+		spin_lock(&class->lock);
+		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+		obj_used = zs_stat_get(class, OBJ_USED);
+		spin_unlock(&class->lock);
+
+		objs_per_zspage = get_maxobj_per_zspage(class->size,
+				class->pages_per_zspage);
+		pages_used = obj_allocated / objs_per_zspage *
+				class->pages_per_zspage;
+
+		seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i,
+			class->size, obj_allocated, obj_used, pages_used);
+
+		total_objs += obj_allocated;
+		total_used_objs += obj_used;
+		total_pages += pages_used;
+	}
+
+	seq_puts(s, "\n");
+	seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
+			total_objs, total_used_objs, total_pages);
+
+	return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+	.open           = zs_stats_size_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+	struct dentry *entry;
+
+	if (!zs_stat_root)
+		return -ENODEV;
+
+	entry = debugfs_create_dir(name, zs_stat_root);
+	if (!entry) {
+		pr_warn("debugfs dir <%s> creation failed\n", name);
+		return -ENOMEM;
+	}
+	pool->stat_dentry = entry;
+
+	entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
+			pool->stat_dentry, pool, &zs_stat_size_ops);
+	if (!entry) {
+		pr_warn("%s: debugfs file entry <%s> creation failed\n",
+				name, "obj_in_classes");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+	debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+				enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+				enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+				enum zs_stat_type type)
+{
+	return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+	return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(struct zs_pool *pool)
+{
+	return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
 unsigned long zs_get_total_pages(struct zs_pool *pool)
 {
 	return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1272,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
 		atomic_long_add(class->pages_per_zspage,
 					&pool->pages_allocated);
+
 		spin_lock(&class->lock);
+		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+				class->size, class->pages_per_zspage));
 	}
 
 	obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1289,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	kunmap_atomic(vaddr);
 
 	first_page->inuse++;
+	zs_stat_inc(class, OBJ_USED, 1);
 	/* Now move the zspage to another fullness group, if required */
 	fix_fullness_group(pool, first_page);
 	spin_unlock(&class->lock);
@@ -1128,6 +1330,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
 
 	first_page->inuse--;
 	fullness = fix_fullness_group(pool, first_page);
+
+	zs_stat_dec(class, OBJ_USED, 1);
+	if (fullness == ZS_EMPTY)
+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+				class->size, class->pages_per_zspage));
+
 	spin_unlock(&class->lock);
 
 	if (fullness == ZS_EMPTY) {
@@ -1158,9 +1366,16 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
 	if (!pool)
 		return NULL;
 
+	pool->name = kstrdup(name, GFP_KERNEL);
+	if (!pool->name) {
+		kfree(pool);
+		return NULL;
+	}
+
 	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
 			GFP_KERNEL);
 	if (!pool->size_class) {
+		kfree(pool->name);
 		kfree(pool);
 		return NULL;
 	}
@@ -1210,6 +1425,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
 
 	pool->flags = flags;
 
+	if (zs_pool_stat_create(name, pool))
+		goto err;
+
 	return pool;
 
 err:
@@ -1222,6 +1440,8 @@ void zs_destroy_pool(struct zs_pool *pool)
 {
 	int i;
 
+	zs_pool_stat_destroy(pool);
+
 	for (i = 0; i < zs_size_classes; i++) {
 		int fg;
 		struct size_class *class = pool->size_class[i];
@@ -1242,6 +1462,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 	}
 
 	kfree(pool->size_class);
+	kfree(pool->name);
 	kfree(pool);
 }
 EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1471,30 @@ static int __init zs_init(void)
 {
 	int ret = zs_register_cpu_notifier();
 
-	if (ret) {
-		zs_unregister_cpu_notifier();
-		return ret;
-	}
+	if (ret)
+		goto notifier_fail;
 
 	init_zs_size_classes();
 
 #ifdef CONFIG_ZPOOL
 	zpool_register_driver(&zs_zpool_driver);
 #endif
+
+	ret = zs_stat_init();
+	if (ret) {
+		pr_err("zs stat initialization failed\n");
+		goto stat_fail;
+	}
 	return 0;
+
+stat_fail:
+#ifdef CONFIG_ZPOOL
+	zpool_unregister_driver(&zs_zpool_driver);
+#endif
+notifier_fail:
+	zs_unregister_cpu_notifier();
+
+	return ret;
 }
 
 static void __exit zs_exit(void)
@@ -1269,6 +1503,8 @@ static void __exit zs_exit(void)
 	zpool_unregister_driver(&zs_zpool_driver);
 #endif
 	zs_unregister_cpu_notifier();
+
+	zs_stat_exit();
 }
 
 module_init(zs_init);
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-12-29  0:08 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-13 13:45 [PATCH 2/2] mm/zsmalloc: add statistics support Ganesh Mahendran
2014-12-16  2:45 ` Minchan Kim
2014-12-17  7:05   ` Ganesh Mahendran
2014-12-18 23:44     ` Minchan Kim
2014-12-19  1:21       ` Ganesh Mahendran
2014-12-19  1:30         ` Minchan Kim
2014-12-19  1:49           ` Ganesh Mahendran
2014-12-19  1:59             ` Minchan Kim
2014-12-26 13:06 Ganesh Mahendran
2014-12-29  0:10 ` Minchan Kim
