All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] zram: switch to per-cpu compression streams
@ 2016-04-28 16:17 Sergey Senozhatsky
  2016-04-28 16:17 ` [PATCH 1/2] zsmalloc: require GFP in zs_malloc() Sergey Senozhatsky
  2016-04-28 16:17 ` [PATCH 2/2] zram: user per-cpu compression streams Sergey Senozhatsky
  0 siblings, 2 replies; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-04-28 16:17 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, Sergey Senozhatsky, Sergey Senozhatsky

Hello,

The patchset drops the idle compression streams list from zcomp, so
zram now uses per-cpu compression streams. This removes two
spin_lock()/spin_unlock() calls from the write path and prevents
a write OP from being preempted while holding an active compression
stream, which improves performance (according to the tests, see the
commit messages) and simplifies the code.

against next-20160428

Sergey Senozhatsky (2):
  zsmalloc: require GFP in zs_malloc()
  zram: user per-cpu compression streams

 drivers/block/zram/zcomp.c    | 293 ++++++++++++------------------------------
 drivers/block/zram/zcomp.h    |  12 +-
 drivers/block/zram/zram_drv.c |  34 ++++-
 include/linux/zsmalloc.h      |   2 +-
 mm/zsmalloc.c                 |  15 +--
 5 files changed, 119 insertions(+), 237 deletions(-)

-- 
2.8.0

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/2] zsmalloc: require GFP in zs_malloc()
  2016-04-28 16:17 [PATCH 0/2] zram: switch to per-cpu compression streams Sergey Senozhatsky
@ 2016-04-28 16:17 ` Sergey Senozhatsky
  2016-04-29  5:44   ` Minchan Kim
  2016-04-28 16:17 ` [PATCH 2/2] zram: user per-cpu compression streams Sergey Senozhatsky
  1 sibling, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-04-28 16:17 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, Sergey Senozhatsky, Sergey Senozhatsky

Pass GFP flags to zs_malloc() instead of using a fixed set
(supplied during pool creation), so we can be more flexible,
but, more importantly, this will be needed to switch zram to
per-cpu compression streams.

Apart from that, this also aligns the zs_malloc() interface with
zpool/zbud.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
---
 drivers/block/zram/zram_drv.c |  2 +-
 include/linux/zsmalloc.h      |  2 +-
 mm/zsmalloc.c                 | 15 ++++++---------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f7..9030992 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			src = uncmem;
 	}
 
-	handle = zs_malloc(meta->mem_pool, clen);
+	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
 	if (!handle) {
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 34eb160..6d89f8b 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -44,7 +44,7 @@ struct zs_pool;
 struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
 void zs_destroy_pool(struct zs_pool *pool);
 
-unsigned long zs_malloc(struct zs_pool *pool, size_t size);
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
 void zs_free(struct zs_pool *pool, unsigned long obj);
 
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index a0890e9..2c22aff 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -247,7 +247,6 @@ struct zs_pool {
 	struct size_class **size_class;
 	struct kmem_cache *handle_cachep;
 
-	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
 
 	struct zs_pool_stats stats;
@@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
 	kmem_cache_destroy(pool->handle_cachep);
 }
 
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
 	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-		pool->flags & ~__GFP_HIGHMEM);
+			gfp & ~__GFP_HIGHMEM);
 }
 
 static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -335,7 +334,7 @@ static void zs_zpool_destroy(void *pool)
 static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
 			unsigned long *handle)
 {
-	*handle = zs_malloc(pool, size);
+	*handle = zs_malloc(pool, size, gfp);
 	return *handle ? 0 : -1;
 }
 static void zs_zpool_free(void *pool, unsigned long handle)
@@ -1391,7 +1390,7 @@ static unsigned long obj_malloc(struct size_class *class,
  * otherwise 0.
  * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
  */
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 {
 	unsigned long handle, obj;
 	struct size_class *class;
@@ -1400,7 +1399,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
 		return 0;
 
-	handle = alloc_handle(pool);
+	handle = alloc_handle(pool, gfp);
 	if (!handle)
 		return 0;
 
@@ -1413,7 +1412,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 
 	if (!first_page) {
 		spin_unlock(&class->lock);
-		first_page = alloc_zspage(class, pool->flags);
+		first_page = alloc_zspage(class, gfp);
 		if (unlikely(!first_page)) {
 			free_handle(pool, handle);
 			return 0;
@@ -1945,8 +1944,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 		prev_class = class;
 	}
 
-	pool->flags = flags;
-
 	if (zs_pool_stat_create(pool, name))
 		goto err;
 
-- 
2.8.0

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 2/2] zram: user per-cpu compression streams
  2016-04-28 16:17 [PATCH 0/2] zram: switch to per-cpu compression streams Sergey Senozhatsky
  2016-04-28 16:17 ` [PATCH 1/2] zsmalloc: require GFP in zs_malloc() Sergey Senozhatsky
@ 2016-04-28 16:17 ` Sergey Senozhatsky
  2016-05-02  6:23   ` Minchan Kim
  1 sibling, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-04-28 16:17 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, Sergey Senozhatsky, Sergey Senozhatsky

Remove the idle streams list and keep compression streams in per-cpu
data. This removes two contended spin_lock()/spin_unlock() calls
from the write path and also prevents a write OP from being preempted
while holding the compression stream, which can cause slowdowns.

For instance, let's assume that we have N cpus and N-2
max_comp_streams. TASK1 owns the last idle stream, TASK2-TASK3 come
in with the write requests:

  TASK1            TASK2              TASK3
 zram_bvec_write()
  spin_lock
  find stream
  spin_unlock

  compress

  <<preempted>>   zram_bvec_write()
                   spin_lock
                   find stream
                   spin_unlock
                     no_stream
                       schedule
                                     zram_bvec_write()
                                      spin_lock
                                      find_stream
                                      spin_unlock
                                        no_stream
                                          schedule
   spin_lock
   release stream
   spin_unlock
     wake up TASK2

Not only will TASK2 and TASK3 not get the stream, TASK1 will be
preempted in the middle of its operation; while we would prefer
it to finish compression and release the stream.

Test environment: x86_64, 4 CPU box, 3G zram, lzo

The following fio tests were executed:
      read, randread, write, randwrite, rw, randrw
with the increasing number of jobs from 1 to 10.

                4 streams        8 streams       per-cpu
===========================================================
jobs1
READ:           2520.1MB/s       2566.5MB/s      2491.5MB/s
READ:           2102.7MB/s       2104.2MB/s      2091.3MB/s
WRITE:          1355.1MB/s       1320.2MB/s      1378.9MB/s
WRITE:          1103.5MB/s       1097.2MB/s      1122.5MB/s
READ:           434013KB/s       435153KB/s      439961KB/s
WRITE:          433969KB/s       435109KB/s      439917KB/s
READ:           403166KB/s       405139KB/s      403373KB/s
WRITE:          403223KB/s       405197KB/s      403430KB/s
jobs2
READ:           7958.6MB/s       8105.6MB/s      8073.7MB/s
READ:           6864.9MB/s       6989.8MB/s      7021.8MB/s
WRITE:          2438.1MB/s       2346.9MB/s      3400.2MB/s
WRITE:          1994.2MB/s       1990.3MB/s      2941.2MB/s
READ:           981504KB/s       973906KB/s      1018.8MB/s
WRITE:          981659KB/s       974060KB/s      1018.1MB/s
READ:           937021KB/s       938976KB/s      987250KB/s
WRITE:          934878KB/s       936830KB/s      984993KB/s
jobs3
READ:           13280MB/s        13553MB/s       13553MB/s
READ:           11534MB/s        11785MB/s       11755MB/s
WRITE:          3456.9MB/s       3469.9MB/s      4810.3MB/s
WRITE:          3029.6MB/s       3031.6MB/s      4264.8MB/s
READ:           1363.8MB/s       1362.6MB/s      1448.9MB/s
WRITE:          1361.9MB/s       1360.7MB/s      1446.9MB/s
READ:           1309.4MB/s       1310.6MB/s      1397.5MB/s
WRITE:          1307.4MB/s       1308.5MB/s      1395.3MB/s
jobs4
READ:           20244MB/s        20177MB/s       20344MB/s
READ:           17886MB/s        17913MB/s       17835MB/s
WRITE:          4071.6MB/s       4046.1MB/s      6370.2MB/s
WRITE:          3608.9MB/s       3576.3MB/s      5785.4MB/s
READ:           1824.3MB/s       1821.6MB/s      1997.5MB/s
WRITE:          1819.8MB/s       1817.4MB/s      1992.5MB/s
READ:           1765.7MB/s       1768.3MB/s      1937.3MB/s
WRITE:          1767.5MB/s       1769.1MB/s      1939.2MB/s
jobs5
READ:           18663MB/s        18986MB/s       18823MB/s
READ:           16659MB/s        16605MB/s       16954MB/s
WRITE:          3912.4MB/s       3888.7MB/s      6126.9MB/s
WRITE:          3506.4MB/s       3442.5MB/s      5519.3MB/s
READ:           1798.2MB/s       1746.5MB/s      1935.8MB/s
WRITE:          1792.7MB/s       1740.7MB/s      1929.1MB/s
READ:           1727.6MB/s       1658.2MB/s      1917.3MB/s
WRITE:          1726.5MB/s       1657.2MB/s      1916.6MB/s
jobs6
READ:           21017MB/s        20922MB/s       21162MB/s
READ:           19022MB/s        19140MB/s       18770MB/s
WRITE:          3968.2MB/s       4037.7MB/s      6620.8MB/s
WRITE:          3643.5MB/s       3590.2MB/s      6027.5MB/s
READ:           1871.8MB/s       1880.5MB/s      2049.9MB/s
WRITE:          1867.8MB/s       1877.2MB/s      2046.2MB/s
READ:           1755.8MB/s       1710.3MB/s      1964.7MB/s
WRITE:          1750.5MB/s       1705.9MB/s      1958.8MB/s
jobs7
READ:           21103MB/s        20677MB/s       21482MB/s
READ:           18522MB/s        18379MB/s       19443MB/s
WRITE:          4022.5MB/s       4067.4MB/s      6755.9MB/s
WRITE:          3691.7MB/s       3695.5MB/s      5925.6MB/s
READ:           1841.5MB/s       1933.9MB/s      2090.5MB/s
WRITE:          1842.7MB/s       1935.3MB/s      2091.9MB/s
READ:           1832.4MB/s       1856.4MB/s      1971.5MB/s
WRITE:          1822.3MB/s       1846.2MB/s      1960.6MB/s
jobs8
READ:           20463MB/s        20194MB/s       20862MB/s
READ:           18178MB/s        17978MB/s       18299MB/s
WRITE:          4085.9MB/s       4060.2MB/s      7023.8MB/s
WRITE:          3776.3MB/s       3737.9MB/s      6278.2MB/s
READ:           1957.6MB/s       1944.4MB/s      2109.5MB/s
WRITE:          1959.2MB/s       1946.2MB/s      2111.4MB/s
READ:           1900.6MB/s       1885.7MB/s      2082.1MB/s
WRITE:          1896.2MB/s       1881.4MB/s      2078.3MB/s
jobs9
READ:           19692MB/s        19734MB/s       19334MB/s
READ:           17678MB/s        18249MB/s       17666MB/s
WRITE:          4004.7MB/s       4064.8MB/s      6990.7MB/s
WRITE:          3724.7MB/s       3772.1MB/s      6193.6MB/s
READ:           1953.7MB/s       1967.3MB/s      2105.6MB/s
WRITE:          1953.4MB/s       1966.7MB/s      2104.1MB/s
READ:           1860.4MB/s       1897.4MB/s      2068.5MB/s
WRITE:          1858.9MB/s       1895.9MB/s      2066.8MB/s
jobs10
READ:           19730MB/s        19579MB/s       19492MB/s
READ:           18028MB/s        18018MB/s       18221MB/s
WRITE:          4027.3MB/s       4090.6MB/s      7020.1MB/s
WRITE:          3810.5MB/s       3846.8MB/s      6426.8MB/s
READ:           1956.1MB/s       1994.6MB/s      2145.2MB/s
WRITE:          1955.9MB/s       1993.5MB/s      2144.8MB/s
READ:           1852.8MB/s       1911.6MB/s      2075.8MB/s
WRITE:          1855.7MB/s       1914.6MB/s      2078.1MB/s

perf stat

                                4 streams                       8 streams                       per-cpu
====================================================================================================================
jobs1
stalled-cycles-frontend      23,174,811,209 (  38.21%)     23,220,254,188 (  38.25%)       23,061,406,918 (  38.34%)
stalled-cycles-backend       11,514,174,638 (  18.98%)     11,696,722,657 (  19.27%)       11,370,852,810 (  18.90%)
instructions                 73,925,005,782 (    1.22)     73,903,177,632 (    1.22)       73,507,201,037 (    1.22)
branches                     14,455,124,835 ( 756.063)     14,455,184,779 ( 755.281)       14,378,599,509 ( 758.546)
branch-misses                    69,801,336 (   0.48%)         80,225,529 (   0.55%)           72,044,726 (   0.50%)
jobs2
stalled-cycles-frontend      49,912,741,782 (  46.11%)     50,101,189,290 (  45.95%)       32,874,195,633 (  35.11%)
stalled-cycles-backend       27,080,366,230 (  25.02%)     27,949,970,232 (  25.63%)       16,461,222,706 (  17.58%)
instructions                122,831,629,690 (    1.13)    122,919,846,419 (    1.13)      121,924,786,775 (    1.30)
branches                     23,725,889,239 ( 692.663)     23,733,547,140 ( 688.062)       23,553,950,311 ( 794.794)
branch-misses                    90,733,041 (   0.38%)         96,320,895 (   0.41%)           84,561,092 (   0.36%)
jobs3
stalled-cycles-frontend      66,437,834,608 (  45.58%)     63,534,923,344 (  43.69%)       42,101,478,505 (  33.19%)
stalled-cycles-backend       34,940,799,661 (  23.97%)     34,774,043,148 (  23.91%)       21,163,324,388 (  16.68%)
instructions                171,692,121,862 (    1.18)    171,775,373,044 (    1.18)      170,353,542,261 (    1.34)
branches                     32,968,962,622 ( 628.723)     32,987,739,894 ( 630.512)       32,729,463,918 ( 717.027)
branch-misses                   111,522,732 (   0.34%)        110,472,894 (   0.33%)           99,791,291 (   0.30%)
jobs4
stalled-cycles-frontend      98,741,701,675 (  49.72%)     94,797,349,965 (  47.59%)       54,535,655,381 (  33.53%)
stalled-cycles-backend       54,642,609,615 (  27.51%)     55,233,554,408 (  27.73%)       27,882,323,541 (  17.14%)
instructions                220,884,807,851 (    1.11)    220,930,887,273 (    1.11)      218,926,845,851 (    1.35)
branches                     42,354,518,180 ( 592.105)     42,362,770,587 ( 590.452)       41,955,552,870 ( 716.154)
branch-misses                   138,093,449 (   0.33%)        131,295,286 (   0.31%)          121,794,771 (   0.29%)
jobs5
stalled-cycles-frontend     116,219,747,212 (  48.14%)    110,310,397,012 (  46.29%)       66,373,082,723 (  33.70%)
stalled-cycles-backend       66,325,434,776 (  27.48%)     64,157,087,914 (  26.92%)       32,999,097,299 (  16.76%)
instructions                270,615,008,466 (    1.12)    270,546,409,525 (    1.14)      268,439,910,948 (    1.36)
branches                     51,834,046,557 ( 599.108)     51,811,867,722 ( 608.883)       51,412,576,077 ( 729.213)
branch-misses                   158,197,086 (   0.31%)        142,639,805 (   0.28%)          133,425,455 (   0.26%)
jobs6
stalled-cycles-frontend     138,009,414,492 (  48.23%)    139,063,571,254 (  48.80%)       75,278,568,278 (  32.80%)
stalled-cycles-backend       79,211,949,650 (  27.68%)     79,077,241,028 (  27.75%)       37,735,797,899 (  16.44%)
instructions                319,763,993,731 (    1.12)    319,937,782,834 (    1.12)      316,663,600,784 (    1.38)
branches                     61,219,433,294 ( 595.056)     61,250,355,540 ( 598.215)       60,523,446,617 ( 733.706)
branch-misses                   169,257,123 (   0.28%)        154,898,028 (   0.25%)          141,180,587 (   0.23%)
jobs7
stalled-cycles-frontend     162,974,812,119 (  49.20%)    159,290,061,987 (  48.43%)       88,046,641,169 (  33.21%)
stalled-cycles-backend       92,223,151,661 (  27.84%)     91,667,904,406 (  27.87%)       44,068,454,971 (  16.62%)
instructions                369,516,432,430 (    1.12)    369,361,799,063 (    1.12)      365,290,380,661 (    1.38)
branches                     70,795,673,950 ( 594.220)     70,743,136,124 ( 597.876)       69,803,996,038 ( 732.822)
branch-misses                   181,708,327 (   0.26%)        165,767,821 (   0.23%)          150,109,797 (   0.22%)
jobs8
stalled-cycles-frontend     185,000,017,027 (  49.30%)    182,334,345,473 (  48.37%)       99,980,147,041 (  33.26%)
stalled-cycles-backend      105,753,516,186 (  28.18%)    107,937,830,322 (  28.63%)       51,404,177,181 (  17.10%)
instructions                418,153,161,055 (    1.11)    418,308,565,828 (    1.11)      413,653,475,581 (    1.38)
branches                     80,035,882,398 ( 592.296)     80,063,204,510 ( 589.843)       79,024,105,589 ( 730.530)
branch-misses                   199,764,528 (   0.25%)        177,936,926 (   0.22%)          160,525,449 (   0.20%)
jobs9
stalled-cycles-frontend     210,941,799,094 (  49.63%)    204,714,679,254 (  48.55%)      114,251,113,756 (  33.96%)
stalled-cycles-backend      122,640,849,067 (  28.85%)    122,188,553,256 (  28.98%)       58,360,041,127 (  17.35%)
instructions                468,151,025,415 (    1.10)    467,354,869,323 (    1.11)      462,665,165,216 (    1.38)
branches                     89,657,067,510 ( 585.628)     89,411,550,407 ( 588.990)       88,360,523,943 ( 730.151)
branch-misses                   218,292,301 (   0.24%)        191,701,247 (   0.21%)          178,535,678 (   0.20%)
jobs10
stalled-cycles-frontend     233,595,958,008 (  49.81%)    227,540,615,689 (  49.11%)      160,341,979,938 (  43.07%)
stalled-cycles-backend      136,153,676,021 (  29.03%)    133,635,240,742 (  28.84%)       65,909,135,465 (  17.70%)
instructions                517,001,168,497 (    1.10)    516,210,976,158 (    1.11)      511,374,038,613 (    1.37)
branches                     98,911,641,329 ( 585.796)     98,700,069,712 ( 591.583)       97,646,761,028 ( 728.712)
branch-misses                   232,341,823 (   0.23%)        199,256,308 (   0.20%)          183,135,268 (   0.19%)

per-cpu streams tend to cause significantly fewer stalled cycles;
they execute fewer branches and hit fewer branch-misses.

perf stat reported execution time

                        4 streams        8 streams       per-cpu
====================================================================
jobs1
seconds elapsed        20.909073870     20.875670495    20.817838540
jobs2
seconds elapsed        18.529488399     18.720566469    16.356103108
jobs3
seconds elapsed        18.991159531     18.991340812    16.766216066
jobs4
seconds elapsed        19.560643828     19.551323547    16.246621715
jobs5
seconds elapsed        24.746498464     25.221646740    20.696112444
jobs6
seconds elapsed        28.258181828     28.289765505    22.885688857
jobs7
seconds elapsed        32.632490241     31.909125381    26.272753738
jobs8
seconds elapsed        35.651403851     36.027596308    29.108024711
jobs9
seconds elapsed        40.569362365     40.024227989    32.898204012
jobs10
seconds elapsed        44.673112304     43.874898137    35.632952191

Please see
  Link: http://marc.info/?l=linux-kernel&m=146166970727530
  Link: http://marc.info/?l=linux-kernel&m=146174716719650
for more test results (under low memory conditions).

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zcomp.c    | 293 ++++++++++++------------------------------
 drivers/block/zram/zcomp.h    |  12 +-
 drivers/block/zram/zram_drv.c |  34 ++++-
 3 files changed, 112 insertions(+), 227 deletions(-)

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 3ef42e5..d4159e4 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
+#include <linux/cpu.h>
 
 #include "zcomp.h"
 #include "zcomp_lzo.h"
@@ -20,29 +21,6 @@
 #include "zcomp_lz4.h"
 #endif
 
-/*
- * single zcomp_strm backend
- */
-struct zcomp_strm_single {
-	struct mutex strm_lock;
-	struct zcomp_strm *zstrm;
-};
-
-/*
- * multi zcomp_strm backend
- */
-struct zcomp_strm_multi {
-	/* protect strm list */
-	spinlock_t strm_lock;
-	/* max possible number of zstrm streams */
-	int max_strm;
-	/* number of available zstrm streams */
-	int avail_strm;
-	/* list of available strms */
-	struct list_head idle_strm;
-	wait_queue_head_t strm_wait;
-};
-
 static struct zcomp_backend *backends[] = {
 	&zcomp_lzo,
 #ifdef CONFIG_ZRAM_LZ4_COMPRESS
@@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
 	return zstrm;
 }
 
-/*
- * get idle zcomp_strm or wait until other process release
- * (zcomp_strm_release()) one for us
- */
-static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-	struct zcomp_strm *zstrm;
-
-	while (1) {
-		spin_lock(&zs->strm_lock);
-		if (!list_empty(&zs->idle_strm)) {
-			zstrm = list_entry(zs->idle_strm.next,
-					struct zcomp_strm, list);
-			list_del(&zstrm->list);
-			spin_unlock(&zs->strm_lock);
-			return zstrm;
-		}
-		/* zstrm streams limit reached, wait for idle stream */
-		if (zs->avail_strm >= zs->max_strm) {
-			spin_unlock(&zs->strm_lock);
-			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
-			continue;
-		}
-		/* allocate new zstrm stream */
-		zs->avail_strm++;
-		spin_unlock(&zs->strm_lock);
-		/*
-		 * This function can be called in swapout/fs write path
-		 * so we can't use GFP_FS|IO. And it assumes we already
-		 * have at least one stream in zram initialization so we
-		 * don't do best effort to allocate more stream in here.
-		 * A default stream will work well without further multiple
-		 * streams. That's why we use NORETRY | NOWARN.
-		 */
-		zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY |
-					__GFP_NOWARN);
-		if (!zstrm) {
-			spin_lock(&zs->strm_lock);
-			zs->avail_strm--;
-			spin_unlock(&zs->strm_lock);
-			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
-			continue;
-		}
-		break;
-	}
-	return zstrm;
-}
-
-/* add stream back to idle list and wake up waiter or free the stream */
-static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-
-	spin_lock(&zs->strm_lock);
-	if (zs->avail_strm <= zs->max_strm) {
-		list_add(&zstrm->list, &zs->idle_strm);
-		spin_unlock(&zs->strm_lock);
-		wake_up(&zs->strm_wait);
-		return;
-	}
-
-	zs->avail_strm--;
-	spin_unlock(&zs->strm_lock);
-	zcomp_strm_free(comp, zstrm);
-}
-
-/* change max_strm limit */
-static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-	struct zcomp_strm *zstrm;
-
-	spin_lock(&zs->strm_lock);
-	zs->max_strm = num_strm;
-	/*
-	 * if user has lowered the limit and there are idle streams,
-	 * immediately free as much streams (and memory) as we can.
-	 */
-	while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
-		zstrm = list_entry(zs->idle_strm.next,
-				struct zcomp_strm, list);
-		list_del(&zstrm->list);
-		zcomp_strm_free(comp, zstrm);
-		zs->avail_strm--;
-	}
-	spin_unlock(&zs->strm_lock);
-	return true;
-}
-
-static void zcomp_strm_multi_destroy(struct zcomp *comp)
-{
-	struct zcomp_strm_multi *zs = comp->stream;
-	struct zcomp_strm *zstrm;
-
-	while (!list_empty(&zs->idle_strm)) {
-		zstrm = list_entry(zs->idle_strm.next,
-				struct zcomp_strm, list);
-		list_del(&zstrm->list);
-		zcomp_strm_free(comp, zstrm);
-	}
-	kfree(zs);
-}
-
-static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
-{
-	struct zcomp_strm *zstrm;
-	struct zcomp_strm_multi *zs;
-
-	comp->destroy = zcomp_strm_multi_destroy;
-	comp->strm_find = zcomp_strm_multi_find;
-	comp->strm_release = zcomp_strm_multi_release;
-	comp->set_max_streams = zcomp_strm_multi_set_max_streams;
-	zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
-	if (!zs)
-		return -ENOMEM;
-
-	comp->stream = zs;
-	spin_lock_init(&zs->strm_lock);
-	INIT_LIST_HEAD(&zs->idle_strm);
-	init_waitqueue_head(&zs->strm_wait);
-	zs->max_strm = max_strm;
-	zs->avail_strm = 1;
-
-	zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
-	if (!zstrm) {
-		kfree(zs);
-		return -ENOMEM;
-	}
-	list_add(&zstrm->list, &zs->idle_strm);
-	return 0;
-}
-
-static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
-{
-	struct zcomp_strm_single *zs = comp->stream;
-	mutex_lock(&zs->strm_lock);
-	return zs->zstrm;
-}
-
-static void zcomp_strm_single_release(struct zcomp *comp,
-		struct zcomp_strm *zstrm)
-{
-	struct zcomp_strm_single *zs = comp->stream;
-	mutex_unlock(&zs->strm_lock);
-}
-
-static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
-{
-	/* zcomp_strm_single support only max_comp_streams == 1 */
-	return false;
-}
-
-static void zcomp_strm_single_destroy(struct zcomp *comp)
-{
-	struct zcomp_strm_single *zs = comp->stream;
-	zcomp_strm_free(comp, zs->zstrm);
-	kfree(zs);
-}
-
-static int zcomp_strm_single_create(struct zcomp *comp)
-{
-	struct zcomp_strm_single *zs;
-
-	comp->destroy = zcomp_strm_single_destroy;
-	comp->strm_find = zcomp_strm_single_find;
-	comp->strm_release = zcomp_strm_single_release;
-	comp->set_max_streams = zcomp_strm_single_set_max_streams;
-	zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
-	if (!zs)
-		return -ENOMEM;
-
-	comp->stream = zs;
-	mutex_init(&zs->strm_lock);
-	zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
-	if (!zs->zstrm) {
-		kfree(zs);
-		return -ENOMEM;
-	}
-	return 0;
-}
-
 /* show available compressors */
 ssize_t zcomp_available_show(const char *comp, char *buf)
 {
@@ -301,17 +97,17 @@ bool zcomp_available_algorithm(const char *comp)
 
 bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
 {
-	return comp->set_max_streams(comp, num_strm);
+	return true;
 }
 
 struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
 {
-	return comp->strm_find(comp);
+	return *get_cpu_ptr(comp->stream);
 }
 
 void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
 {
-	comp->strm_release(comp, zstrm);
+	put_cpu_ptr(comp->stream);
 }
 
 int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
@@ -327,9 +123,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
 	return comp->backend->decompress(src, src_len, dst);
 }
 
+static int __zcomp_cpu_notifier(struct zcomp *comp,
+		unsigned long action, unsigned long cpu)
+{
+	struct zcomp_strm *zstrm;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
+			break;
+		zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
+		if (IS_ERR_OR_NULL(zstrm)) {
+			pr_err("Can't allocate a compression stream\n");
+			return NOTIFY_BAD;
+		}
+		*per_cpu_ptr(comp->stream, cpu) = zstrm;
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		zstrm = *per_cpu_ptr(comp->stream, cpu);
+		if (!IS_ERR_OR_NULL(zstrm))
+			zcomp_strm_free(comp, zstrm);
+		*per_cpu_ptr(comp->stream, cpu) = NULL;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int zcomp_cpu_notifier(struct notifier_block *nb,
+		unsigned long action, void *pcpu)
+{
+	unsigned long cpu = (unsigned long)pcpu;
+	struct zcomp *comp = container_of(nb, typeof(*comp), notifier);
+
+	return __zcomp_cpu_notifier(comp, action, cpu);
+}
+
+static int zcomp_init(struct zcomp *comp)
+{
+	unsigned long cpu;
+	int ret;
+
+	comp->notifier.notifier_call = zcomp_cpu_notifier;
+
+	comp->stream = alloc_percpu(struct zcomp_strm *);
+	if (!comp->stream)
+		return -ENOMEM;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu) {
+		ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu);
+		if (ret == NOTIFY_BAD)
+			goto cleanup;
+	}
+	__register_cpu_notifier(&comp->notifier);
+	cpu_notifier_register_done();
+	return 0;
+
+cleanup:
+	for_each_online_cpu(cpu)
+		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+	cpu_notifier_register_done();
+	return -ENOMEM;
+}
+
 void zcomp_destroy(struct zcomp *comp)
 {
-	comp->destroy(comp);
+	unsigned long cpu;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu)
+		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+	__unregister_cpu_notifier(&comp->notifier);
+	cpu_notifier_register_done();
+
+	free_percpu(comp->stream);
 	kfree(comp);
 }
 
@@ -356,10 +226,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)
 		return ERR_PTR(-ENOMEM);
 
 	comp->backend = backend;
-	if (max_strm > 1)
-		error = zcomp_strm_multi_create(comp, max_strm);
-	else
-		error = zcomp_strm_single_create(comp);
+	error = zcomp_init(comp);
 	if (error) {
 		kfree(comp);
 		return ERR_PTR(error);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index b7d2a4b..aba8c21 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -10,8 +10,6 @@
 #ifndef _ZCOMP_H_
 #define _ZCOMP_H_
 
-#include <linux/mutex.h>
-
 struct zcomp_strm {
 	/* compression/decompression buffer */
 	void *buffer;
@@ -21,8 +19,6 @@ struct zcomp_strm {
 	 * working memory)
 	 */
 	void *private;
-	/* used in multi stream backend, protected by backend strm_lock */
-	struct list_head list;
 };
 
 /* static compression backend */
@@ -41,13 +37,9 @@ struct zcomp_backend {
 
 /* dynamic per-device compression frontend */
 struct zcomp {
-	void *stream;
+	struct zcomp_strm * __percpu *stream;
 	struct zcomp_backend *backend;
-
-	struct zcomp_strm *(*strm_find)(struct zcomp *comp);
-	void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
-	bool (*set_max_streams)(struct zcomp *comp, int num_strm);
-	void (*destroy)(struct zcomp *comp);
+	struct notifier_block notifier;
 };
 
 ssize_t zcomp_available_show(const char *comp, char *buf);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9030992..cad1751 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -650,7 +650,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 {
 	int ret = 0;
 	size_t clen;
-	unsigned long handle;
+	unsigned long handle = 0;
 	struct page *page;
 	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
 	struct zram_meta *meta = zram->meta;
@@ -673,9 +673,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			goto out;
 	}
 
-	zstrm = zcomp_strm_find(zram->comp);
+compress_again:
 	user_mem = kmap_atomic(page);
-
 	if (is_partial_io(bvec)) {
 		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
 		       bvec->bv_len);
@@ -699,6 +698,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		goto out;
 	}
 
+	zstrm = zcomp_strm_find(zram->comp);
 	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
 	if (!is_partial_io(bvec)) {
 		kunmap_atomic(user_mem);
@@ -710,6 +710,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		pr_err("Compression failed! err=%d\n", ret);
 		goto out;
 	}
+
 	src = zstrm->buffer;
 	if (unlikely(clen > max_zpage_size)) {
 		clen = PAGE_SIZE;
@@ -717,8 +718,33 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			src = uncmem;
 	}
 
-	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
+	/*
+	 * handle allocation has 2 paths:
+	 * a) fast path is executed with preemption disabled (for
+	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+	 *  since we can't sleep;
+	 * b) slow path enables preemption and attempts to allocate
+	 *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
+	 *  put per-cpu compression stream and, thus, to re-do
+	 *  the compression once handle is allocated.
+	 *
+	 * if we have a 'non-null' handle here then we are coming
+	 * from the slow path and handle has already been allocated.
+	 */
+	if (!handle)
+		handle = zs_malloc(meta->mem_pool, clen,
+				__GFP_KSWAPD_RECLAIM |
+				__GFP_NOWARN |
+				__GFP_HIGHMEM);
 	if (!handle) {
+		zcomp_strm_release(zram->comp, zstrm);
+		zstrm = NULL;
+
+		handle = zs_malloc(meta->mem_pool, clen,
+				GFP_NOIO | __GFP_HIGHMEM);
+		if (handle)
+			goto compress_again;
+
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
 		ret = -ENOMEM;
-- 
2.8.0

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] zsmalloc: require GFP in zs_malloc()
  2016-04-28 16:17 ` [PATCH 1/2] zsmalloc: require GFP in zs_malloc() Sergey Senozhatsky
@ 2016-04-29  5:44   ` Minchan Kim
  2016-04-29  7:30     ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-04-29  5:44 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Andrew Morton, linux-kernel, Sergey Senozhatsky

On Fri, Apr 29, 2016 at 01:17:09AM +0900, Sergey Senozhatsky wrote:
> Pass GFP flags to zs_malloc() instead of using a fixed set
> (supplied during pool creation), so we can be more flexible,
> but, more importantly, this will be needed to switch zram to
> per-cpu compression streams.
> 
> Apart from that, this also aligns zs_malloc() interface with
> zspool/zbud.
> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> ---
>  drivers/block/zram/zram_drv.c |  2 +-
>  include/linux/zsmalloc.h      |  2 +-
>  mm/zsmalloc.c                 | 15 ++++++---------
>  3 files changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 370c2f7..9030992 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
>  			src = uncmem;
>  	}
>  
> -	handle = zs_malloc(meta->mem_pool, clen);
> +	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
>  	if (!handle) {
>  		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
>  			index, clen);
> diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
> index 34eb160..6d89f8b 100644
> --- a/include/linux/zsmalloc.h
> +++ b/include/linux/zsmalloc.h
> @@ -44,7 +44,7 @@ struct zs_pool;
>  struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
>  void zs_destroy_pool(struct zs_pool *pool);
>  
> -unsigned long zs_malloc(struct zs_pool *pool, size_t size);
> +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
>  void zs_free(struct zs_pool *pool, unsigned long obj);
>  
>  void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index a0890e9..2c22aff 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -247,7 +247,6 @@ struct zs_pool {
>  	struct size_class **size_class;
>  	struct kmem_cache *handle_cachep;
>  
> -	gfp_t flags;	/* allocation flags used when growing pool */
>  	atomic_long_t pages_allocated;
>  
>  	struct zs_pool_stats stats;
> @@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
>  	kmem_cache_destroy(pool->handle_cachep);
>  }
>  
> -static unsigned long alloc_handle(struct zs_pool *pool)
> +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
>  {
>  	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
> -		pool->flags & ~__GFP_HIGHMEM);
> +			gfp & ~__GFP_HIGHMEM);
>  }
>  
>  static void free_handle(struct zs_pool *pool, unsigned long handle)
> @@ -335,7 +334,7 @@ static void zs_zpool_destroy(void *pool)
>  static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
>  			unsigned long *handle)
>  {
> -	*handle = zs_malloc(pool, size);
> +	*handle = zs_malloc(pool, size, gfp);
>  	return *handle ? 0 : -1;
>  }
>  static void zs_zpool_free(void *pool, unsigned long handle)
> @@ -1391,7 +1390,7 @@ static unsigned long obj_malloc(struct size_class *class,
>   * otherwise 0.
>   * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
>   */
> -unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
>  {
>  	unsigned long handle, obj;
>  	struct size_class *class;
> @@ -1400,7 +1399,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>  	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
>  		return 0;
>  
> -	handle = alloc_handle(pool);
> +	handle = alloc_handle(pool, gfp);
>  	if (!handle)
>  		return 0;
>  
> @@ -1413,7 +1412,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>  
>  	if (!first_page) {
>  		spin_unlock(&class->lock);
> -		first_page = alloc_zspage(class, pool->flags);
> +		first_page = alloc_zspage(class, gfp);
>  		if (unlikely(!first_page)) {
>  			free_handle(pool, handle);
>  			return 0;
> @@ -1945,8 +1944,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)

So, we can remove flags parameter passing and comment about that.

Other than that,

Acked-by: Minchan Kim <minchan@kernel.org>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] zsmalloc: require GFP in zs_malloc()
  2016-04-29  5:44   ` Minchan Kim
@ 2016-04-29  7:30     ` Sergey Senozhatsky
  0 siblings, 0 replies; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-04-29  7:30 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel, Sergey Senozhatsky

On (04/29/16 14:44), Minchan Kim wrote:
[..]
> > @@ -1945,8 +1944,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
> 
> So, we can remove flags parameter passing and comment about that.

good point; gfp_t is now unneeded there. will resubmit 0001 later.

> Other than that,
> 
> Acked-by: Minchan Kim <minchan@kernel.org>

thanks.

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-04-28 16:17 ` [PATCH 2/2] zram: user per-cpu compression streams Sergey Senozhatsky
@ 2016-05-02  6:23   ` Minchan Kim
  2016-05-02  7:25     ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-02  6:23 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Andrew Morton, linux-kernel, Sergey Senozhatsky

Hello Sergey,

Sorry for the late reply.

On Fri, Apr 29, 2016 at 01:17:10AM +0900, Sergey Senozhatsky wrote:
> Remove idle streams list and keep compression streams in per-cpu
> data. This removes two contented spin_lock()/spin_unlock() calls
> from write path and also prevent write OP from being preempted
> while holding the compression stream, which can cause slow downs.
> 
> For instance, let's assume that we have N cpus and N-2
> max_comp_streams. TASK1 owns the last idle stream, TASK2-TASK3 come
> in with the write requests:
> 
>   TASK1            TASK2              TASK3
>  zram_bvec_write()
>   spin_lock
>   find stream
>   spin_unlock
> 
>   compress
> 
>   <<preempted>>   zram_bvec_write()
>                    spin_lock
>                    find stream
>                    spin_unlock
>                      no_stream
>                        schedule
>                                      zram_bvec_write()
>                                       spin_lock
>                                       find_stream
>                                       spin_unlock
>                                         no_stream
>                                           schedule
>    spin_lock
>    release stream
>    spin_unlock
>      wake up TASK2
> 
> not only TASK2 and TASK3 will not get the stream, TASK1 will be
> preempted in the middle of its operation; while we would prefer
> it to finish compression and release the stream.
> 
> Test environment: x86_64, 4 CPU box, 3G zram, lzo
> 
> The following fio tests were executed:
>       read, randread, write, randwrite, rw, randrw
> with the increasing number of jobs from 1 to 10.
> 
>                 4 streams        8 streams       per-cpu
> ===========================================================
> jobs1
> READ:           2520.1MB/s       2566.5MB/s      2491.5MB/s
> READ:           2102.7MB/s       2104.2MB/s      2091.3MB/s
> WRITE:          1355.1MB/s       1320.2MB/s      1378.9MB/s
> WRITE:          1103.5MB/s       1097.2MB/s      1122.5MB/s
> READ:           434013KB/s       435153KB/s      439961KB/s
> WRITE:          433969KB/s       435109KB/s      439917KB/s
> READ:           403166KB/s       405139KB/s      403373KB/s
> WRITE:          403223KB/s       405197KB/s      403430KB/s
> jobs2
> READ:           7958.6MB/s       8105.6MB/s      8073.7MB/s
> READ:           6864.9MB/s       6989.8MB/s      7021.8MB/s
> WRITE:          2438.1MB/s       2346.9MB/s      3400.2MB/s
> WRITE:          1994.2MB/s       1990.3MB/s      2941.2MB/s
> READ:           981504KB/s       973906KB/s      1018.8MB/s
> WRITE:          981659KB/s       974060KB/s      1018.1MB/s
> READ:           937021KB/s       938976KB/s      987250KB/s
> WRITE:          934878KB/s       936830KB/s      984993KB/s
> jobs3
> READ:           13280MB/s        13553MB/s       13553MB/s
> READ:           11534MB/s        11785MB/s       11755MB/s
> WRITE:          3456.9MB/s       3469.9MB/s      4810.3MB/s
> WRITE:          3029.6MB/s       3031.6MB/s      4264.8MB/s
> READ:           1363.8MB/s       1362.6MB/s      1448.9MB/s
> WRITE:          1361.9MB/s       1360.7MB/s      1446.9MB/s
> READ:           1309.4MB/s       1310.6MB/s      1397.5MB/s
> WRITE:          1307.4MB/s       1308.5MB/s      1395.3MB/s
> jobs4
> READ:           20244MB/s        20177MB/s       20344MB/s
> READ:           17886MB/s        17913MB/s       17835MB/s
> WRITE:          4071.6MB/s       4046.1MB/s      6370.2MB/s
> WRITE:          3608.9MB/s       3576.3MB/s      5785.4MB/s
> READ:           1824.3MB/s       1821.6MB/s      1997.5MB/s
> WRITE:          1819.8MB/s       1817.4MB/s      1992.5MB/s
> READ:           1765.7MB/s       1768.3MB/s      1937.3MB/s
> WRITE:          1767.5MB/s       1769.1MB/s      1939.2MB/s
> jobs5
> READ:           18663MB/s        18986MB/s       18823MB/s
> READ:           16659MB/s        16605MB/s       16954MB/s
> WRITE:          3912.4MB/s       3888.7MB/s      6126.9MB/s
> WRITE:          3506.4MB/s       3442.5MB/s      5519.3MB/s
> READ:           1798.2MB/s       1746.5MB/s      1935.8MB/s
> WRITE:          1792.7MB/s       1740.7MB/s      1929.1MB/s
> READ:           1727.6MB/s       1658.2MB/s      1917.3MB/s
> WRITE:          1726.5MB/s       1657.2MB/s      1916.6MB/s
> jobs6
> READ:           21017MB/s        20922MB/s       21162MB/s
> READ:           19022MB/s        19140MB/s       18770MB/s
> WRITE:          3968.2MB/s       4037.7MB/s      6620.8MB/s
> WRITE:          3643.5MB/s       3590.2MB/s      6027.5MB/s
> READ:           1871.8MB/s       1880.5MB/s      2049.9MB/s
> WRITE:          1867.8MB/s       1877.2MB/s      2046.2MB/s
> READ:           1755.8MB/s       1710.3MB/s      1964.7MB/s
> WRITE:          1750.5MB/s       1705.9MB/s      1958.8MB/s
> jobs7
> READ:           21103MB/s        20677MB/s       21482MB/s
> READ:           18522MB/s        18379MB/s       19443MB/s
> WRITE:          4022.5MB/s       4067.4MB/s      6755.9MB/s
> WRITE:          3691.7MB/s       3695.5MB/s      5925.6MB/s
> READ:           1841.5MB/s       1933.9MB/s      2090.5MB/s
> WRITE:          1842.7MB/s       1935.3MB/s      2091.9MB/s
> READ:           1832.4MB/s       1856.4MB/s      1971.5MB/s
> WRITE:          1822.3MB/s       1846.2MB/s      1960.6MB/s
> jobs8
> READ:           20463MB/s        20194MB/s       20862MB/s
> READ:           18178MB/s        17978MB/s       18299MB/s
> WRITE:          4085.9MB/s       4060.2MB/s      7023.8MB/s
> WRITE:          3776.3MB/s       3737.9MB/s      6278.2MB/s
> READ:           1957.6MB/s       1944.4MB/s      2109.5MB/s
> WRITE:          1959.2MB/s       1946.2MB/s      2111.4MB/s
> READ:           1900.6MB/s       1885.7MB/s      2082.1MB/s
> WRITE:          1896.2MB/s       1881.4MB/s      2078.3MB/s
> jobs9
> READ:           19692MB/s        19734MB/s       19334MB/s
> READ:           17678MB/s        18249MB/s       17666MB/s
> WRITE:          4004.7MB/s       4064.8MB/s      6990.7MB/s
> WRITE:          3724.7MB/s       3772.1MB/s      6193.6MB/s
> READ:           1953.7MB/s       1967.3MB/s      2105.6MB/s
> WRITE:          1953.4MB/s       1966.7MB/s      2104.1MB/s
> READ:           1860.4MB/s       1897.4MB/s      2068.5MB/s
> WRITE:          1858.9MB/s       1895.9MB/s      2066.8MB/s
> jobs10
> READ:           19730MB/s        19579MB/s       19492MB/s
> READ:           18028MB/s        18018MB/s       18221MB/s
> WRITE:          4027.3MB/s       4090.6MB/s      7020.1MB/s
> WRITE:          3810.5MB/s       3846.8MB/s      6426.8MB/s
> READ:           1956.1MB/s       1994.6MB/s      2145.2MB/s
> WRITE:          1955.9MB/s       1993.5MB/s      2144.8MB/s
> READ:           1852.8MB/s       1911.6MB/s      2075.8MB/s
> WRITE:          1855.7MB/s       1914.6MB/s      2078.1MB/s
> 
> perf stat
> 
>                                 4 streams                       8 streams                       per-cpu
> ====================================================================================================================
> jobs1
> stalled-cycles-frontend      23,174,811,209 (  38.21%)     23,220,254,188 (  38.25%)       23,061,406,918 (  38.34%)
> stalled-cycles-backend       11,514,174,638 (  18.98%)     11,696,722,657 (  19.27%)       11,370,852,810 (  18.90%)
> instructions                 73,925,005,782 (    1.22)     73,903,177,632 (    1.22)       73,507,201,037 (    1.22)
> branches                     14,455,124,835 ( 756.063)     14,455,184,779 ( 755.281)       14,378,599,509 ( 758.546)
> branch-misses                    69,801,336 (   0.48%)         80,225,529 (   0.55%)           72,044,726 (   0.50%)
> jobs2
> stalled-cycles-frontend      49,912,741,782 (  46.11%)     50,101,189,290 (  45.95%)       32,874,195,633 (  35.11%)
> stalled-cycles-backend       27,080,366,230 (  25.02%)     27,949,970,232 (  25.63%)       16,461,222,706 (  17.58%)
> instructions                122,831,629,690 (    1.13)    122,919,846,419 (    1.13)      121,924,786,775 (    1.30)
> branches                     23,725,889,239 ( 692.663)     23,733,547,140 ( 688.062)       23,553,950,311 ( 794.794)
> branch-misses                    90,733,041 (   0.38%)         96,320,895 (   0.41%)           84,561,092 (   0.36%)
> jobs3
> stalled-cycles-frontend      66,437,834,608 (  45.58%)     63,534,923,344 (  43.69%)       42,101,478,505 (  33.19%)
> stalled-cycles-backend       34,940,799,661 (  23.97%)     34,774,043,148 (  23.91%)       21,163,324,388 (  16.68%)
> instructions                171,692,121,862 (    1.18)    171,775,373,044 (    1.18)      170,353,542,261 (    1.34)
> branches                     32,968,962,622 ( 628.723)     32,987,739,894 ( 630.512)       32,729,463,918 ( 717.027)
> branch-misses                   111,522,732 (   0.34%)        110,472,894 (   0.33%)           99,791,291 (   0.30%)
> jobs4
> stalled-cycles-frontend      98,741,701,675 (  49.72%)     94,797,349,965 (  47.59%)       54,535,655,381 (  33.53%)
> stalled-cycles-backend       54,642,609,615 (  27.51%)     55,233,554,408 (  27.73%)       27,882,323,541 (  17.14%)
> instructions                220,884,807,851 (    1.11)    220,930,887,273 (    1.11)      218,926,845,851 (    1.35)
> branches                     42,354,518,180 ( 592.105)     42,362,770,587 ( 590.452)       41,955,552,870 ( 716.154)
> branch-misses                   138,093,449 (   0.33%)        131,295,286 (   0.31%)          121,794,771 (   0.29%)
> jobs5
> stalled-cycles-frontend     116,219,747,212 (  48.14%)    110,310,397,012 (  46.29%)       66,373,082,723 (  33.70%)
> stalled-cycles-backend       66,325,434,776 (  27.48%)     64,157,087,914 (  26.92%)       32,999,097,299 (  16.76%)
> instructions                270,615,008,466 (    1.12)    270,546,409,525 (    1.14)      268,439,910,948 (    1.36)
> branches                     51,834,046,557 ( 599.108)     51,811,867,722 ( 608.883)       51,412,576,077 ( 729.213)
> branch-misses                   158,197,086 (   0.31%)        142,639,805 (   0.28%)          133,425,455 (   0.26%)
> jobs6
> stalled-cycles-frontend     138,009,414,492 (  48.23%)    139,063,571,254 (  48.80%)       75,278,568,278 (  32.80%)
> stalled-cycles-backend       79,211,949,650 (  27.68%)     79,077,241,028 (  27.75%)       37,735,797,899 (  16.44%)
> instructions                319,763,993,731 (    1.12)    319,937,782,834 (    1.12)      316,663,600,784 (    1.38)
> branches                     61,219,433,294 ( 595.056)     61,250,355,540 ( 598.215)       60,523,446,617 ( 733.706)
> branch-misses                   169,257,123 (   0.28%)        154,898,028 (   0.25%)          141,180,587 (   0.23%)
> jobs7
> stalled-cycles-frontend     162,974,812,119 (  49.20%)    159,290,061,987 (  48.43%)       88,046,641,169 (  33.21%)
> stalled-cycles-backend       92,223,151,661 (  27.84%)     91,667,904,406 (  27.87%)       44,068,454,971 (  16.62%)
> instructions                369,516,432,430 (    1.12)    369,361,799,063 (    1.12)      365,290,380,661 (    1.38)
> branches                     70,795,673,950 ( 594.220)     70,743,136,124 ( 597.876)       69,803,996,038 ( 732.822)
> branch-misses                   181,708,327 (   0.26%)        165,767,821 (   0.23%)          150,109,797 (   0.22%)
> jobs8
> stalled-cycles-frontend     185,000,017,027 (  49.30%)    182,334,345,473 (  48.37%)       99,980,147,041 (  33.26%)
> stalled-cycles-backend      105,753,516,186 (  28.18%)    107,937,830,322 (  28.63%)       51,404,177,181 (  17.10%)
> instructions                418,153,161,055 (    1.11)    418,308,565,828 (    1.11)      413,653,475,581 (    1.38)
> branches                     80,035,882,398 ( 592.296)     80,063,204,510 ( 589.843)       79,024,105,589 ( 730.530)
> branch-misses                   199,764,528 (   0.25%)        177,936,926 (   0.22%)          160,525,449 (   0.20%)
> jobs9
> stalled-cycles-frontend     210,941,799,094 (  49.63%)    204,714,679,254 (  48.55%)      114,251,113,756 (  33.96%)
> stalled-cycles-backend      122,640,849,067 (  28.85%)    122,188,553,256 (  28.98%)       58,360,041,127 (  17.35%)
> instructions                468,151,025,415 (    1.10)    467,354,869,323 (    1.11)      462,665,165,216 (    1.38)
> branches                     89,657,067,510 ( 585.628)     89,411,550,407 ( 588.990)       88,360,523,943 ( 730.151)
> branch-misses                   218,292,301 (   0.24%)        191,701,247 (   0.21%)          178,535,678 (   0.20%)
> jobs10
> stalled-cycles-frontend     233,595,958,008 (  49.81%)    227,540,615,689 (  49.11%)      160,341,979,938 (  43.07%)
> stalled-cycles-backend      136,153,676,021 (  29.03%)    133,635,240,742 (  28.84%)       65,909,135,465 (  17.70%)
> instructions                517,001,168,497 (    1.10)    516,210,976,158 (    1.11)      511,374,038,613 (    1.37)
> branches                     98,911,641,329 ( 585.796)     98,700,069,712 ( 591.583)       97,646,761,028 ( 728.712)
> branch-misses                   232,341,823 (   0.23%)        199,256,308 (   0.20%)          183,135,268 (   0.19%)
> 
> per-cpu streams tend to cause significantly less stalled cycles;
> execute less branches and hit less branch-misses.
> 
> perf stat reported execution time
> 
>                         4 streams        8 streams       per-cpu
> ====================================================================
> jobs1
> seconds elapsed        20.909073870     20.875670495    20.817838540
> jobs2
> seconds elapsed        18.529488399     18.720566469    16.356103108
> jobs3
> seconds elapsed        18.991159531     18.991340812    16.766216066
> jobs4
> seconds elapsed        19.560643828     19.551323547    16.246621715
> jobs5
> seconds elapsed        24.746498464     25.221646740    20.696112444
> jobs6
> seconds elapsed        28.258181828     28.289765505    22.885688857
> jobs7
> seconds elapsed        32.632490241     31.909125381    26.272753738
> jobs8
> seconds elapsed        35.651403851     36.027596308    29.108024711
> jobs9
> seconds elapsed        40.569362365     40.024227989    32.898204012
> jobs10
> seconds elapsed        44.673112304     43.874898137    35.632952191
> 
> Please see
>   Link: http://marc.info/?l=linux-kernel&m=146166970727530
>   Link: http://marc.info/?l=linux-kernel&m=146174716719650
> for more test results (under low memory conditions).

Thanks for your great test.
Today, I tried creating heavy memory pressure to trigger double compression,
but it was not easy, so I guess it's really hard to hit in real practice.
I hope it doesn't cause any regression. ;-)

> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> Suggested-by: Minchan Kim <minchan@kernel.org>
> ---
>  drivers/block/zram/zcomp.c    | 293 ++++++++++++------------------------------
>  drivers/block/zram/zcomp.h    |  12 +-
>  drivers/block/zram/zram_drv.c |  34 ++++-
>  3 files changed, 112 insertions(+), 227 deletions(-)
> 
> diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
> index 3ef42e5..d4159e4 100644
> --- a/drivers/block/zram/zcomp.c
> +++ b/drivers/block/zram/zcomp.c
> @@ -13,6 +13,7 @@
>  #include <linux/slab.h>
>  #include <linux/wait.h>
>  #include <linux/sched.h>
> +#include <linux/cpu.h>
>  
>  #include "zcomp.h"
>  #include "zcomp_lzo.h"
> @@ -20,29 +21,6 @@
>  #include "zcomp_lz4.h"
>  #endif
>  
> -/*
> - * single zcomp_strm backend
> - */
> -struct zcomp_strm_single {
> -	struct mutex strm_lock;
> -	struct zcomp_strm *zstrm;
> -};
> -
> -/*
> - * multi zcomp_strm backend
> - */
> -struct zcomp_strm_multi {
> -	/* protect strm list */
> -	spinlock_t strm_lock;
> -	/* max possible number of zstrm streams */
> -	int max_strm;
> -	/* number of available zstrm streams */
> -	int avail_strm;
> -	/* list of available strms */
> -	struct list_head idle_strm;
> -	wait_queue_head_t strm_wait;
> -};
> -
>  static struct zcomp_backend *backends[] = {
>  	&zcomp_lzo,
>  #ifdef CONFIG_ZRAM_LZ4_COMPRESS
> @@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
>  	return zstrm;
>  }
>  
> -/*
> - * get idle zcomp_strm or wait until other process release
> - * (zcomp_strm_release()) one for us
> - */
> -static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
> -{
> -	struct zcomp_strm_multi *zs = comp->stream;
> -	struct zcomp_strm *zstrm;
> -
> -	while (1) {
> -		spin_lock(&zs->strm_lock);
> -		if (!list_empty(&zs->idle_strm)) {
> -			zstrm = list_entry(zs->idle_strm.next,
> -					struct zcomp_strm, list);
> -			list_del(&zstrm->list);
> -			spin_unlock(&zs->strm_lock);
> -			return zstrm;
> -		}
> -		/* zstrm streams limit reached, wait for idle stream */
> -		if (zs->avail_strm >= zs->max_strm) {
> -			spin_unlock(&zs->strm_lock);
> -			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
> -			continue;
> -		}
> -		/* allocate new zstrm stream */
> -		zs->avail_strm++;
> -		spin_unlock(&zs->strm_lock);
> -		/*
> -		 * This function can be called in swapout/fs write path
> -		 * so we can't use GFP_FS|IO. And it assumes we already
> -		 * have at least one stream in zram initialization so we
> -		 * don't do best effort to allocate more stream in here.
> -		 * A default stream will work well without further multiple
> -		 * streams. That's why we use NORETRY | NOWARN.
> -		 */
> -		zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY |
> -					__GFP_NOWARN);
> -		if (!zstrm) {
> -			spin_lock(&zs->strm_lock);
> -			zs->avail_strm--;
> -			spin_unlock(&zs->strm_lock);
> -			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
> -			continue;
> -		}
> -		break;
> -	}
> -	return zstrm;
> -}
> -
> -/* add stream back to idle list and wake up waiter or free the stream */
> -static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
> -{
> -	struct zcomp_strm_multi *zs = comp->stream;
> -
> -	spin_lock(&zs->strm_lock);
> -	if (zs->avail_strm <= zs->max_strm) {
> -		list_add(&zstrm->list, &zs->idle_strm);
> -		spin_unlock(&zs->strm_lock);
> -		wake_up(&zs->strm_wait);
> -		return;
> -	}
> -
> -	zs->avail_strm--;
> -	spin_unlock(&zs->strm_lock);
> -	zcomp_strm_free(comp, zstrm);
> -}
> -
> -/* change max_strm limit */
> -static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
> -{
> -	struct zcomp_strm_multi *zs = comp->stream;
> -	struct zcomp_strm *zstrm;
> -
> -	spin_lock(&zs->strm_lock);
> -	zs->max_strm = num_strm;
> -	/*
> -	 * if user has lowered the limit and there are idle streams,
> -	 * immediately free as much streams (and memory) as we can.
> -	 */
> -	while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
> -		zstrm = list_entry(zs->idle_strm.next,
> -				struct zcomp_strm, list);
> -		list_del(&zstrm->list);
> -		zcomp_strm_free(comp, zstrm);
> -		zs->avail_strm--;
> -	}
> -	spin_unlock(&zs->strm_lock);
> -	return true;
> -}
> -
> -static void zcomp_strm_multi_destroy(struct zcomp *comp)
> -{
> -	struct zcomp_strm_multi *zs = comp->stream;
> -	struct zcomp_strm *zstrm;
> -
> -	while (!list_empty(&zs->idle_strm)) {
> -		zstrm = list_entry(zs->idle_strm.next,
> -				struct zcomp_strm, list);
> -		list_del(&zstrm->list);
> -		zcomp_strm_free(comp, zstrm);
> -	}
> -	kfree(zs);
> -}
> -
> -static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
> -{
> -	struct zcomp_strm *zstrm;
> -	struct zcomp_strm_multi *zs;
> -
> -	comp->destroy = zcomp_strm_multi_destroy;
> -	comp->strm_find = zcomp_strm_multi_find;
> -	comp->strm_release = zcomp_strm_multi_release;
> -	comp->set_max_streams = zcomp_strm_multi_set_max_streams;
> -	zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
> -	if (!zs)
> -		return -ENOMEM;
> -
> -	comp->stream = zs;
> -	spin_lock_init(&zs->strm_lock);
> -	INIT_LIST_HEAD(&zs->idle_strm);
> -	init_waitqueue_head(&zs->strm_wait);
> -	zs->max_strm = max_strm;
> -	zs->avail_strm = 1;
> -
> -	zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
> -	if (!zstrm) {
> -		kfree(zs);
> -		return -ENOMEM;
> -	}
> -	list_add(&zstrm->list, &zs->idle_strm);
> -	return 0;
> -}
> -
> -static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
> -{
> -	struct zcomp_strm_single *zs = comp->stream;
> -	mutex_lock(&zs->strm_lock);
> -	return zs->zstrm;
> -}
> -
> -static void zcomp_strm_single_release(struct zcomp *comp,
> -		struct zcomp_strm *zstrm)
> -{
> -	struct zcomp_strm_single *zs = comp->stream;
> -	mutex_unlock(&zs->strm_lock);
> -}
> -
> -static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
> -{
> -	/* zcomp_strm_single support only max_comp_streams == 1 */
> -	return false;
> -}
> -
> -static void zcomp_strm_single_destroy(struct zcomp *comp)
> -{
> -	struct zcomp_strm_single *zs = comp->stream;
> -	zcomp_strm_free(comp, zs->zstrm);
> -	kfree(zs);
> -}
> -
> -static int zcomp_strm_single_create(struct zcomp *comp)
> -{
> -	struct zcomp_strm_single *zs;
> -
> -	comp->destroy = zcomp_strm_single_destroy;
> -	comp->strm_find = zcomp_strm_single_find;
> -	comp->strm_release = zcomp_strm_single_release;
> -	comp->set_max_streams = zcomp_strm_single_set_max_streams;
> -	zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
> -	if (!zs)
> -		return -ENOMEM;
> -
> -	comp->stream = zs;
> -	mutex_init(&zs->strm_lock);
> -	zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
> -	if (!zs->zstrm) {
> -		kfree(zs);
> -		return -ENOMEM;
> -	}
> -	return 0;
> -}
> -
>  /* show available compressors */
>  ssize_t zcomp_available_show(const char *comp, char *buf)
>  {
> @@ -301,17 +97,17 @@ bool zcomp_available_algorithm(const char *comp)
>  
>  bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
>  {
> -	return comp->set_max_streams(comp, num_strm);
> +	return true;
>  }

I like this part. We don't need to remove the set_max_streams interface
right now, because per-cpu streams might cause a regression if the double
compression ratio is high. If so, we should roll back, so let's wait for
two years and, if nothing goes wrong, then we could deprecate the
multi-stream interfaces. I hope you are on the same page.

>  
>  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
>  {
> -	return comp->strm_find(comp);
> +	return *get_cpu_ptr(comp->stream);
>  }
>  
>  void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
>  {
> -	comp->strm_release(comp, zstrm);
> +	put_cpu_ptr(comp->stream);
>  }
>  
>  int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
> @@ -327,9 +123,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
>  	return comp->backend->decompress(src, src_len, dst);
>  }
>  
> +static int __zcomp_cpu_notifier(struct zcomp *comp,
> +		unsigned long action, unsigned long cpu)
> +{
> +	struct zcomp_strm *zstrm;
> +
> +	switch (action) {
> +	case CPU_UP_PREPARE:
> +		if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
> +			break;
> +		zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
> +		if (IS_ERR_OR_NULL(zstrm)) {
> +			pr_err("Can't allocate a compression stream\n");
> +			return NOTIFY_BAD;
> +		}
> +		*per_cpu_ptr(comp->stream, cpu) = zstrm;
> +		break;
> +	case CPU_DEAD:
> +	case CPU_UP_CANCELED:
> +		zstrm = *per_cpu_ptr(comp->stream, cpu);
> +		if (!IS_ERR_OR_NULL(zstrm))
> +			zcomp_strm_free(comp, zstrm);
> +		*per_cpu_ptr(comp->stream, cpu) = NULL;
> +		break;
> +	default:
> +		break;
> +	}
> +	return NOTIFY_OK;
> +}
> +
> +static int zcomp_cpu_notifier(struct notifier_block *nb,
> +		unsigned long action, void *pcpu)
> +{
> +	unsigned long cpu = (unsigned long)pcpu;
> +	struct zcomp *comp = container_of(nb, typeof(*comp), notifier);
> +
> +	return __zcomp_cpu_notifier(comp, action, cpu);
> +}
> +
> +static int zcomp_init(struct zcomp *comp)
> +{
> +	unsigned long cpu;
> +	int ret;
> +
> +	comp->notifier.notifier_call = zcomp_cpu_notifier;
> +
> +	comp->stream = alloc_percpu(struct zcomp_strm *);
> +	if (!comp->stream)
> +		return -ENOMEM;
> +
> +	cpu_notifier_register_begin();
> +	for_each_online_cpu(cpu) {
> +		ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu);
> +		if (ret == NOTIFY_BAD)
> +			goto cleanup;
> +	}
> +	__register_cpu_notifier(&comp->notifier);
> +	cpu_notifier_register_done();
> +	return 0;
> +
> +cleanup:
> +	for_each_online_cpu(cpu)
> +		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
> +	cpu_notifier_register_done();
> +	return -ENOMEM;
> +}
> +
>  void zcomp_destroy(struct zcomp *comp)
>  {
> -	comp->destroy(comp);
> +	unsigned long cpu;
> +
> +	cpu_notifier_register_begin();
> +	for_each_online_cpu(cpu)
> +		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
> +	__unregister_cpu_notifier(&comp->notifier);
> +	cpu_notifier_register_done();
> +
> +	free_percpu(comp->stream);
>  	kfree(comp);
>  }
>  
> @@ -356,10 +226,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)

Trivial:
We could remove max_strm now and change description.


>  		return ERR_PTR(-ENOMEM);
>  
>  	comp->backend = backend;
> -	if (max_strm > 1)
> -		error = zcomp_strm_multi_create(comp, max_strm);
> -	else
> -		error = zcomp_strm_single_create(comp);
> +	error = zcomp_init(comp);
>  	if (error) {
>  		kfree(comp);
>  		return ERR_PTR(error);
> diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
> index b7d2a4b..aba8c21 100644
> --- a/drivers/block/zram/zcomp.h
> +++ b/drivers/block/zram/zcomp.h
> @@ -10,8 +10,6 @@
>  #ifndef _ZCOMP_H_
>  #define _ZCOMP_H_
>  
> -#include <linux/mutex.h>
> -
>  struct zcomp_strm {
>  	/* compression/decompression buffer */
>  	void *buffer;
> @@ -21,8 +19,6 @@ struct zcomp_strm {
>  	 * working memory)
>  	 */
>  	void *private;
> -	/* used in multi stream backend, protected by backend strm_lock */
> -	struct list_head list;
>  };
>  
>  /* static compression backend */
> @@ -41,13 +37,9 @@ struct zcomp_backend {
>  
>  /* dynamic per-device compression frontend */
>  struct zcomp {
> -	void *stream;
> +	struct zcomp_strm * __percpu *stream;
>  	struct zcomp_backend *backend;
> -
> -	struct zcomp_strm *(*strm_find)(struct zcomp *comp);
> -	void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
> -	bool (*set_max_streams)(struct zcomp *comp, int num_strm);
> -	void (*destroy)(struct zcomp *comp);
> +	struct notifier_block notifier;
>  };
>  
>  ssize_t zcomp_available_show(const char *comp, char *buf);
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 9030992..cad1751 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -650,7 +650,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
>  {
>  	int ret = 0;
>  	size_t clen;
> -	unsigned long handle;
> +	unsigned long handle = 0;
>  	struct page *page;
>  	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
>  	struct zram_meta *meta = zram->meta;
> @@ -673,9 +673,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
>  			goto out;
>  	}
>  
> -	zstrm = zcomp_strm_find(zram->comp);
> +compress_again:
>  	user_mem = kmap_atomic(page);
> -
>  	if (is_partial_io(bvec)) {
>  		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
>  		       bvec->bv_len);
> @@ -699,6 +698,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
>  		goto out;
>  	}
>  
> +	zstrm = zcomp_strm_find(zram->comp);
>  	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
>  	if (!is_partial_io(bvec)) {
>  		kunmap_atomic(user_mem);
> @@ -710,6 +710,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
>  		pr_err("Compression failed! err=%d\n", ret);
>  		goto out;
>  	}
> +
>  	src = zstrm->buffer;
>  	if (unlikely(clen > max_zpage_size)) {
>  		clen = PAGE_SIZE;
> @@ -717,8 +718,33 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
>  			src = uncmem;
>  	}
>  
> -	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
> +	/*
> +	 * handle allocation has 2 paths:
> +	 * a) fast path is executed with preemption disabled (for
> +	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
> +	 *  since we can't sleep;
> +	 * b) slow path enables preemption and attempts to allocate
> +	 *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
> +	 *  put per-cpu compression stream and, thus, to re-do
> +	 *  the compression once handle is allocated.
> +	 *
> +	 * if we have a 'non-null' handle here then we are coming
> +	 * from the slow path and handle has already been allocated.
> +	 */
> +	if (!handle)
> +		handle = zs_malloc(meta->mem_pool, clen,
> +				__GFP_KSWAPD_RECLAIM |
> +				__GFP_NOWARN |
> +				__GFP_HIGHMEM);
>  	if (!handle) {
> +		zcomp_strm_release(zram->comp, zstrm);
> +		zstrm = NULL;
> +
> +		handle = zs_malloc(meta->mem_pool, clen,
> +				GFP_NOIO | __GFP_HIGHMEM);
> +		if (handle)
> +			goto compress_again;

We can avoid page_zero_filled in the second trial, but I am not sure it is
worth doing, because in my experiment it was not easy to trigger double
compression. When I did, the ratio was really small compared to total_write,
so it doesn't seem worth adding a new branch to detect it in the normal path.

Actually, I asked you to add a double compression stat. It seems you
forgot, but that's okay. Since I want to change the stat code and contents
anyway, I will send a patch after I send the stat rework. :)

Thanks for the great work, Sergey!

Acked-by: Minchan Kim <minchan@kernel.org>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-02  6:23   ` Minchan Kim
@ 2016-05-02  7:25     ` Sergey Senozhatsky
  2016-05-02  8:06       ` Sergey Senozhatsky
  2016-05-02  8:28       ` Minchan Kim
  0 siblings, 2 replies; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-02  7:25 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel, Sergey Senozhatsky

Hello Minchan,

On (05/02/16 15:23), Minchan Kim wrote:
[..]
> Thanks for the your great test.
> Today, I tried making heavy memory pressure to make dobule compression
> but it was not easy so I guess it's really hard to get in real practice
> I hope it doesn't make any regression. ;-)

:) it is quite 'hard', indeed. huge 're-compression' numbers usually
come together with OOM-kill and OOM-panic. per my 'testing experience'.

I'm thinking about making that test a 'general zram' test and posting
it somewhere on github/etc., so it'll be easier to measure and
compare things.

> >  bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
> >  {
> > -	return comp->set_max_streams(comp, num_strm);
> > +	return true;
> >  }
> 
> I like this part. We don't need to remove set_max_stream interface
> right now. Because percpu might make regression if double comp raio
> is high. If so, we should roll back so let's wait for two year and
> if anyting is not wrong, then we could deprecate multi stream
> interfaces. I hope you are on same page.

sure. I did this intentionally, because I really don't want to create a
mess of attrs that are about to retire. we have N attrs that will disappear
or see a downcast (and we warn users in kernel log) next summer (4.10 or
4.11, IIRC). if we add max_comp_stream to the list now then we either should
abandon our regular "2 years of retirement warn" rule and drop max_comp_stream
only after 1 year (because it has 1 year of overlap with already scheduled
for removal attrs) or complicate things for zram users: attrs will be dropped
in different kernel releases and users are advised to keep that in mind. I
think it's fine to have it as a NOOP attr as of now and deprecate it during
the next 'attr cleanup' 2 years cycle. yes, we are on the same page ;)

> > @@ -356,10 +226,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)
> 
> Trivial:
> We could remove max_strm now and change description.

oh, yes.

> > +		zcomp_strm_release(zram->comp, zstrm);
> > +		zstrm = NULL;
> > +
> > +		handle = zs_malloc(meta->mem_pool, clen,
> > +				GFP_NOIO | __GFP_HIGHMEM);
> > +		if (handle)
> > +			goto compress_again;
> 
> We can avoid page_zero_filled in second trial but not sure it is worth
> to do because in my experiment, not easy to make double compression.
> If I did, the ratio is really small compared total_write so it doesn't
> need to add new branch to detect it in normal path.
>
> Acutally, I asked adding dobule compression stat to you. It seems you
> forgot but okay. Because I want to change stat code and contents so
> I will send a patch after I send rework about stat. :)

aha... I misunderstood you. I thought you talked about the test results
in particular, not about zram stats in general. hm, given that we don't
close the door for 'idle compression streams list' yet, and may revert
per-cpu streams, may be we can introduce this stat in 4.8? otherwise, in
the worst case, we will have to trim a user visible stat file once again
IF we, for some reason, will decide to return idle list back (hopefully
we will not). does it sound OK to you to not touch the stat file now?

> Thanks for the great work, Sergey!

thanks!

> Acked-by: Minchan Kim <minchan@kernel.org>

thanks!

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-02  7:25     ` Sergey Senozhatsky
@ 2016-05-02  8:06       ` Sergey Senozhatsky
  2016-05-03  5:23         ` Minchan Kim
  2016-05-02  8:28       ` Minchan Kim
  1 sibling, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-02  8:06 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel, Sergey Senozhatsky

On (05/02/16 16:25), Sergey Senozhatsky wrote:
[..]
> > Trivial:
> > We could remove max_strm now and change description.
> 
> oh, yes.

how about something like this? remove max_comp_streams entirely, but
leave the attr. if we keep zram->max_comp_streams and return its value
(set by user space) from the show() handler, we are technically lying;
because the actual number of streams is now num_online_cpus().


===8<===8<===

From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Subject: [PATCH] zram: remove max_comp_streams internals

Remove the internal part of max_comp_streams interface, since we
switched to per-cpu streams. We will keep RW max_comp_streams attr
around, because:

a) we may (silently) switch back to idle compression streams list
   and don't want to disturb user space
b) max_comp_streams attr must wait for the next 'lay off cycle';
   we give user space 2 years to adjust before we remove/downgrade
   the attr, and there are already several attrs scheduled for
   removal in 4.11, so it's too late for max_comp_streams.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
---
 drivers/block/zram/zcomp.c    |  7 +------
 drivers/block/zram/zcomp.h    |  2 +-
 drivers/block/zram/zram_drv.c | 47 +++++++++++--------------------------------
 drivers/block/zram/zram_drv.h |  1 -
 4 files changed, 14 insertions(+), 43 deletions(-)

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index d4159e4..d4de9cb 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -95,11 +95,6 @@ bool zcomp_available_algorithm(const char *comp)
 	return find_backend(comp) != NULL;
 }
 
-bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
-{
-	return true;
-}
-
 struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
 {
 	return *get_cpu_ptr(comp->stream);
@@ -211,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
  * case of allocation error, or any other error potentially
  * returned by functions zcomp_strm_{multi,single}_create.
  */
-struct zcomp *zcomp_create(const char *compress, int max_strm)
+struct zcomp *zcomp_create(const char *compress)
 {
 	struct zcomp *comp;
 	struct zcomp_backend *backend;
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index aba8c21..ffd88cb 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -45,7 +45,7 @@ struct zcomp {
 ssize_t zcomp_available_show(const char *comp, char *buf);
 bool zcomp_available_algorithm(const char *comp);
 
-struct zcomp *zcomp_create(const char *comp, int max_strm);
+struct zcomp *zcomp_create(const char *comp);
 void zcomp_destroy(struct zcomp *comp);
 
 struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index cad1751..817e511 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev,
 	return len;
 }
 
+/*
+ * We switched to per-cpu streams and this attr is not needed anymore.
+ * However, we will keep it around for some time, because:
+ * a) we may revert per-cpu streams in the future
+ * b) it's visible to user space and we need to follow our 2 years
+ *    retirement rule; but we already have a number of 'soon to be
+ *    altered' attrs, so max_comp_streams need to wait for the next
+ *    layoff cycle.
+ */
 static ssize_t max_comp_streams_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	int val;
-	struct zram *zram = dev_to_zram(dev);
-
-	down_read(&zram->init_lock);
-	val = zram->max_comp_streams;
-	up_read(&zram->init_lock);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+	return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
 }
 
 static ssize_t max_comp_streams_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	int num;
-	struct zram *zram = dev_to_zram(dev);
-	int ret;
-
-	ret = kstrtoint(buf, 0, &num);
-	if (ret < 0)
-		return ret;
-	if (num < 1)
-		return -EINVAL;
-
-	down_write(&zram->init_lock);
-	if (init_done(zram)) {
-		if (!zcomp_set_max_streams(zram->comp, num)) {
-			pr_info("Cannot change max compression streams\n");
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	zram->max_comp_streams = num;
-	ret = len;
-out:
-	up_write(&zram->init_lock);
-	return ret;
+	return len;
 }
 
 static ssize_t comp_algorithm_show(struct device *dev,
@@ -1035,7 +1014,6 @@ static void zram_reset_device(struct zram *zram)
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
 	zram->disksize = 0;
-	zram->max_comp_streams = 1;
 
 	set_capacity(zram->disk, 0);
 	part_stat_set_all(&zram->disk->part0, 0);
@@ -1064,7 +1042,7 @@ static ssize_t disksize_store(struct device *dev,
 	if (!meta)
 		return -ENOMEM;
 
-	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
+	comp = zcomp_create(zram->compressor);
 	if (IS_ERR(comp)) {
 		pr_err("Cannot initialise %s compressing backend\n",
 				zram->compressor);
@@ -1299,7 +1277,6 @@ static int zram_add(void)
 	}
 	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
 	zram->meta = NULL;
-	zram->max_comp_streams = 1;
 
 	pr_info("Added device: %s\n", zram->disk->disk_name);
 	return device_id;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 8e92339..06b1636 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -102,7 +102,6 @@ struct zram {
 	 * the number of pages zram can consume for storing compressed data
 	 */
 	unsigned long limit_pages;
-	int max_comp_streams;
 
 	struct zram_stats stats;
 	atomic_t refcount; /* refcount for zram_meta */
-- 
2.8.2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-02  7:25     ` Sergey Senozhatsky
  2016-05-02  8:06       ` Sergey Senozhatsky
@ 2016-05-02  8:28       ` Minchan Kim
  2016-05-02  9:21         ` Sergey Senozhatsky
  1 sibling, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-02  8:28 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel

On Mon, May 02, 2016 at 04:25:08PM +0900, Sergey Senozhatsky wrote:
> Hello Minchan,
> 
> On (05/02/16 15:23), Minchan Kim wrote:
> [..]
> > Thanks for the your great test.
> > Today, I tried making heavy memory pressure to make dobule compression
> > but it was not easy so I guess it's really hard to get in real practice
> > I hope it doesn't make any regression. ;-)
> 
> :) it is quite 'hard', indeed. huge 're-compression' numbers usually
> come together with OOM-kill and OOM-panic. per my 'testing experience'.
> 
> I'm thinking about making that test a 'general zram' test and post
> it somewhere on github/etc., so we it'll be easier to measure and
> compare things.

Indeed!

> 
> > >  bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
> > >  {
> > > -	return comp->set_max_streams(comp, num_strm);
> > > +	return true;
> > >  }
> > 
> > I like this part. We don't need to remove set_max_stream interface
> > right now. Because percpu might make regression if double comp raio
> > is high. If so, we should roll back so let's wait for two year and
> > if anyting is not wrong, then we could deprecate multi stream
> > interfaces. I hope you are on same page.
> 
> sure. I did this intentionally, because I really don't want to create a
> mess of attrs that are about to retire. we have N attrs that will disappear
> or see a downcast (and we warn users in kernel log) next summer (4.10 or
> 4.11, IIRC). if we add max_comp_stream to the list now then we either should
> abandon our regular "2 years of retirement warn" rule and drop max_comp_stream
> only after 1 year (because it has 1 year of overlap with already scheduled
> for removal attrs) or complicate things for zram users: attrs will be dropped
> in different kernel releases and users are advised to keep that in mind. I
> think it's fine to have it as a NOOP attr as of now and deprecate it during
> the next 'attr cleanup' 2 years cycle. yes, we are on the same page ;)
> 
> > > @@ -356,10 +226,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)
> > 
> > Trivial:
> > We could remove max_strm now and change description.
> 
> oh, yes.
> 
> > > +		zcomp_strm_release(zram->comp, zstrm);
> > > +		zstrm = NULL;
> > > +
> > > +		handle = zs_malloc(meta->mem_pool, clen,
> > > +				GFP_NOIO | __GFP_HIGHMEM);
> > > +		if (handle)
> > > +			goto compress_again;
> > 
> > We can avoid page_zero_filled in second trial but not sure it is worth
> > to do because in my experiment, not easy to make double compression.
> > If I did, the ratio is really small compared total_write so it doesn't
> > need to add new branch to detect it in normal path.
> >
> > Acutally, I asked adding dobule compression stat to you. It seems you
> > forgot but okay. Because I want to change stat code and contents so
> > I will send a patch after I send rework about stat. :)
> 
> aha... I misunderstood you. I thought you talked about the test results
> in particular, not about zram stats in general. hm, given that we don't
> close the door for 'idle compression streams list' yet, and may revert
> per-cpu streams, may be we can introduce this stat in 4.8? otherwise, in
> the worst case, we will have to trim a user visible stat file once again
> IF we, for some reason, will decide to return idle list back (hopefully
> we will not). does it sound OK to you to not touch the stat file now?

My concern is how we can capture such regression without introducing
the stat of recompression? Do you have an idea? :)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-02  8:28       ` Minchan Kim
@ 2016-05-02  9:21         ` Sergey Senozhatsky
  2016-05-03  1:40           ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-02  9:21 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/02/16 17:28), Minchan Kim wrote:
[..]
> > aha... I misunderstood you. I thought you talked about the test results
> > in particular, not about zram stats in general. hm, given that we don't
> > close the door for 'idle compression streams list' yet, and may revert
> > per-cpu streams, may be we can introduce this stat in 4.8? otherwise, in
> > the worst case, we will have to trim a user visible stat file once again
> > IF we, for some reason, will decide to return idle list back (hopefully
> > we will not). does it sound OK to you to not touch the stat file now?
> 
> My concern is how we can capture such regression without introducing
> the stat of recompression? Do you have an idea? :)

...hm...  inc ->failed_writes?

... or as a dirty and ugly and illegal (read "undocumented") hack, we
probably can use ->failed_reads for that purpose. simply because I don't
think any one has ever seen ->failed_reads != 0.

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-02  9:21         ` Sergey Senozhatsky
@ 2016-05-03  1:40           ` Minchan Kim
  2016-05-03  1:53             ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-03  1:40 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel

On Mon, May 02, 2016 at 06:21:57PM +0900, Sergey Senozhatsky wrote:
> On (05/02/16 17:28), Minchan Kim wrote:
> [..]
> > > aha... I misunderstood you. I thought you talked about the test results
> > > in particular, not about zram stats in general. hm, given that we don't
> > > close the door for 'idle compression streams list' yet, and may revert
> > > per-cpu streams, may be we can introduce this stat in 4.8? otherwise, in
> > > the worst case, we will have to trim a user visible stat file once again
> > > IF we, for some reason, will decide to return idle list back (hopefully
> > > we will not). does it sound OK to you to not touch the stat file now?
> > 
> > My concern is how we can capture such regression without introducing
> > the stat of recompression? Do you have an idea? :)
> 
> ...hm...  inc ->failed_writes?
> 
> ... or as a dirty and ugly and illegal (read "undocumented") hack, we
> probably can use ->failed_reads for that purpose. simply because I don't
> think any one has ever seen ->failed_reads != 0.

:(

How about adding new debugfs for zram? And let's put unstable stats or
things we need to debug to there.

I got a lesson from the zs_create_pool discussion a few days ago:
"debugfs is just an optional feature, so it should be okay to not support
it sometimes, e.g., -ENOMEM, -EEXIST and so on", although I'm not sure.

Logically, it might be right and easy to say, but I don't believe it
makes sense practically. Once *userspace* relies on it, we should keep
the rule, no matter what we said in the documentation.

I hope I'm not the only one who thinks so; fortunately, at least one more person does.

                https://lwn.net/Articles/309298/

Okay, let's add the knob to the existing sysfs(There is no different
between sysfs and debugfs with point of userspace once they start to
use it) because no need to add new code to avoid such mess.

Any thoughts?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  1:40           ` Minchan Kim
@ 2016-05-03  1:53             ` Sergey Senozhatsky
  2016-05-03  2:20               ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  1:53 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

Hello Minchan,

On (05/03/16 10:40), Minchan Kim wrote:
> > 
> > ...hm...  inc ->failed_writes?
[..]
> Okay, let's add the knob to the existing sysfs(There is no different
> between sysfs and debugfs with point of userspace once they start to
> use it) because no need to add new code to avoid such mess.
> 
> Any thoughts?

so you don't want to account failed fast-path writes in failed_writes?
it sort of kind of fits, to some extent. re-compression is, basically,
a new write operation -- allocate handle, map the page again, compress,
etc., etc. so in a sense failed fast-path write is _almost_ a failed write,
except that we took extra care of handling it and retried the op inside
of zram, not from bio or fs layer.

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  1:53             ` Sergey Senozhatsky
@ 2016-05-03  2:20               ` Minchan Kim
  2016-05-03  2:30                 ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-03  2:20 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel

Hi Sergey!

On Tue, May 03, 2016 at 10:53:33AM +0900, Sergey Senozhatsky wrote:
> Hello Minchan,
> 
> On (05/03/16 10:40), Minchan Kim wrote:
> > > 
> > > ...hm...  inc ->failed_writes?
> [..]
> > Okay, let's add the knob to the existing sysfs(There is no different
> > between sysfs and debugfs with point of userspace once they start to
> > use it) because no need to add new code to avoid such mess.
> > 
> > Any thoughts?
> 
> so you don't want to account failed fast-path writes in failed_writes?

Yes, I think we cannot reuse the field.

> it sort of kind of fits, to some extent. re-compression is, basically,
> a new write operation -- allocate handle, map the page again, compress,
> etc., etc. so in a sense failed fast-path write is _almost_ a failed write,
> except that we took extra care of handling it and retried the op inside
> of zram, not from bio or fs layer.

Maybe, I don't understand you but my point is simple.
Documentation/ABI/testing/sysfs-block-zram says

What:           /sys/block/zram<id>/failed_writes
Date:           February 2014
Contact:        Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
                The failed_writes file is read-only and specifies the number of
                failed writes happened on this device.

If user read it, they will think failed_writes means write-error on
the device. As well, it's true until now. :)

If we piggy back on that to represent duplicate compression, users
will start to see the increased count and they wonder "Hmm, fs or swap
on zram might be corrupted" and don't use zram any more. Otherwise,
they will report it to us. :) We should explain to them "Hey, it's not
failed write but just duplicated I/O blah blah. Please read document
and we already corrected the wording in the document to represent
current status." which would be painful both users and us.

Rather than piggy backing on failed_writes, actually I want to remove
failed_[read|write] which is no point stat, I think.

We are concerned about reverting back to the non-per-cpu option but actually,
I don't want that. If duplicate compression is really a problem (but it's really
unlikely), we should first try to solve the problem itself in a different way
rather than rolling back to the old code.

I hope we can. So let's not add big worry about adding new dup stat. :)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  2:20               ` Minchan Kim
@ 2016-05-03  2:30                 ` Sergey Senozhatsky
  2016-05-03  4:29                   ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  2:30 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/03/16 11:20), Minchan Kim wrote:
[..]
> We are concerning about returing back to no per-cpu options but actually,
> I don't want. If duplicate compression is really problem(But It's really
> unlikely), we should try to solve the problem itself with different way
> rather than roll-back to old, first of all.
> 
> I hope we can. So let's not add big worry about adding new dup stat. :)

ok, no prob. do you want it a separate sysfs node or a column in mm_stat?
I'd prefer mm_stat column, or somewhere in those cumulative files; not a
dedicated node: we decided to get rid of them some time ago.

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  2:30                 ` Sergey Senozhatsky
@ 2016-05-03  4:29                   ` Sergey Senozhatsky
  2016-05-03  5:03                     ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  4:29 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/03/16 11:30), Sergey Senozhatsky wrote:
> > We are concerning about returing back to no per-cpu options but actually,
> > I don't want. If duplicate compression is really problem(But It's really
> > unlikely), we should try to solve the problem itself with different way
> > rather than roll-back to old, first of all.
> > 
> > I hope we can. So let's not add big worry about adding new dup stat. :)
> 
> ok, no prob. do you want it a separate sysfs node or a column in mm_stat?
> I'd prefer mm_stat column, or somewhere in those cumulative files; not a
> dedicated node: we decided to get rid of them some time ago.
> 

will io_stat node work for you?
I'll submit a formal patch later today. when you have time, can you
take a look at http://marc.info/?l=linux-kernel&m=146217628030970 ?
I think we can fold this one into 0002. it will make 0002 slightly
bigger, but there's nothing complicated in there, just cleanup.

====

From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Subject: [PATCH] zram: export the number of re-compressions

Make the number of re-compressions visible via the io_stat node,
so we will be able to track down any issues caused by per-cpu
compression streams.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
---
 Documentation/blockdev/zram.txt | 3 +++
 drivers/block/zram/zram_drv.c   | 7 +++++--
 drivers/block/zram/zram_drv.h   | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 5bda503..386d260 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -183,6 +183,8 @@ mem_limit         RW    the maximum amount of memory ZRAM can use to store
 pages_compacted   RO    the number of pages freed during compaction
                         (available only via zram<id>/mm_stat node)
 compact           WO    trigger memory compaction
+num_recompress    RO    the number of times fast compression paths failed
+                        and zram performed re-compression via a slow path
 
 WARNING
 =======
@@ -215,6 +217,7 @@ whitespace:
 	failed_writes
 	invalid_io
 	notify_free
+	num_recompress
 
 File /sys/block/zram<id>/mm_stat
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 817e511..11b19c9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -395,7 +395,8 @@ static ssize_t io_stat_show(struct device *dev,
 			(u64)atomic64_read(&zram->stats.failed_reads),
 			(u64)atomic64_read(&zram->stats.failed_writes),
 			(u64)atomic64_read(&zram->stats.invalid_io),
-			(u64)atomic64_read(&zram->stats.notify_free));
+			(u64)atomic64_read(&zram->stats.notify_free),
+			(u64)atomic64_read(&zram->stats.num_recompress));
 	up_read(&zram->init_lock);
 
 	return ret;
@@ -721,8 +722,10 @@ compress_again:
 
 		handle = zs_malloc(meta->mem_pool, clen,
 				GFP_NOIO | __GFP_HIGHMEM);
-		if (handle)
+		if (handle) {
+			atomic64_inc(&zram->stats.num_recompress);
 			goto compress_again;
+		}
 
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 06b1636..78d7e8f 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -85,6 +85,7 @@ struct zram_stats {
 	atomic64_t zero_pages;		/* no. of zero filled pages */
 	atomic64_t pages_stored;	/* no. of pages currently stored */
 	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
+	atomic64_t num_recompress; /* no. of failed compression fast paths */
 };
 
 struct zram_meta {
-- 
2.8.2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  4:29                   ` Sergey Senozhatsky
@ 2016-05-03  5:03                     ` Minchan Kim
  2016-05-03  6:53                       ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-03  5:03 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel

On Tue, May 03, 2016 at 01:29:02PM +0900, Sergey Senozhatsky wrote:
> On (05/03/16 11:30), Sergey Senozhatsky wrote:
> > > We are concerning about returing back to no per-cpu options but actually,
> > > I don't want. If duplicate compression is really problem(But It's really
> > > unlikely), we should try to solve the problem itself with different way
> > > rather than roll-back to old, first of all.
> > > 
> > > I hope we can. So let's not add big worry about adding new dup stat. :)
> > 
> > ok, no prob. do you want it a separate sysfs node or a column in mm_stat?
> > I'd prefer mm_stat column, or somewhere in those cumulative files; not a
> > dedicated node: we decided to get rid of them some time ago.
> > 
> 
> will io_stat node work for you?

Firstly, I thought io_stat would be better. However, on second thought,
I want to withdraw.

I think io_stat should go away.

        failed_read
        failed_write
        invalid_io

I think the above things are really unneeded. If something fails, the upper
layer on top of zram, for example, FSes or Swap, should emit the warning.
So, I don't think we need to maintain it in the zram layer.

        notify_free

It's kind of a discard command from the point of view of the block device,
so I think the general block layer should take care of it like read and
write. If the block layer does that, the only remaining thing about
notify_free is zram_slot_free_notify, so I think we can move it from
io_stat to mm_stat because it's related to memory, not block I/O.

Hoping for the above, I suggest we not add anything to io_stat
any more from now on, and let's remove it at some point.
Instead, let's add a new dup stat.

What do you think about it?


> I'll submit a formal patch later today. when you have time, can you
> take a look at http://marc.info/?l=linux-kernel&m=146217628030970 ?

Oops, Sorry I missed. I will take a look.

> I think we can fold this one into 0002. it will make 0002 slightly
> bigger, but there nothing complicated in there, just cleanup.
> 
> ====
> 
> From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> Subject: [PATCH] zram: export the number of re-compressions
> 
> Make the number of re-compressions visible via the io_stat node,
> so we will be able to track down any issues caused by per-cpu
> compression streams.
> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> Suggested-by: Minchan Kim <minchan@kernel.org>
> ---
>  Documentation/blockdev/zram.txt | 3 +++
>  drivers/block/zram/zram_drv.c   | 7 +++++--
>  drivers/block/zram/zram_drv.h   | 1 +
>  3 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
> index 5bda503..386d260 100644
> --- a/Documentation/blockdev/zram.txt
> +++ b/Documentation/blockdev/zram.txt
> @@ -183,6 +183,8 @@ mem_limit         RW    the maximum amount of memory ZRAM can use to store
>  pages_compacted   RO    the number of pages freed during compaction
>                          (available only via zram<id>/mm_stat node)
>  compact           WO    trigger memory compaction
> +num_recompress    RO    the number of times fast compression paths failed
> +                        and zram performed re-compression via a slow path
>  
>  WARNING
>  =======
> @@ -215,6 +217,7 @@ whitespace:
>  	failed_writes
>  	invalid_io
>  	notify_free
> +	num_recompress
>  
>  File /sys/block/zram<id>/mm_stat
>  
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 817e511..11b19c9 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -395,7 +395,8 @@ static ssize_t io_stat_show(struct device *dev,
>  			(u64)atomic64_read(&zram->stats.failed_reads),
>  			(u64)atomic64_read(&zram->stats.failed_writes),
>  			(u64)atomic64_read(&zram->stats.invalid_io),
> -			(u64)atomic64_read(&zram->stats.notify_free));
> +			(u64)atomic64_read(&zram->stats.notify_free),
> +			(u64)atomic64_read(&zram->stats.num_recompress));
>  	up_read(&zram->init_lock);
>  
>  	return ret;
> @@ -721,8 +722,10 @@ compress_again:
>  
>  		handle = zs_malloc(meta->mem_pool, clen,
>  				GFP_NOIO | __GFP_HIGHMEM);
> -		if (handle)
> +		if (handle) {
> +			atomic64_inc(&zram->stats.num_recompress);
>  			goto compress_again;
> +		}
>  
>  		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
>  			index, clen);
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 06b1636..78d7e8f 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -85,6 +85,7 @@ struct zram_stats {
>  	atomic64_t zero_pages;		/* no. of zero filled pages */
>  	atomic64_t pages_stored;	/* no. of pages currently stored */
>  	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
> +	atomic64_t num_recompress; /* no. of failed compression fast paths */
>  };
>  
>  struct zram_meta {
> -- 
> 2.8.2
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-02  8:06       ` Sergey Senozhatsky
@ 2016-05-03  5:23         ` Minchan Kim
  2016-05-03  5:40           ` Minchan Kim
  2016-05-03  5:44           ` Sergey Senozhatsky
  0 siblings, 2 replies; 23+ messages in thread
From: Minchan Kim @ 2016-05-03  5:23 UTC (permalink / raw)
  To: Sergey Senozhatsky; +Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel

On Mon, May 02, 2016 at 05:06:00PM +0900, Sergey Senozhatsky wrote:
> On (05/02/16 16:25), Sergey Senozhatsky wrote:
> [..]
> > > Trivial:
> > > We could remove max_strm now and change description.
> > 
> > oh, yes.
> 
> how about something like this? remove max_comp_streams entirely, but
> leave the attr. if we keep zram->max_comp_streams and return its value
> (set by user space) from show() handler, we are techically lying;
> because the actual number of streams is now num_online_cpus().

Yes, we should have limited the value to num_online_cpus from the
beginning.

> 
> 
> ===8<===8<===
> 
> From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> Subject: [PATCH] zram: remove max_comp_streams internals
> 
> Remove the internal part of max_comp_streams interface, since we
> switched to per-cpu streams. We will keep RW max_comp_streams attr
> around, because:
> 
> a) we may (silently) switch back to idle compression streams list
>    and don't want to disturb user space
> b) max_comp_streams attr must wait for the next 'lay off cycle';
>    we give user space 2 years to adjust before we remove/downgrade
>    the attr, and there are already several attrs scheduled for
>    removal in 4.11, so it's too late for max_comp_streams.
> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> ---
>  drivers/block/zram/zcomp.c    |  7 +------
>  drivers/block/zram/zcomp.h    |  2 +-
>  drivers/block/zram/zram_drv.c | 47 +++++++++++--------------------------------
>  drivers/block/zram/zram_drv.h |  1 -
>  4 files changed, 14 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
> index d4159e4..d4de9cb 100644
> --- a/drivers/block/zram/zcomp.c
> +++ b/drivers/block/zram/zcomp.c
> @@ -95,11 +95,6 @@ bool zcomp_available_algorithm(const char *comp)
>  	return find_backend(comp) != NULL;
>  }
>  
> -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
> -{
> -	return true;
> -}
> -
>  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
>  {
>  	return *get_cpu_ptr(comp->stream);
> @@ -211,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
>   * case of allocation error, or any other error potentially
>   * returned by functions zcomp_strm_{multi,single}_create.
>   */
> -struct zcomp *zcomp_create(const char *compress, int max_strm)
> +struct zcomp *zcomp_create(const char *compress)
>  {
>  	struct zcomp *comp;
>  	struct zcomp_backend *backend;
> diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
> index aba8c21..ffd88cb 100644
> --- a/drivers/block/zram/zcomp.h
> +++ b/drivers/block/zram/zcomp.h
> @@ -45,7 +45,7 @@ struct zcomp {
>  ssize_t zcomp_available_show(const char *comp, char *buf);
>  bool zcomp_available_algorithm(const char *comp);
>  
> -struct zcomp *zcomp_create(const char *comp, int max_strm);
> +struct zcomp *zcomp_create(const char *comp);
>  void zcomp_destroy(struct zcomp *comp);
>  
>  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index cad1751..817e511 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev,
>  	return len;
>  }
>  
> +/*
> + * We switched to per-cpu streams and this attr is not needed anymore.
> + * However, we will keep it around for some time, because:
> + * a) we may revert per-cpu streams in the future
> + * b) it's visible to user space and we need to follow our 2 years
> + *    retirement rule; but we already have a number of 'soon to be
> + *    altered' attrs, so max_comp_streams need to wait for the next
> + *    layoff cycle.
> + */

Thanks for nice comment.

>  static ssize_t max_comp_streams_show(struct device *dev,
>  		struct device_attribute *attr, char *buf)
>  {
> -	int val;
> -	struct zram *zram = dev_to_zram(dev);
> -
> -	down_read(&zram->init_lock);
> -	val = zram->max_comp_streams;
> -	up_read(&zram->init_lock);
> -
> -	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
> +	return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
>  }
>  
>  static ssize_t max_comp_streams_store(struct device *dev,
>  		struct device_attribute *attr, const char *buf, size_t len)
>  {
> -	int num;
> -	struct zram *zram = dev_to_zram(dev);
> -	int ret;
> -
> -	ret = kstrtoint(buf, 0, &num);
> -	if (ret < 0)
> -		return ret;
> -	if (num < 1)
> -		return -EINVAL;
> -
> -	down_write(&zram->init_lock);
> -	if (init_done(zram)) {
> -		if (!zcomp_set_max_streams(zram->comp, num)) {
> -			pr_info("Cannot change max compression streams\n");
> -			ret = -EINVAL;
> -			goto out;
> -		}
> -	}
> -
> -	zram->max_comp_streams = num;
> -	ret = len;
> -out:
> -	up_write(&zram->init_lock);
> -	return ret;

At least, we need sanity check code, still?
Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream" and then
cat /sys/xxx/max_comp_stream returns num_online_cpus.


> +	return len;
>  }
>  
>  static ssize_t comp_algorithm_show(struct device *dev,
> @@ -1035,7 +1014,6 @@ static void zram_reset_device(struct zram *zram)
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
>  	zram->disksize = 0;
> -	zram->max_comp_streams = 1;
>  
>  	set_capacity(zram->disk, 0);
>  	part_stat_set_all(&zram->disk->part0, 0);
> @@ -1064,7 +1042,7 @@ static ssize_t disksize_store(struct device *dev,
>  	if (!meta)
>  		return -ENOMEM;
>  
> -	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> +	comp = zcomp_create(zram->compressor);
>  	if (IS_ERR(comp)) {
>  		pr_err("Cannot initialise %s compressing backend\n",
>  				zram->compressor);
> @@ -1299,7 +1277,6 @@ static int zram_add(void)
>  	}
>  	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
>  	zram->meta = NULL;
> -	zram->max_comp_streams = 1;
>  
>  	pr_info("Added device: %s\n", zram->disk->disk_name);
>  	return device_id;
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 8e92339..06b1636 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -102,7 +102,6 @@ struct zram {
>  	 * the number of pages zram can consume for storing compressed data
>  	 */
>  	unsigned long limit_pages;
> -	int max_comp_streams;
>  
>  	struct zram_stats stats;
>  	atomic_t refcount; /* refcount for zram_meta */
> -- 
> 2.8.2
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  5:23         ` Minchan Kim
@ 2016-05-03  5:40           ` Minchan Kim
  2016-05-03  5:57             ` Sergey Senozhatsky
  2016-05-03  5:44           ` Sergey Senozhatsky
  1 sibling, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-03  5:40 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On Tue, May 03, 2016 at 02:23:24PM +0900, Minchan Kim wrote:
> On Mon, May 02, 2016 at 05:06:00PM +0900, Sergey Senozhatsky wrote:
> > On (05/02/16 16:25), Sergey Senozhatsky wrote:
> > [..]
> > > > Trivial:
> > > > We could remove max_strm now and change description.
> > > 
> > > oh, yes.
> > 
> > how about something like this? remove max_comp_streams entirely, but
> > leave the attr. if we keep zram->max_comp_streams and return its value
> > (set by user space) from show() handler, we are techically lying;
> > because the actual number of streams is now num_online_cpus().
> 
> Yes, we should have limit the value to num_online_cpus from the
> beginning.
> 
> > 
> > 
> > ===8<===8<===
> > 
> > From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> > Subject: [PATCH] zram: remove max_comp_streams internals
> > 
> > Remove the internal part of max_comp_streams interface, since we
> > switched to per-cpu streams. We will keep RW max_comp_streams attr
> > around, because:
> > 
> > a) we may (silently) switch back to idle compression streams list
> >    and don't want to disturb user space
> > b) max_comp_streams attr must wait for the next 'lay off cycle';
> >    we give user space 2 years to adjust before we remove/downgrade
> >    the attr, and there are already several attrs scheduled for
> >    removal in 4.11, so it's too late for max_comp_streams.
> > 
> > Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> > ---
> >  drivers/block/zram/zcomp.c    |  7 +------
> >  drivers/block/zram/zcomp.h    |  2 +-
> >  drivers/block/zram/zram_drv.c | 47 +++++++++++--------------------------------
> >  drivers/block/zram/zram_drv.h |  1 -
> >  4 files changed, 14 insertions(+), 43 deletions(-)
> > 
> > diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
> > index d4159e4..d4de9cb 100644
> > --- a/drivers/block/zram/zcomp.c
> > +++ b/drivers/block/zram/zcomp.c
> > @@ -95,11 +95,6 @@ bool zcomp_available_algorithm(const char *comp)
> >  	return find_backend(comp) != NULL;
> >  }
> >  
> > -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
> > -{
> > -	return true;
> > -}
> > -
> >  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
> >  {
> >  	return *get_cpu_ptr(comp->stream);
> > @@ -211,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
> >   * case of allocation error, or any other error potentially
> >   * returned by functions zcomp_strm_{multi,single}_create.
> >   */
> > -struct zcomp *zcomp_create(const char *compress, int max_strm)
> > +struct zcomp *zcomp_create(const char *compress)
> >  {
> >  	struct zcomp *comp;
> >  	struct zcomp_backend *backend;
> > diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
> > index aba8c21..ffd88cb 100644
> > --- a/drivers/block/zram/zcomp.h
> > +++ b/drivers/block/zram/zcomp.h
> > @@ -45,7 +45,7 @@ struct zcomp {
> >  ssize_t zcomp_available_show(const char *comp, char *buf);
> >  bool zcomp_available_algorithm(const char *comp);
> >  
> > -struct zcomp *zcomp_create(const char *comp, int max_strm);
> > +struct zcomp *zcomp_create(const char *comp);
> >  void zcomp_destroy(struct zcomp *comp);
> >  
> >  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index cad1751..817e511 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev,
> >  	return len;
> >  }
> >  
> > +/*
> > + * We switched to per-cpu streams and this attr is not needed anymore.
> > + * However, we will keep it around for some time, because:
> > + * a) we may revert per-cpu streams in the future
> > + * b) it's visible to user space and we need to follow our 2 years
> > + *    retirement rule; but we already have a number of 'soon to be
> > + *    altered' attrs, so max_comp_streams need to wait for the next
> > + *    layoff cycle.
> > + */
> 
> Thanks for nice comment.
> 
> >  static ssize_t max_comp_streams_show(struct device *dev,
> >  		struct device_attribute *attr, char *buf)
> >  {
> > -	int val;
> > -	struct zram *zram = dev_to_zram(dev);
> > -
> > -	down_read(&zram->init_lock);
> > -	val = zram->max_comp_streams;
> > -	up_read(&zram->init_lock);
> > -
> > -	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
> > +	return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
> >  }
> >  
> >  static ssize_t max_comp_streams_store(struct device *dev,
> >  		struct device_attribute *attr, const char *buf, size_t len)
> >  {
> > -	int num;
> > -	struct zram *zram = dev_to_zram(dev);
> > -	int ret;
> > -
> > -	ret = kstrtoint(buf, 0, &num);
> > -	if (ret < 0)
> > -		return ret;
> > -	if (num < 1)
> > -		return -EINVAL;
> > -
> > -	down_write(&zram->init_lock);
> > -	if (init_done(zram)) {
> > -		if (!zcomp_set_max_streams(zram->comp, num)) {
> > -			pr_info("Cannot change max compression streams\n");
> > -			ret = -EINVAL;
> > -			goto out;
> > -		}
> > -	}
> > -
> > -	zram->max_comp_streams = num;
> > -	ret = len;
> > -out:
> > -	up_write(&zram->init_lock);
> > -	return ret;
> 
> At least, we need sanity check code, still?
> Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream" and then
> cat /sys/xxx/max_comp_stream returns num_online_cpus.

One more thing,

User:
echo 4 > /sys/xxx/max_comp_stream"
cat /sys/xxx/max_comp_streams
8

which is rather weird?

We should keep the user's value and return it to the user although it's
technically lying. IMO, it would be the best way to prevent confusion for
the user until we finally remove max_comp_streams.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  5:23         ` Minchan Kim
  2016-05-03  5:40           ` Minchan Kim
@ 2016-05-03  5:44           ` Sergey Senozhatsky
  1 sibling, 0 replies; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  5:44 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/03/16 14:23), Minchan Kim wrote:
[..]
> > -	zram->max_comp_streams = num;
> > -	ret = len;
> > -out:
> > -	up_write(&zram->init_lock);
> > -	return ret;
> 
> At least, we need sanity check code, still?
> Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream" and then
> cat /sys/xxx/max_comp_stream returns num_online_cpus.

hm, I couldn't find any reason to keep the check. we completely
ignore the value anyway, cat /sys/xxx/max_comp_stream will always
return num_online_cpus(), regardless the correctness of supplied
data; `garbage', `2', `1024', `32' make no difference.

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  5:40           ` Minchan Kim
@ 2016-05-03  5:57             ` Sergey Senozhatsky
  2016-05-03  6:19               ` Minchan Kim
  0 siblings, 1 reply; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  5:57 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/03/16 14:40), Minchan Kim wrote:
[..]
> > At least, we need sanity check code, still?
> > Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream" and then
> > cat /sys/xxx/max_comp_stream returns num_online_cpus.
> 
> One more thing,
> 
> User:
> echo 4 > /sys/xxx/max_comp_stream"
> cat /sys/xxx/max_comp_streams
> 8

sure, it can also be

cat /sys/xxx/max_comp_streams
5
cat /sys/xxx/max_comp_streams
6
cat /sys/xxx/max_comp_streams
7
cat /sys/xxx/max_comp_streams
3

depending on the availability of CPUs. but why would user space
constantly check max_comp_streams?

> which is rather weird?
> 
> We should keep user's value and return it to user although it's techically
> lying. IMO, it would be best way to prevent confusing for user until we
> removes max_comp_streams finally.

well, I preferred to show the actual state of the device. besides,
does anyone really do

	write buffer to file
	if (success)
		read from file and compare with the buffer

?

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  5:57             ` Sergey Senozhatsky
@ 2016-05-03  6:19               ` Minchan Kim
  2016-05-03  7:01                 ` Sergey Senozhatsky
  0 siblings, 1 reply; 23+ messages in thread
From: Minchan Kim @ 2016-05-03  6:19 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Minchan Kim, Sergey Senozhatsky, Andrew Morton, linux-kernel

On Tue, May 03, 2016 at 02:57:15PM +0900, Sergey Senozhatsky wrote:
> On (05/03/16 14:40), Minchan Kim wrote:
> [..]
> > > At least, we need sanity check code, still?
> > > Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream" and then
> > > cat /sys/xxx/max_comp_stream returns num_online_cpus.
> > 
> > One more thing,
> > 
> > User:
> > echo 4 > /sys/xxx/max_comp_stream"
> > cat /sys/xxx/max_comp_streams
> > 8
> 
> sure, it can also be
> 
> cat /sys/xxx/max_comp_streams
> 5
> cat /sys/xxx/max_comp_streams
> 6
> cat /sys/xxx/max_comp_streams
> 7
> cat /sys/xxx/max_comp_streams
> 3
> 
> depending on the availability of CPUs. but why would user space
> constantly check max_comp_streams?
> 
> > which is rather weird?
> > 
> > We should keep user's value and return it to user although it's techically
> > lying. IMO, it would be best way to prevent confusing for user until we
> > removes max_comp_streams finally.
> 
> well, I preferred to show the actual state of the device. besides,
> does anyone really do
> 
> 	write buffer to file
> 	if (success)
> 		read from file and compare with the buffer
> 
> ?
> 

Okay, I want to go with your approach!
Could you update zram.txt to reflect it?

Thanks.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  5:03                     ` Minchan Kim
@ 2016-05-03  6:53                       ` Sergey Senozhatsky
  0 siblings, 0 replies; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  6:53 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/03/16 14:03), Minchan Kim wrote:
> > will io_stat node work for you?
> 
> Firstly, I thought io_stat would be better. However, on second thought,
> I want to withdraw.
> 
> I think io_stat should go away.
> 
>         failed_read
>         failed_write
>         invalid_io
> 
> I think Above things are really unneeded. If something is fail, upper
> class on top of zram, for example, FSes or Swap should emit the warning.
> So, I don't think we need to maintain it in zram layer.
> 
>         notify_free
> 
> It's kins of discard command for the point of block device so I think
> general block should take care of it like read and write. If block will
> do it, remained thing about notify_free is only zram_slot_free_notify
> so I think we can move it from io_stat to mm_stat because it's related
> to memory, not block I/O.
> 
> With hoping with above things, I suggest let's not add anything to
> io_stat any more from now on and let's remove it sometime.
> Instead of it, let's add new dup stat.
> 
> What do you think about it?

I agree and it's true that we export a number of senseless and useless
stats (well, to most of the users). what's your plan regarding the
num_recompress file? is it guaranteed to go away once we convince ourselves
that per-cpu streams are fine (1 or 2 years later) or will it stay? if it's
100% 'temporal' then _probably_ we can instead add a per-device
/sys/.../zramID/debug_file  (or similar name), document it as
  "IT WILL NEVER BE DOCUMENTED, IT WILL NEVER BE STABLE, DON'T EVER USE IT"
and export there anything we want anytime we want. or is it too childish?

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] zram: user per-cpu compression streams
  2016-05-03  6:19               ` Minchan Kim
@ 2016-05-03  7:01                 ` Sergey Senozhatsky
  0 siblings, 0 replies; 23+ messages in thread
From: Sergey Senozhatsky @ 2016-05-03  7:01 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton, linux-kernel

On (05/03/16 15:19), Minchan Kim wrote:
> > > > cat /sys/xxx/max_comp_stream returns num_online_cpus.
> > > 
> > > One more thing,
> > > 
> > > User:
> > > echo 4 > /sys/xxx/max_comp_stream"
> > > cat /sys/xxx/max_comp_streams
> > > 8
> > 
> > sure, it can also be
> > 
> > cat /sys/xxx/max_comp_streams
> > 5
> > cat /sys/xxx/max_comp_streams
> > 6
> > cat /sys/xxx/max_comp_streams
> > 7
> > cat /sys/xxx/max_comp_streams
> > 3
> > 
> > depending on the availability of CPUs. but why would user space
> > constantly check max_comp_streams?
> > 
> > > which is rather weird?
> > > 
> > > We should keep user's value and return it to user although it's techically
> > > lying. IMO, it would be best way to prevent confusing for user until we
> > > removes max_comp_streams finally.
> > 
> > well, I preferred to show the actual state of the device. besides,
> > does anyone really do
> > 
> > 	write buffer to file
> > 	if (success)
> > 		read from file and compare with the buffer
> > 
> > ?
> > 
> 
> Okay, I want to go with your approach!

thanks. I mean that was my thinking when I decided to change the
max_comp_streams output. but no pressure, it does change the numbers
that user space will see. don't have any strong opinion, can keep it
as zcomp cleanup only -- w/o touching the _show()/_store() parts.

> Could you update zram.txt to reflect it?

will do later today. I think I'd prefer to keep it as independent
patch, since it does change the user visible behaviour after all (no
idea if it's true tho; can't easily think why would anyone keep track
of the values returned by cat /sys/xxx/max_comp_streams), so we can
revert it w/o reverting the per-cpu streams IF anyone/anything will
get upset with the change.

	-ss

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2016-05-03  6:59 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-28 16:17 [PATCH 0/2] zram: switch to per-cpu compression streams Sergey Senozhatsky
2016-04-28 16:17 ` [PATCH 1/2] zsmalloc: require GFP in zs_malloc() Sergey Senozhatsky
2016-04-29  5:44   ` Minchan Kim
2016-04-29  7:30     ` Sergey Senozhatsky
2016-04-28 16:17 ` [PATCH 2/2] zram: user per-cpu compression streams Sergey Senozhatsky
2016-05-02  6:23   ` Minchan Kim
2016-05-02  7:25     ` Sergey Senozhatsky
2016-05-02  8:06       ` Sergey Senozhatsky
2016-05-03  5:23         ` Minchan Kim
2016-05-03  5:40           ` Minchan Kim
2016-05-03  5:57             ` Sergey Senozhatsky
2016-05-03  6:19               ` Minchan Kim
2016-05-03  7:01                 ` Sergey Senozhatsky
2016-05-03  5:44           ` Sergey Senozhatsky
2016-05-02  8:28       ` Minchan Kim
2016-05-02  9:21         ` Sergey Senozhatsky
2016-05-03  1:40           ` Minchan Kim
2016-05-03  1:53             ` Sergey Senozhatsky
2016-05-03  2:20               ` Minchan Kim
2016-05-03  2:30                 ` Sergey Senozhatsky
2016-05-03  4:29                   ` Sergey Senozhatsky
2016-05-03  5:03                     ` Minchan Kim
2016-05-03  6:53                       ` Sergey Senozhatsky

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.