zram/zcomp: use GFP_NOIO to allocate streams
diff mbox series

Message ID 1448285279-4013-1-git-send-email-sergey.senozhatsky@gmail.com
State New, archived
Headers show
Series
  • zram/zcomp: use GFP_NOIO to allocate streams
Related show

Commit Message

Sergey Senozhatsky Nov. 23, 2015, 1:27 p.m. UTC
We can end up allocating a new compression stream with GFP_KERNEL
from within the IO path, which may result is nested (recursive) IO
operations. That can introduce problems if the IO path in question
is a reclaimer, holding some locks that will deadlock nested IOs.

Allocate streams and working memory using GFP_NOIO flag, forbidding
recursive IO and FS operations.

An example:

[  747.233722] inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage.
[  747.233724] git/20158 [HC0[0]:SC0[0]:HE1:SE1] takes:
[  747.233725]  (jbd2_handle){+.+.?.}, at: [<ffffffff811e31db>] start_this_handle+0x4ca/0x555
[  747.233733] {IN-RECLAIM_FS-W} state was registered at:
[  747.233735]   [<ffffffff8107b8e9>] __lock_acquire+0x8da/0x117b
[  747.233738]   [<ffffffff8107c950>] lock_acquire+0x10c/0x1a7
[  747.233740]   [<ffffffff811e323e>] start_this_handle+0x52d/0x555
[  747.233742]   [<ffffffff811e331a>] jbd2__journal_start+0xb4/0x237
[  747.233744]   [<ffffffff811cc6c7>] __ext4_journal_start_sb+0x108/0x17e
[  747.233748]   [<ffffffff811a90bf>] ext4_dirty_inode+0x32/0x61
[  747.233750]   [<ffffffff8115f37e>] __mark_inode_dirty+0x16b/0x60c
[  747.233754]   [<ffffffff81150ad6>] iput+0x11e/0x274
[  747.233757]   [<ffffffff8114bfbd>] __dentry_kill+0x148/0x1b8
[  747.233759]   [<ffffffff8114c9d9>] shrink_dentry_list+0x274/0x44a
[  747.233761]   [<ffffffff8114d38a>] prune_dcache_sb+0x4a/0x55
[  747.233763]   [<ffffffff8113b1ad>] super_cache_scan+0xfc/0x176
[  747.233767]   [<ffffffff810fa089>] shrink_slab.part.14.constprop.25+0x2a2/0x4d3
[  747.233770]   [<ffffffff810fcccb>] shrink_zone+0x74/0x140
[  747.233772]   [<ffffffff810fd924>] kswapd+0x6b7/0x930
[  747.233774]   [<ffffffff81058887>] kthread+0x107/0x10f
[  747.233778]   [<ffffffff814fadff>] ret_from_fork+0x3f/0x70
[  747.233783] irq event stamp: 138297
[  747.233784] hardirqs last  enabled at (138297): [<ffffffff8107aff3>] debug_check_no_locks_freed+0x113/0x12f
[  747.233786] hardirqs last disabled at (138296): [<ffffffff8107af13>] debug_check_no_locks_freed+0x33/0x12f
[  747.233788] softirqs last  enabled at (137818): [<ffffffff81040f89>] __do_softirq+0x2d3/0x3e9
[  747.233792] softirqs last disabled at (137813): [<ffffffff81041292>] irq_exit+0x41/0x95
[  747.233794]
               other info that might help us debug this:
[  747.233796]  Possible unsafe locking scenario:
[  747.233797]        CPU0
[  747.233798]        ----
[  747.233799]   lock(jbd2_handle);
[  747.233801]   <Interrupt>
[  747.233801]     lock(jbd2_handle);
[  747.233803]
                *** DEADLOCK ***
[  747.233805] 5 locks held by git/20158:
[  747.233806]  #0:  (sb_writers#7){.+.+.+}, at: [<ffffffff81155411>] mnt_want_write+0x24/0x4b
[  747.233811]  #1:  (&type->i_mutex_dir_key#2/1){+.+.+.}, at: [<ffffffff81145087>] lock_rename+0xd9/0xe3
[  747.233817]  #2:  (&sb->s_type->i_mutex_key#11){+.+.+.}, at: [<ffffffff8114f8e2>] lock_two_nondirectories+0x3f/0x6b
[  747.233822]  #3:  (&sb->s_type->i_mutex_key#11/4){+.+.+.}, at: [<ffffffff8114f909>] lock_two_nondirectories+0x66/0x6b
[  747.233827]  #4:  (jbd2_handle){+.+.?.}, at: [<ffffffff811e31db>] start_this_handle+0x4ca/0x555
[  747.233831]
               stack backtrace:
[  747.233834] CPU: 2 PID: 20158 Comm: git Not tainted 4.1.0-rc7-next-20150615-dbg-00016-g8bdf555-dirty #211
[  747.233837]  ffff8800a56cea40 ffff88010d0a75f8 ffffffff814f446d ffffffff81077036
[  747.233840]  ffffffff823a84b0 ffff88010d0a7638 ffffffff814f3849 0000000000000001
[  747.233843]  000000000000000a ffff8800a56cf6f8 ffff8800a56cea40 ffffffff810795dd
[  747.233846] Call Trace:
[  747.233849]  [<ffffffff814f446d>] dump_stack+0x4c/0x6e
[  747.233852]  [<ffffffff81077036>] ? up+0x39/0x3e
[  747.233854]  [<ffffffff814f3849>] print_usage_bug.part.23+0x25b/0x26a
[  747.233857]  [<ffffffff810795dd>] ? print_shortest_lock_dependencies+0x182/0x182
[  747.233859]  [<ffffffff8107a9c9>] mark_lock+0x384/0x56d
[  747.233862]  [<ffffffff8107ac11>] mark_held_locks+0x5f/0x76
[  747.233865]  [<ffffffffa023d2f3>] ? zcomp_strm_alloc+0x25/0x73 [zram]
[  747.233867]  [<ffffffff8107d13b>] lockdep_trace_alloc+0xb2/0xb5
[  747.233870]  [<ffffffff8112bac7>] kmem_cache_alloc_trace+0x32/0x1e2
[  747.233873]  [<ffffffffa023d2f3>] zcomp_strm_alloc+0x25/0x73 [zram]
[  747.233876]  [<ffffffffa023d428>] zcomp_strm_multi_find+0xe7/0x173 [zram]
[  747.233879]  [<ffffffffa023d58b>] zcomp_strm_find+0xc/0xe [zram]
[  747.233881]  [<ffffffffa023f292>] zram_bvec_rw+0x2ca/0x7e0 [zram]
[  747.233885]  [<ffffffffa023fa8c>] zram_make_request+0x1fa/0x301 [zram]
[  747.233889]  [<ffffffff812142f8>] generic_make_request+0x9c/0xdb
[  747.233891]  [<ffffffff8121442e>] submit_bio+0xf7/0x120
[  747.233895]  [<ffffffff810f1c0c>] ? __test_set_page_writeback+0x1a0/0x1b8
[  747.233897]  [<ffffffff811a9d00>] ext4_io_submit+0x2e/0x43
[  747.233899]  [<ffffffff811a9efa>] ext4_bio_write_page+0x1b7/0x300
[  747.233902]  [<ffffffff811a2106>] mpage_submit_page+0x60/0x77
[  747.233905]  [<ffffffff811a25b0>] mpage_map_and_submit_buffers+0x10f/0x21d
[  747.233907]  [<ffffffff811a6814>] ext4_writepages+0xc8c/0xe1b
[  747.233910]  [<ffffffff810f3f77>] do_writepages+0x23/0x2c
[  747.233913]  [<ffffffff810ea5d1>] __filemap_fdatawrite_range+0x84/0x8b
[  747.233915]  [<ffffffff810ea657>] filemap_flush+0x1c/0x1e
[  747.233917]  [<ffffffff811a3851>] ext4_alloc_da_blocks+0xb8/0x117
[  747.233919]  [<ffffffff811af52a>] ext4_rename+0x132/0x6dc
[  747.233921]  [<ffffffff8107ac11>] ? mark_held_locks+0x5f/0x76
[  747.233924]  [<ffffffff811afafd>] ext4_rename2+0x29/0x2b
[  747.233926]  [<ffffffff811427ea>] vfs_rename+0x540/0x636
[  747.233928]  [<ffffffff81146a01>] SyS_renameat2+0x359/0x44d
[  747.233931]  [<ffffffff81146b26>] SyS_rename+0x1e/0x20
[  747.233933]  [<ffffffff814faa17>] entry_SYSCALL_64_fastpath+0x12/0x6f

The patch also does some very trivial cosmetic tweaks, not worth
of a separate patch.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
---
 drivers/block/zram/zcomp.c     |  4 ++--
 drivers/block/zram/zcomp_lz4.c | 12 ++++++++----
 drivers/block/zram/zcomp_lzo.c | 12 ++++++++----
 3 files changed, 18 insertions(+), 10 deletions(-)

Comments

Andrew Morton Nov. 23, 2015, 11:18 p.m. UTC | #1
On Mon, 23 Nov 2015 22:27:59 +0900 Sergey Senozhatsky <sergey.senozhatsky@gmail.com> wrote:

> We can end up allocating a new compression stream with GFP_KERNEL
> from within the IO path, which may result is nested (recursive) IO
> operations. That can introduce problems if the IO path in question
> is a reclaimer, holding some locks that will deadlock nested IOs.
> 
> Allocate streams and working memory using GFP_NOIO flag, forbidding
> recursive IO and FS operations.
> 
> ...
>
> --- a/drivers/block/zram/zcomp.c
> +++ b/drivers/block/zram/zcomp.c
> @@ -76,7 +76,7 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
>   */
>  static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
>  {
> -	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
> +	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO);
>  	if (!zstrm)
>  		return NULL;
>  
> @@ -85,7 +85,7 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
>  	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
>  	 * case when compressed size is larger than the original one
>  	 */
> -	zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
> +	zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1);

OK.

> --- a/drivers/block/zram/zcomp_lz4.c
> +++ b/drivers/block/zram/zcomp_lz4.c
> @@ -20,10 +20,13 @@ static void *zcomp_lz4_create(void)
>  	void *ret;
>  
>  	ret = kzalloc(LZ4_MEM_COMPRESS,
> -			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
> -	if (!ret)
> -		ret = vzalloc(LZ4_MEM_COMPRESS);
> -	return ret;
> +			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);

But here we've still lost __GFP_RECLAIM, unnecessarily.  And it's quite
unclear why __GFP_NORETRY and __GFP_NOMEMALLOC are being used.

IOW, why not simply use (GFP_NOIO|__GFP_NOWARN)?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Minchan Kim Nov. 23, 2015, 11:23 p.m. UTC | #2
Hello Sergey,

On Mon, Nov 23, 2015 at 10:27:59PM +0900, Sergey Senozhatsky wrote:
> We can end up allocating a new compression stream with GFP_KERNEL
> from within the IO path, which may result is nested (recursive) IO
> operations. That can introduce problems if the IO path in question
> is a reclaimer, holding some locks that will deadlock nested IOs.
> 
> Allocate streams and working memory using GFP_NOIO flag, forbidding
> recursive IO and FS operations.
> 
> An example:
> 
> [  747.233722] inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage.
> [  747.233724] git/20158 [HC0[0]:SC0[0]:HE1:SE1] takes:
> [  747.233725]  (jbd2_handle){+.+.?.}, at: [<ffffffff811e31db>] start_this_handle+0x4ca/0x555
> [  747.233733] {IN-RECLAIM_FS-W} state was registered at:
> [  747.233735]   [<ffffffff8107b8e9>] __lock_acquire+0x8da/0x117b
> [  747.233738]   [<ffffffff8107c950>] lock_acquire+0x10c/0x1a7
> [  747.233740]   [<ffffffff811e323e>] start_this_handle+0x52d/0x555
> [  747.233742]   [<ffffffff811e331a>] jbd2__journal_start+0xb4/0x237
> [  747.233744]   [<ffffffff811cc6c7>] __ext4_journal_start_sb+0x108/0x17e
> [  747.233748]   [<ffffffff811a90bf>] ext4_dirty_inode+0x32/0x61
> [  747.233750]   [<ffffffff8115f37e>] __mark_inode_dirty+0x16b/0x60c
> [  747.233754]   [<ffffffff81150ad6>] iput+0x11e/0x274
> [  747.233757]   [<ffffffff8114bfbd>] __dentry_kill+0x148/0x1b8
> [  747.233759]   [<ffffffff8114c9d9>] shrink_dentry_list+0x274/0x44a
> [  747.233761]   [<ffffffff8114d38a>] prune_dcache_sb+0x4a/0x55
> [  747.233763]   [<ffffffff8113b1ad>] super_cache_scan+0xfc/0x176
> [  747.233767]   [<ffffffff810fa089>] shrink_slab.part.14.constprop.25+0x2a2/0x4d3
> [  747.233770]   [<ffffffff810fcccb>] shrink_zone+0x74/0x140
> [  747.233772]   [<ffffffff810fd924>] kswapd+0x6b7/0x930
> [  747.233774]   [<ffffffff81058887>] kthread+0x107/0x10f
> [  747.233778]   [<ffffffff814fadff>] ret_from_fork+0x3f/0x70
> [  747.233783] irq event stamp: 138297
> [  747.233784] hardirqs last  enabled at (138297): [<ffffffff8107aff3>] debug_check_no_locks_freed+0x113/0x12f
> [  747.233786] hardirqs last disabled at (138296): [<ffffffff8107af13>] debug_check_no_locks_freed+0x33/0x12f
> [  747.233788] softirqs last  enabled at (137818): [<ffffffff81040f89>] __do_softirq+0x2d3/0x3e9
> [  747.233792] softirqs last disabled at (137813): [<ffffffff81041292>] irq_exit+0x41/0x95
> [  747.233794]
>                other info that might help us debug this:
> [  747.233796]  Possible unsafe locking scenario:
> [  747.233797]        CPU0
> [  747.233798]        ----
> [  747.233799]   lock(jbd2_handle);
> [  747.233801]   <Interrupt>
> [  747.233801]     lock(jbd2_handle);
> [  747.233803]
>                 *** DEADLOCK ***
> [  747.233805] 5 locks held by git/20158:
> [  747.233806]  #0:  (sb_writers#7){.+.+.+}, at: [<ffffffff81155411>] mnt_want_write+0x24/0x4b
> [  747.233811]  #1:  (&type->i_mutex_dir_key#2/1){+.+.+.}, at: [<ffffffff81145087>] lock_rename+0xd9/0xe3
> [  747.233817]  #2:  (&sb->s_type->i_mutex_key#11){+.+.+.}, at: [<ffffffff8114f8e2>] lock_two_nondirectories+0x3f/0x6b
> [  747.233822]  #3:  (&sb->s_type->i_mutex_key#11/4){+.+.+.}, at: [<ffffffff8114f909>] lock_two_nondirectories+0x66/0x6b
> [  747.233827]  #4:  (jbd2_handle){+.+.?.}, at: [<ffffffff811e31db>] start_this_handle+0x4ca/0x555
> [  747.233831]
>                stack backtrace:
> [  747.233834] CPU: 2 PID: 20158 Comm: git Not tainted 4.1.0-rc7-next-20150615-dbg-00016-g8bdf555-dirty #211
> [  747.233837]  ffff8800a56cea40 ffff88010d0a75f8 ffffffff814f446d ffffffff81077036
> [  747.233840]  ffffffff823a84b0 ffff88010d0a7638 ffffffff814f3849 0000000000000001
> [  747.233843]  000000000000000a ffff8800a56cf6f8 ffff8800a56cea40 ffffffff810795dd
> [  747.233846] Call Trace:
> [  747.233849]  [<ffffffff814f446d>] dump_stack+0x4c/0x6e
> [  747.233852]  [<ffffffff81077036>] ? up+0x39/0x3e
> [  747.233854]  [<ffffffff814f3849>] print_usage_bug.part.23+0x25b/0x26a
> [  747.233857]  [<ffffffff810795dd>] ? print_shortest_lock_dependencies+0x182/0x182
> [  747.233859]  [<ffffffff8107a9c9>] mark_lock+0x384/0x56d
> [  747.233862]  [<ffffffff8107ac11>] mark_held_locks+0x5f/0x76
> [  747.233865]  [<ffffffffa023d2f3>] ? zcomp_strm_alloc+0x25/0x73 [zram]
> [  747.233867]  [<ffffffff8107d13b>] lockdep_trace_alloc+0xb2/0xb5
> [  747.233870]  [<ffffffff8112bac7>] kmem_cache_alloc_trace+0x32/0x1e2
> [  747.233873]  [<ffffffffa023d2f3>] zcomp_strm_alloc+0x25/0x73 [zram]
> [  747.233876]  [<ffffffffa023d428>] zcomp_strm_multi_find+0xe7/0x173 [zram]
> [  747.233879]  [<ffffffffa023d58b>] zcomp_strm_find+0xc/0xe [zram]
> [  747.233881]  [<ffffffffa023f292>] zram_bvec_rw+0x2ca/0x7e0 [zram]
> [  747.233885]  [<ffffffffa023fa8c>] zram_make_request+0x1fa/0x301 [zram]
> [  747.233889]  [<ffffffff812142f8>] generic_make_request+0x9c/0xdb
> [  747.233891]  [<ffffffff8121442e>] submit_bio+0xf7/0x120
> [  747.233895]  [<ffffffff810f1c0c>] ? __test_set_page_writeback+0x1a0/0x1b8
> [  747.233897]  [<ffffffff811a9d00>] ext4_io_submit+0x2e/0x43
> [  747.233899]  [<ffffffff811a9efa>] ext4_bio_write_page+0x1b7/0x300
> [  747.233902]  [<ffffffff811a2106>] mpage_submit_page+0x60/0x77
> [  747.233905]  [<ffffffff811a25b0>] mpage_map_and_submit_buffers+0x10f/0x21d
> [  747.233907]  [<ffffffff811a6814>] ext4_writepages+0xc8c/0xe1b
> [  747.233910]  [<ffffffff810f3f77>] do_writepages+0x23/0x2c
> [  747.233913]  [<ffffffff810ea5d1>] __filemap_fdatawrite_range+0x84/0x8b
> [  747.233915]  [<ffffffff810ea657>] filemap_flush+0x1c/0x1e
> [  747.233917]  [<ffffffff811a3851>] ext4_alloc_da_blocks+0xb8/0x117
> [  747.233919]  [<ffffffff811af52a>] ext4_rename+0x132/0x6dc
> [  747.233921]  [<ffffffff8107ac11>] ? mark_held_locks+0x5f/0x76
> [  747.233924]  [<ffffffff811afafd>] ext4_rename2+0x29/0x2b
> [  747.233926]  [<ffffffff811427ea>] vfs_rename+0x540/0x636
> [  747.233928]  [<ffffffff81146a01>] SyS_renameat2+0x359/0x44d
> [  747.233931]  [<ffffffff81146b26>] SyS_rename+0x1e/0x20
> [  747.233933]  [<ffffffff814faa17>] entry_SYSCALL_64_fastpath+0x12/0x6f
> 
> The patch also does some very trivial cosmetic tweaks, not worth
> of a separate patch.

I assume you saw real problem and tested this patch. It means
it's -stable material. If so, let's send this patch to -stable
without cosmetic change and let's drop vmalloc part for the
convenience for stable. Instead, we could apply your patch first
than Kyeongdon's one and Kyeongdon can resend his patch with fixing
vmalloc part.


> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> ---
>  drivers/block/zram/zcomp.c     |  4 ++--
>  drivers/block/zram/zcomp_lz4.c | 12 ++++++++----
>  drivers/block/zram/zcomp_lzo.c | 12 ++++++++----
>  3 files changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
> index 5cb13ca..c536177 100644
> --- a/drivers/block/zram/zcomp.c
> +++ b/drivers/block/zram/zcomp.c
> @@ -76,7 +76,7 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
>   */
>  static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
>  {
> -	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
> +	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO);
>  	if (!zstrm)
>  		return NULL;
>  
> @@ -85,7 +85,7 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
>  	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
>  	 * case when compressed size is larger than the original one
>  	 */
> -	zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
> +	zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1);
>  	if (!zstrm->private || !zstrm->buffer) {
>  		zcomp_strm_free(comp, zstrm);
>  		zstrm = NULL;
> diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
> index 0cc4799..0bce010 100644
> --- a/drivers/block/zram/zcomp_lz4.c
> +++ b/drivers/block/zram/zcomp_lz4.c
> @@ -20,10 +20,13 @@ static void *zcomp_lz4_create(void)
>  	void *ret;
>  
>  	ret = kzalloc(LZ4_MEM_COMPRESS,
> -			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
> -	if (!ret)
> -		ret = vzalloc(LZ4_MEM_COMPRESS);
> -	return ret;
> +			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);
> +	if (ret)
> +		return ret;
> +
> +	return __vmalloc(LZ4_MEM_COMPRESS,
> +			GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_ZERO,
> +			PAGE_KERNEL);
>  }
>  
>  static void zcomp_lz4_destroy(void *private)
> @@ -42,6 +45,7 @@ static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
>  		unsigned char *dst)
>  {
>  	size_t dst_len = PAGE_SIZE;
> +
>  	/* return  : Success if return 0 */
>  	return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
>  }
> diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
> index 59b8aa4..e5db8de 100644
> --- a/drivers/block/zram/zcomp_lzo.c
> +++ b/drivers/block/zram/zcomp_lzo.c
> @@ -20,10 +20,13 @@ static void *lzo_create(void)
>  	void *ret;
>  
>  	ret = kzalloc(LZO1X_MEM_COMPRESS,
> -			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
> -	if (!ret)
> -		ret = vzalloc(LZO1X_MEM_COMPRESS);
> -	return ret;
> +			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);
> +	if (ret)
> +		return ret;
> +
> +	return __vmalloc(LZO1X_MEM_COMPRESS,
> +			GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_ZERO,
> +			PAGE_KERNEL);
>  }
>  
>  static void lzo_destroy(void *private)
> @@ -42,6 +45,7 @@ static int lzo_decompress(const unsigned char *src, size_t src_len,
>  		unsigned char *dst)
>  {
>  	size_t dst_len = PAGE_SIZE;
> +
>  	int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
>  	return ret == LZO_E_OK ? 0 : ret;
>  }
> -- 
> 2.6.2
>
Sergey Senozhatsky Nov. 24, 2015, 12:30 a.m. UTC | #3
On (11/23/15 15:18), Andrew Morton wrote:
[..]
> > --- a/drivers/block/zram/zcomp_lz4.c
> > +++ b/drivers/block/zram/zcomp_lz4.c
> > @@ -20,10 +20,13 @@ static void *zcomp_lz4_create(void)
> >  	void *ret;
> >  
> >  	ret = kzalloc(LZ4_MEM_COMPRESS,
> > -			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
> > -	if (!ret)
> > -		ret = vzalloc(LZ4_MEM_COMPRESS);
> > -	return ret;
> > +			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);
> 
> But here we've still lost __GFP_RECLAIM, unnecessarily.  And it's quite
> unclear why __GFP_NORETRY and __GFP_NOMEMALLOC are being used.

__GFP_NORETRY

we are guaranteed to have at least one compression stream, so sooner or
later every IO operation will be served. any IO that has failed in
zcomp_lz4_create() or zcomp_lzo_create() will simply wait for already
available compression stream to become idle. so this allocation is not
so dramatically important - we just increase the level of parallelism
(N idle streams let N IO operations to execute concurrently). apart from
that we are in a low memory condition (or whatever was the reason the
kernel failed to allocate LZ4_MEM_COMPRESS or LZO1X_MEM_COMPRESS) and
we can avoid pressuring the kernel furher.

for the same reason __GFP_NOMEMALLOC is used -- we don't want to waste
an emergency memory for compression streams.


I agree on __GFP_RECLAIM. Thanks.


> IOW, why not simply use (GFP_NOIO|__GFP_NOWARN)?

GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC ?

	-ss
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Andrew Morton Nov. 24, 2015, 12:47 a.m. UTC | #4
On Tue, 24 Nov 2015 09:30:27 +0900 Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com> wrote:

> On (11/23/15 15:18), Andrew Morton wrote:
> [..]
> > > --- a/drivers/block/zram/zcomp_lz4.c
> > > +++ b/drivers/block/zram/zcomp_lz4.c
> > > @@ -20,10 +20,13 @@ static void *zcomp_lz4_create(void)
> > >  	void *ret;
> > >  
> > >  	ret = kzalloc(LZ4_MEM_COMPRESS,
> > > -			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
> > > -	if (!ret)
> > > -		ret = vzalloc(LZ4_MEM_COMPRESS);
> > > -	return ret;
> > > +			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);
> > 
> > But here we've still lost __GFP_RECLAIM, unnecessarily.  And it's quite
> > unclear why __GFP_NORETRY and __GFP_NOMEMALLOC are being used.
> 
> __GFP_NORETRY
> 
> we are guaranteed to have at least one compression stream, so sooner or
> later every IO operation will be served. any IO that has failed in
> zcomp_lz4_create() or zcomp_lzo_create() will simply wait for already
> available compression stream to become idle. so this allocation is not
> so dramatically important - we just increase the level of parallelism
> (N idle streams let N IO operations to execute concurrently). apart from
> that we are in a low memory condition (or whatever was the reason the
> kernel failed to allocate LZ4_MEM_COMPRESS or LZO1X_MEM_COMPRESS) and
> we can avoid pressuring the kernel furher.
> 
> for the same reason __GFP_NOMEMALLOC is used -- we don't want to waste
> an emergency memory for compression streams.
> 

Doesn't make a lot of sense to me.  We use a weakened gfp for the
kmalloc and if that fails, fall into vmalloc() using the stronger gfp
anyway.

Perhaps it makes sense for higher-order allocations: we don't want to
thrash around trying to create an order-2 page - we'd prefer to give up
and fall into vmalloc to do a bunch of order-0 allocations.

But this argument holds for 1000 other kmalloc->vmalloc allocation
attempts - what's special about this one?

And whatever is the reason for this peculiar setup,

a) where's the proof that the change is actually beneficial?

b) let's get a good code comment in place so that future readers are not
   similarly puzzled.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Sergey Senozhatsky Nov. 24, 2015, 1:29 a.m. UTC | #5
Cc Kyeongdon

On (11/23/15 16:47), Andrew Morton wrote:
[..]
> 
> Doesn't make a lot of sense to me.  We use a weakened gfp for the
> kmalloc and if that fails, fall into vmalloc() using the stronger gfp
> anyway.

Sir, you are right. that was "fixed" in my patch (but I definitely should
have been more attentive when I reviewed Kyeongdon's patch and that was
a mistake to address this in my patch)

I didn't spot it until I replaced vzalloc() with __vmalloc() working on
my GFP_NOIO patch:

+       return __vmalloc(LZ4_MEM_COMPRESS,
+                       GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_ZERO,
+                       PAGE_KERNEL);


but I agree that we have created a mess already.

> Perhaps it makes sense for higher-order allocations: we don't want to
> thrash around trying to create an order-2 page - we'd prefer to give up
> and fall into vmalloc to do a bunch of order-0 allocations.
> 
> But this argument holds for 1000 other kmalloc->vmalloc allocation
> attempts - what's special about this one?
> 
> And whatever is the reason for this peculiar setup,
> 
> a) where's the proof that the change is actually beneficial?
> b) let's get a good code comment in place so that future readers are not
>    similarly puzzled.

or

c) start anew (hopefully Minchan and Kyeongdon are with me)


Per Kyeongdon's comment

:When we're using LZ4 multi compression streams for zram swap,
:we found out page allocation failure message in system running test.
:That was not only once, but a few(2 - 5 times per test).
:Also, some failure cases were continually occurring to try allocation
:order 3.
:
:In order to make parallel compression private data, we should call
:kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
: 2/3 size memory to allocate in that time, page allocation fails.
:This patch makes to use vmalloc() as fallback of kmalloc(), this
:prevents page alloc failure warning.


so (what I missed in the first place) is that the patch does not really
prevent page alloc failures warnings, because vmalloc() is still free to
warn us on every failed allocation. second, vmalloc() can fail and, thus,
we still will go down the 'do not attempt to allocate any memory anymore,
just wait for available stream to become idle'.


so my proposal

patch 1:
  a) switch to GFP_NOIO in critical parts (OR remove entirely remove the
  'dynamic' stream creation functionality. IOW, do not allocate compression
  streams from IO path, wait for and use available streams).

patch 2:
  b) do not fallback to vmalloc (we are prepared to handle kmalloc failures)
  c) add NOWARN to kmalloc (just to reduce the warning pressure)


well, (b) and (c), technically, can be merged with (a) but I have no
objections to have it as a separate patch.



what do you guys think?

	-ss
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Minchan Kim Nov. 24, 2015, 4:13 a.m. UTC | #6
Hi Sergey,

On Tue, Nov 24, 2015 at 10:29:27AM +0900, Sergey Senozhatsky wrote:
> Cc Kyeongdon
> 
> On (11/23/15 16:47), Andrew Morton wrote:
> [..]
> > 
> > Doesn't make a lot of sense to me.  We use a weakened gfp for the
> > kmalloc and if that fails, fall into vmalloc() using the stronger gfp
> > anyway.
> 
> Sir, you are right. that was "fixed" in my patch (but I definitely should
> have been more attentive when I reviewed Kyeongdon's patch and that was
> a mistake to address this in my patch)
> 
> I didn't spot it until I replaced vzalloc() with __vmalloc() working on
> my GFP_NOIO patch:
> 
> +       return __vmalloc(LZ4_MEM_COMPRESS,
> +                       GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_ZERO,
> +                       PAGE_KERNEL);
> 
> 
> but I agree that we have created a mess already.
> 
> > Perhaps it makes sense for higher-order allocations: we don't want to
> > thrash around trying to create an order-2 page - we'd prefer to give up
> > and fall into vmalloc to do a bunch of order-0 allocations.
> > 
> > But this argument holds for 1000 other kmalloc->vmalloc allocation
> > attempts - what's special about this one?
> > 
> > And whatever is the reason for this peculiar setup,
> > 
> > a) where's the proof that the change is actually beneficial?
> > b) let's get a good code comment in place so that future readers are not
> >    similarly puzzled.
> 
> or
> 
> c) start anew (hopefully Minchan and Kyeongdon are with me)
> 
> 
> Per Kyeongdon's comment
> 
> :When we're using LZ4 multi compression streams for zram swap,
> :we found out page allocation failure message in system running test.
> :That was not only once, but a few(2 - 5 times per test).
> :Also, some failure cases were continually occurring to try allocation
> :order 3.
> :
> :In order to make parallel compression private data, we should call
> :kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
> : 2/3 size memory to allocate in that time, page allocation fails.
> :This patch makes to use vmalloc() as fallback of kmalloc(), this
> :prevents page alloc failure warning.
> 
> 
> so (what I missed in the first place) is that the patch does not really
> prevent page alloc failures warnings, because vmalloc() is still free to
> warn us on every failed allocation. second, vmalloc() can fail and, thus,
> we still will go down the 'do not attempt to allocate any memory anymore,
> just wait for available stream to become idle'.
> 
> 
> so my proposal
> 
> patch 1:
>   a) switch to GFP_NOIO in critical parts (OR remove entirely remove the
>   'dynamic' stream creation functionality. IOW, do not allocate compression
>   streams from IO path, wait for and use available streams).
> 
> patch 2:
>   b) do not fallback to vmalloc (we are prepared to handle kmalloc failures)
>   c) add NOWARN to kmalloc (just to reduce the warning pressure)
> 
> 
> well, (b) and (c), technically, can be merged with (a) but I have no
> objections to have it as a separate patch.
> 
> 
> 
> what do you guys think?

First of all, Thanks for the summary and proposal.

I think GFP_NOIO critical part(ie, your lockdep fix patch) should
go to -stable so it should stand alone.

About vmalloc, I like that. Just problem was gfp and we can
pass it from upper layer so I believe it makes code looks clean
and solve differnt gfp problem.

Please look at my patchset I just sent.

Thanks a lot!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Sergey Senozhatsky Nov. 24, 2015, 4:41 a.m. UTC | #7
On (11/24/15 13:13), Minchan Kim wrote:
> First of all, Thanks for the summary and proposal.

sure :)

> I think GFP_NOIO critical part(ie, your lockdep fix patch) should
> go to -stable so it should stand alone.
> 
> About vmalloc, I like that. Just problem was gfp and we can
> pass it from upper layer so I believe it makes code looks clean
> and solve differnt gfp problem.

doing vmalloc() after kmalloc in general looks ok, but the thing is that
kmalloc()->vmalloc() fallback does not mean that stream allocation will
end up being successful, because right after ->private we need to allocate
->buffer via __get_free_pages() and that thing can fail. so trying harder
in comp->backend->create() is just half of what we need.

but the question is -- do we have a really big reason to fallback in
->private allocation? we are quite prepared to handle that allocation
failure and I tend to think that in low memory condition it's probably
better to avoid stealing pages for additional streams; one stream is
just enough, if we are lucky to have more than one stream by that time
-- then fine.

> Please look at my patchset I just sent.

I'll take a look once I receive them (not in my inbox yet).

	-ss
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Patch
diff mbox series

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 5cb13ca..c536177 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -76,7 +76,7 @@  static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
  */
 static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
 {
-	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
+	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO);
 	if (!zstrm)
 		return NULL;
 
@@ -85,7 +85,7 @@  static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
 	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
 	 * case when compressed size is larger than the original one
 	 */
-	zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1);
 	if (!zstrm->private || !zstrm->buffer) {
 		zcomp_strm_free(comp, zstrm);
 		zstrm = NULL;
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
index 0cc4799..0bce010 100644
--- a/drivers/block/zram/zcomp_lz4.c
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -20,10 +20,13 @@  static void *zcomp_lz4_create(void)
 	void *ret;
 
 	ret = kzalloc(LZ4_MEM_COMPRESS,
-			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
-	if (!ret)
-		ret = vzalloc(LZ4_MEM_COMPRESS);
-	return ret;
+			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);
+	if (ret)
+		return ret;
+
+	return __vmalloc(LZ4_MEM_COMPRESS,
+			GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
 }
 
 static void zcomp_lz4_destroy(void *private)
@@ -42,6 +45,7 @@  static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
 		unsigned char *dst)
 {
 	size_t dst_len = PAGE_SIZE;
+
 	/* return  : Success if return 0 */
 	return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
 }
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
index 59b8aa4..e5db8de 100644
--- a/drivers/block/zram/zcomp_lzo.c
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -20,10 +20,13 @@  static void *lzo_create(void)
 	void *ret;
 
 	ret = kzalloc(LZO1X_MEM_COMPRESS,
-			__GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
-	if (!ret)
-		ret = vzalloc(LZO1X_MEM_COMPRESS);
-	return ret;
+			__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC);
+	if (ret)
+		return ret;
+
+	return __vmalloc(LZO1X_MEM_COMPRESS,
+			GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
 }
 
 static void lzo_destroy(void *private)
@@ -42,6 +45,7 @@  static int lzo_decompress(const unsigned char *src, size_t src_len,
 		unsigned char *dst)
 {
 	size_t dst_len = PAGE_SIZE;
+
 	int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
 	return ret == LZO_E_OK ? 0 : ret;
 }