All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
@ 2018-05-12 14:18 Tetsuo Handa
  2018-05-15  9:16 ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-12 14:18 UTC (permalink / raw)
  To: mhocko, rientjes; +Cc: guro, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

OK. Since "mm, oom: fix concurrent munlock and oom reaper unmap, v3" went to
linux.git as 27ae357fa82be5ab, it is time to resume this patch. I do hope that
you don't ignore me again...

Here is the reproducer of OOM lockup.
Note that I'm not using hundreds of concurrent memory allocating threads.

------------------------------------------------------------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <sys/prctl.h>
#include <sys/time.h>
#include <sys/resource.h>

/*
 * OOM-lockup reproducer: one SCHED_IDLE thread triggers the OOM killer
 * while 32 normal-priority siblings on the same CPU spin in direct
 * reclaim, starving the oom_lock holder sleeping at
 * schedule_timeout_killable(1).  Intentionally omits error checking;
 * exact ordering/priorities matter for the race.
 */
int main(int argc, char *argv[])
{
	struct sched_param sp = { 0 };		/* priority 0 for SCHED_IDLE below */
	cpu_set_t cpu = { { 1 } };		/* affinity mask: CPU 0 only */
	static int pipe_fd[2] = { EOF, EOF };
	char *buf = NULL;
	unsigned long size = 0;
	unsigned int i;
	const int fd = open("/dev/zero", O_RDONLY);
	pipe(pipe_fd);
	/* Auto-reap children; the parent never wait()s. */
	signal(SIGCLD, SIG_IGN);
	/*
	 * "first-victim" inherits both pipe ends and just sleeps.  After the
	 * parent closes its own write end below, this child holds the last
	 * write end, so its death (by the OOM killer) EOFs the readers.
	 */
	if (fork() == 0) {
		prctl(PR_SET_NAME, (unsigned long) "first-victim", 0, 0, 0);
		while (1)
			pause();
	}
	close(pipe_fd[1]);
	/* Pin this process (and all children forked below) to one CPU. */
	sched_setaffinity(0, sizeof(cpu), &cpu);
	prctl(PR_SET_NAME, (unsigned long) "normal-priority", 0, 0, 0);
	/* 32 workers that start competing for CPU once the OOM kill begins. */
	for (i = 0; i < 32; i++)
		if (fork() == 0) {
			char c;
			buf = malloc(1048576);
			/* Wait until the first-victim is OOM-killed. */
			read(pipe_fd[0], &c, 1);
			/* Try to consume as much CPU time as possible. */
			read(fd, buf, 1048576);
			pause();
			_exit(0);
		}
	close(pipe_fd[0]);
	sleep(1);
	/* Grow buf by doubling up to 512GB until realloc fails (overcommit). */
	for (size = 1048576; size < 512UL * (1 << 30); size <<= 1) {
		char *cp = realloc(buf, size);
		if (!cp) {
			size >>= 1;
			break;
		}
		buf = cp;
	}
	/* Demote self to the lowest scheduling priority before faulting. */
	sched_setscheduler(0, SCHED_IDLE, &sp);
	setpriority(PRIO_PROCESS, 0, 19);
	prctl(PR_SET_NAME, (unsigned long) "idle-priority", 0, 0, 0);
	/* Touch every page of buf; page faults exceed RAM and invoke OOM. */
	while (size) {
		int ret = read(fd, buf, size); /* Will cause OOM due to overcommit */
		if (ret <= 0)
			break;
		buf += ret;
		size -= ret;
	}
	return 0; /* Not reached. */
}
------------------------------------------------------------

And the output is shown below.
(Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20180512.txt.xz and
kernel config is at http://I-love.SAKURA.ne.jp/tmp/config-4.17-rc4 .)

------------------------------------------------------------
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_COUNT=y
------------------------------------------------------------

------------------------------------------------------------
[  243.867497] idle-priority invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null), order=0, oom_score_adj=0
[  243.870958] idle-priority cpuset=/ mems_allowed=0
[  243.873757] CPU: 0 PID: 8151 Comm: idle-priority Kdump: loaded Not tainted 4.17.0-rc4+ #400
[  243.876647] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
[  243.879890] Call Trace:
[  243.881396]  dump_stack+0x5e/0x8b
[  243.883068]  dump_header+0x6f/0x454
[  243.884778]  ? _raw_spin_unlock_irqrestore+0x2d/0x50
[  243.886770]  ? trace_hardirqs_on_caller+0xed/0x1a0
[  243.888952]  oom_kill_process+0x223/0x6a0
[  243.890942]  ? out_of_memory+0x26f/0x550
[  243.892909]  out_of_memory+0x120/0x550
[  243.894692]  ? out_of_memory+0x1f7/0x550
[  243.896535]  __alloc_pages_nodemask+0xc98/0xdd0
[  243.898465]  alloc_pages_vma+0x6e/0x1a0
[  243.900170]  __handle_mm_fault+0xe27/0x1380
[  243.902152]  handle_mm_fault+0x1b7/0x370
[  243.904047]  ? handle_mm_fault+0x41/0x370
[  243.905792]  __do_page_fault+0x1e9/0x510
[  243.907513]  do_page_fault+0x1b/0x60
[  243.909105]  ? page_fault+0x8/0x30
[  243.910777]  page_fault+0x1e/0x30
[  243.912331] RIP: 0010:__clear_user+0x38/0x60
[  243.913957] RSP: 0018:ffffc90001ebfdd8 EFLAGS: 00010202
[  243.915761] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 0000000000000002
[  243.917941] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 00007f5984db9000
[  243.920078] RBP: 00007f5984db8010 R08: 0000000000000000 R09: 0000000000000000
[  243.922276] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc90001ebfe68
[  243.924366] R13: 0000000053969000 R14: 0000000000001000 R15: 0000000000000000
[  243.926556]  ? __clear_user+0x19/0x60
[  243.927942]  iov_iter_zero+0x77/0x360
[  243.929437]  read_iter_zero+0x32/0xa0
[  243.930793]  __vfs_read+0xc0/0x120
[  243.932052]  vfs_read+0x94/0x140
[  243.933293]  ksys_read+0x40/0xa0
[  243.934453]  ? do_syscall_64+0x17/0x1f0
[  243.935773]  do_syscall_64+0x4f/0x1f0
[  243.937034]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  243.939054] RIP: 0033:0x7f5ab134bc70
[  243.940471] RSP: 002b:00007ffc78de8548 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  243.943037] RAX: ffffffffffffffda RBX: 0000000080001000 RCX: 00007f5ab134bc70
[  243.945421] RDX: 0000000080001000 RSI: 00007f593144f010 RDI: 0000000000000003
[  243.947500] RBP: 00007f593144f010 R08: 0000000000000000 R09: 0000000000021000
[  243.949567] R10: 00007ffc78de7fa0 R11: 0000000000000246 R12: 0000000000000003
[  243.951747] R13: 00007f58b1450010 R14: 0000000000000006 R15: 0000000000000000
[  243.953949] Mem-Info:
[  243.955039] active_anon:877880 inactive_anon:2117 isolated_anon:0
[  243.955039]  active_file:17 inactive_file:19 isolated_file:0
[  243.955039]  unevictable:0 dirty:0 writeback:0 unstable:0
[  243.955039]  slab_reclaimable:3696 slab_unreclaimable:14669
[  243.955039]  mapped:892 shmem:2199 pagetables:3619 bounce:0
[  243.955039]  free:21271 free_pcp:70 free_cma:0
[  243.964871] Node 0 active_anon:3511520kB inactive_anon:8468kB active_file:68kB inactive_file:76kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3568kB dirty:0kB writeback:0kB shmem:8796kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 3284992kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
[  243.971819] Node 0 DMA free:14804kB min:284kB low:352kB high:420kB active_anon:1064kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15988kB managed:15904kB mlocked:0kB kernel_stack:0kB pagetables:4kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[  243.979206] lowmem_reserve[]: 0 2683 3633 3633
[  243.980835] Node 0 DMA32 free:53012kB min:49696kB low:62120kB high:74544kB active_anon:2693220kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3129216kB managed:2748024kB mlocked:0kB kernel_stack:16kB pagetables:204kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[  243.987955] lowmem_reserve[]: 0 0 950 950
[  243.989471] Node 0 Normal free:17268kB min:17596kB low:21992kB high:26388kB active_anon:817296kB inactive_anon:8468kB active_file:68kB inactive_file:76kB unevictable:0kB writepending:0kB present:1048576kB managed:972972kB mlocked:0kB kernel_stack:4096kB pagetables:14268kB bounce:0kB free_pcp:280kB local_pcp:120kB free_cma:0kB
[  243.998191] lowmem_reserve[]: 0 0 0 0
[  243.999773] Node 0 DMA: 1*4kB (U) 2*8kB (UM) 2*16kB (UM) 1*32kB (U) 2*64kB (UM) 2*128kB (UM) 2*256kB (UM) 1*512kB (M) 1*1024kB (U) 0*2048kB 3*4096kB (M) = 14804kB
[  244.004513] Node 0 DMA32: 9*4kB (UM) 12*8kB (U) 17*16kB (UME) 14*32kB (UE) 9*64kB (UE) 7*128kB (UME) 8*256kB (UME) 9*512kB (UME) 7*1024kB (UME) 2*2048kB (ME) 8*4096kB (UM) = 53012kB
[  244.009711] Node 0 Normal: 181*4kB (UM) 7*8kB (UM) 55*16kB (UME) 95*32kB (UME) 33*64kB (UME) 12*128kB (UE) 3*256kB (UE) 0*512kB 8*1024kB (UM) 0*2048kB 0*4096kB = 17308kB
[  244.014831] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[  244.017675] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  244.020366] 2245 total pagecache pages
[  244.022137] 0 pages in swap cache
[  244.023758] Swap cache stats: add 0, delete 0, find 0/0
[  244.026300] Free swap  = 0kB
[  244.029023] Total swap = 0kB
[  244.030598] 1048445 pages RAM
[  244.032541] 0 pages HighMem/MovableOnly
[  244.034382] 114220 pages reserved
[  244.036039] 0 pages hwpoisoned
[  244.038042] Out of memory: Kill process 8151 (idle-priority) score 929 or sacrifice child
[  244.041499] Killed process 8157 (normal-priority) total-vm:5248kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  302.561100] INFO: task oom_reaper:40 blocked for more than 30 seconds.
[  302.563687]       Not tainted 4.17.0-rc4+ #400
[  302.565635] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  302.568355] oom_reaper      D14408    40      2 0x80000000
[  302.570616] Call Trace:
[  302.572154]  ? __schedule+0x227/0x780
[  302.573923]  ? __mutex_lock+0x289/0x8d0
[  302.575725]  schedule+0x34/0x80
[  302.577381]  schedule_preempt_disabled+0xc/0x20
[  302.579334]  __mutex_lock+0x28e/0x8d0
[  302.581136]  ? __mutex_lock+0xb6/0x8d0
[  302.582929]  ? find_held_lock+0x2d/0x90
[  302.584809]  ? oom_reaper+0x9f/0x270
[  302.586534]  oom_reaper+0x9f/0x270
[  302.588214]  ? wait_woken+0x90/0x90
[  302.589909]  kthread+0xf6/0x130
[  302.591585]  ? __oom_reap_task_mm+0x90/0x90
[  302.593430]  ? kthread_create_on_node+0x40/0x40
[  302.595341]  ret_from_fork+0x24/0x30
[  302.597127] INFO: task normal-priority:8157 blocked for more than 30 seconds.
[  302.599634]       Not tainted 4.17.0-rc4+ #400
[  302.601492] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  302.604047] normal-priority D13752  8157   8151 0x80100084
[  302.606052] Call Trace:
[  302.607385]  ? __schedule+0x227/0x780
[  302.608951]  ? __mutex_lock+0x289/0x8d0
[  302.610533]  schedule+0x34/0x80
[  302.611932]  schedule_preempt_disabled+0xc/0x20
[  302.613647]  __mutex_lock+0x28e/0x8d0
[  302.615144]  ? __mutex_lock+0xb6/0x8d0
[  302.616637]  ? __lock_acquire+0x22a/0x1830
[  302.618183]  ? exit_mmap+0x126/0x160
[  302.619591]  exit_mmap+0x126/0x160
[  302.620917]  ? do_exit+0x261/0xb80
[  302.622213]  ? find_held_lock+0x2d/0x90
[  302.623581]  mmput+0x63/0x130
[  302.624757]  do_exit+0x297/0xb80
[  302.625984]  do_group_exit+0x41/0xc0
[  302.627281]  get_signal+0x22a/0x810
[  302.628546]  do_signal+0x1e/0x600
[  302.629792]  exit_to_usermode_loop+0x34/0x6c
[  302.631302]  ? page_fault+0x8/0x30
[  302.632650]  prepare_exit_to_usermode+0xd4/0xe0
[  302.634163]  retint_user+0x8/0x18
[  302.635432] RIP: 0033:0x7f5ab134bc70
[  302.636725] RSP: 002b:00007ffc78de8548 EFLAGS: 00010246
[  302.638378] RAX: 0000000000000000 RBX: 00007f5ab1736010 RCX: 00007f5ab134bc70
[  302.640435] RDX: 0000000000000001 RSI: 00007ffc78de855f RDI: 0000000000000004
[  302.642487] RBP: 0000000000000000 R08: ffffffffffffffff R09: 0000000000100000
[  302.644531] R10: 00007ffc78de7fa0 R11: 0000000000000246 R12: 0000000000000003
[  302.646556] R13: 00007ffc78de86f0 R14: 0000000000000000 R15: 0000000000000000
[  302.648593] 
[  302.648593] Showing all locks held in the system:
[  302.650828] 2 locks held by kworker/0:1/37:
[  302.652320]  #0: 00000000528edd68 ((wq_completion)"events_freezable_power_efficient"){+.+.}, at: process_one_work+0x13c/0x380
[  302.655297]  #1: 00000000b1d2489c ((work_completion)(&(&ev->dwork)->work)){+.+.}, at: process_one_work+0x13c/0x380
[  302.658072] 1 lock held by khungtaskd/39:
[  302.659459]  #0: 00000000bfc6260d (tasklist_lock){.+.+}, at: debug_show_all_locks+0x39/0x1b0
[  302.661844] 1 lock held by oom_reaper/40:
[  302.663369]  #0: 000000005eee3cbe (oom_lock){+.+.}, at: oom_reaper+0x9f/0x270
[  302.665725] 2 locks held by agetty/3801:
[  302.667189]  #0: 00000000c0409157 (&tty->ldisc_sem){++++}, at: tty_ldisc_ref_wait+0x1f/0x50
[  302.669604]  #1: 000000008d7198da (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0xc0/0x8a0
[  302.672054] 2 locks held by smbd-notifyd/3898:
[  302.673621]  #0: 00000000c0fc1118 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.675991]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.678482] 2 locks held by cleanupd/3899:
[  302.679976]  #0: 0000000073a8b85a (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.682363]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.684866] 1 lock held by normal-priority/8157:
[  302.686517]  #0: 000000005eee3cbe (oom_lock){+.+.}, at: exit_mmap+0x126/0x160
[  302.688718] 2 locks held by normal-priority/8161:
[  302.690368]  #0: 000000007b02f050 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.692779]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.695312] 2 locks held by normal-priority/8162:
[  302.696985]  #0: 00000000cdede75e (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.699427]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.702004] 2 locks held by normal-priority/8165:
[  302.703721]  #0: 00000000cf5d7878 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.706198]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.708788] 2 locks held by normal-priority/8166:
[  302.710531]  #0: 00000000069df873 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.713031]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.715646] 2 locks held by normal-priority/8169:
[  302.717416]  #0: 00000000d218c9a8 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.719950]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.722641] 2 locks held by normal-priority/8170:
[  302.724434]  #0: 00000000a5a3283b (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.726964]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.729618] 2 locks held by normal-priority/8176:
[  302.731468]  #0: 0000000036591c0b (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.734075]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.736790] 2 locks held by normal-priority/8181:
[  302.738656]  #0: 0000000017fa21f0 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.741282]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.745176] 2 locks held by normal-priority/8182:
[  302.747556]  #0: 0000000048a6d0b7 (&mm->mmap_sem){++++}, at: __do_page_fault+0x133/0x510
[  302.750392]  #1: 00000000ebb6be0a (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0xae/0xc0
[  302.754698] 
[  302.755948] =============================================
(...snipped...)
[  399.139454] idle-priority   R  running task    12264  8151   4971 0x00000080
[  399.141499] Call Trace:
[  399.142539]  ? __schedule+0x227/0x780
[  399.143831]  schedule+0x34/0x80
[  399.144998]  schedule_timeout+0x196/0x390
[  399.146372]  ? collect_expired_timers+0xb0/0xb0
[  399.147933]  out_of_memory+0x12a/0x550
[  399.149230]  ? out_of_memory+0x1f7/0x550
[  399.150563]  __alloc_pages_nodemask+0xc98/0xdd0
[  399.152034]  alloc_pages_vma+0x6e/0x1a0
[  399.153350]  __handle_mm_fault+0xe27/0x1380
[  399.154735]  handle_mm_fault+0x1b7/0x370
[  399.156064]  ? handle_mm_fault+0x41/0x370
[  399.157406]  __do_page_fault+0x1e9/0x510
[  399.158740]  do_page_fault+0x1b/0x60
[  399.159985]  ? page_fault+0x8/0x30
[  399.161183]  page_fault+0x1e/0x30
[  399.162354] RIP: 0010:__clear_user+0x38/0x60
[  399.163814] RSP: 0018:ffffc90001ebfdd8 EFLAGS: 00010202
[  399.165399] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 0000000000000002
[  399.167389] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 00007f5984db9000
[  399.169369] RBP: 00007f5984db8010 R08: 0000000000000000 R09: 0000000000000000
[  399.171361] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc90001ebfe68
[  399.173358] R13: 0000000053969000 R14: 0000000000001000 R15: 0000000000000000
[  399.175353]  ? __clear_user+0x19/0x60
[  399.176616]  iov_iter_zero+0x77/0x360
[  399.177871]  read_iter_zero+0x32/0xa0
[  399.179131]  __vfs_read+0xc0/0x120
[  399.180377]  vfs_read+0x94/0x140
[  399.181549]  ksys_read+0x40/0xa0
[  399.182723]  ? do_syscall_64+0x17/0x1f0
[  399.184025]  do_syscall_64+0x4f/0x1f0
[  399.185291]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...snipped...)
[  481.033433] idle-priority   R  running task    12264  8151   4971 0x00000080
[  481.033433] Call Trace:
[  481.033433]  ? __schedule+0x227/0x780
[  481.033433]  schedule+0x34/0x80
[  481.033433]  schedule_timeout+0x196/0x390
[  481.033433]  ? collect_expired_timers+0xb0/0xb0
[  481.033433]  out_of_memory+0x12a/0x550
[  481.033433]  ? out_of_memory+0x1f7/0x550
[  481.033433]  __alloc_pages_nodemask+0xc98/0xdd0
[  481.033433]  alloc_pages_vma+0x6e/0x1a0
[  481.033433]  __handle_mm_fault+0xe27/0x1380
[  481.033433]  handle_mm_fault+0x1b7/0x370
[  481.033433]  ? handle_mm_fault+0x41/0x370
[  481.033433]  __do_page_fault+0x1e9/0x510
[  481.033433]  do_page_fault+0x1b/0x60
[  481.033433]  ? page_fault+0x8/0x30
[  481.033433]  page_fault+0x1e/0x30
[  481.033433] RIP: 0010:__clear_user+0x38/0x60
[  481.033433] RSP: 0018:ffffc90001ebfdd8 EFLAGS: 00010202
[  481.033433] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 0000000000000002
[  481.033433] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 00007f5984db9000
[  481.033433] RBP: 00007f5984db8010 R08: 0000000000000000 R09: 0000000000000000
[  481.033433] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc90001ebfe68
[  481.033433] R13: 0000000053969000 R14: 0000000000001000 R15: 0000000000000000
[  481.033433]  ? __clear_user+0x19/0x60
[  481.033433]  iov_iter_zero+0x77/0x360
[  481.033433]  read_iter_zero+0x32/0xa0
[  481.033433]  __vfs_read+0xc0/0x120
[  481.033433]  vfs_read+0x94/0x140
[  481.033433]  ksys_read+0x40/0xa0
[  481.033433]  ? do_syscall_64+0x17/0x1f0
[  481.033433]  do_syscall_64+0x4f/0x1f0
[  481.033433]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
------------------------------------------------------------

Once a thread which called out_of_memory() started sleeping at schedule_timeout_killable(1)
with oom_lock held, 32 concurrent direct reclaiming threads on the same CPU are sufficient
to trigger the OOM lockup. With below patch applied, every trial completes within 5 seconds.



>From 4b356c742a3f1b720d5b709792fe68b25d800902 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sat, 12 May 2018 12:27:52 +0900
Subject: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.

When I was examining a bug which occurs under CPU + memory pressure, I
observed that a thread which called out_of_memory() can sleep for minutes
at schedule_timeout_killable(1) with oom_lock held when many threads are
doing direct reclaim.

The whole point of the sleep is to give the OOM victim some time to exit.
But since commit 27ae357fa82be5ab ("mm, oom: fix concurrent munlock and
oom reaper unmap, v3") changed the OOM victim to wait for oom_lock in order
to close race window at exit_mmap(), the whole point of this sleep is lost
now. We need to make sure that the thread which called out_of_memory() will
release oom_lock shortly. Therefore, this patch brings the sleep to outside
of the OOM path.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 mm/oom_kill.c   | 38 +++++++++++++++++---------------------
 mm/page_alloc.c |  7 ++++++-
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8ba6cb8..23ce67f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -479,6 +479,21 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
+/*
+ * We have to make sure not to cause premature new oom victim selection.
+ *
+ * __alloc_pages_may_oom()     oom_reap_task_mm()/exit_mmap()
+ *   mutex_trylock(&oom_lock)
+ *   get_page_from_freelist(ALLOC_WMARK_HIGH) # fails
+ *                               unmap_page_range() # frees some memory
+ *                               set_bit(MMF_OOM_SKIP)
+ *   out_of_memory()
+ *     select_bad_process()
+ *       test_bit(MMF_OOM_SKIP) # selects new oom victim
+ *   mutex_unlock(&oom_lock)
+ *
+ * Therefore, the callers hold oom_lock when calling this function.
+ */
 void __oom_reap_task_mm(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -523,20 +538,6 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	bool ret = true;
 
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
 	mutex_lock(&oom_lock);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
@@ -1077,15 +1078,9 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen && oc->chosen != (void *)-1UL) {
+	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
-		/*
-		 * Give the killed process a good chance to exit before trying
-		 * to allocate memory again.
-		 */
-		schedule_timeout_killable(1);
-	}
 	return !!oc->chosen;
 }
 
@@ -1111,4 +1106,5 @@ void pagefault_out_of_memory(void)
 		return;
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
+	schedule_timeout_killable(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 905db9d..458ed32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3478,7 +3478,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	 */
 	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
-		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
@@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	/* Retry as long as the OOM killer is making progress */
 	if (did_some_progress) {
 		no_progress_loops = 0;
+		/*
+		 * This schedule_timeout_*() serves as a guaranteed sleep for
+		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
+		 */
+		if (!tsk_is_oom_victim(current))
+			schedule_timeout_uninterruptible(1);
 		goto retry;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-12 14:18 [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held Tetsuo Handa
@ 2018-05-15  9:16 ` Michal Hocko
  2018-05-18 10:14   ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-15  9:16 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: rientjes, guro, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Sat 12-05-18 23:18:24, Tetsuo Handa wrote:
[...]
> @@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
>  	/* Retry as long as the OOM killer is making progress */
>  	if (did_some_progress) {
>  		no_progress_loops = 0;
> +		/*
> +		 * This schedule_timeout_*() serves as a guaranteed sleep for
> +		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> +		 */
> +		if (!tsk_is_oom_victim(current))
> +			schedule_timeout_uninterruptible(1);
>  		goto retry;

We already do have that sleep for PF_WQ_WORKER in should_reclaim_retry.
Why do we need it here as well?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-15  9:16 ` Michal Hocko
@ 2018-05-18 10:14   ` Tetsuo Handa
  2018-05-18 12:20     ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-18 10:14 UTC (permalink / raw)
  To: mhocko; +Cc: rientjes, guro, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Sat 12-05-18 23:18:24, Tetsuo Handa wrote:
> [...]
> > @@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> >  	/* Retry as long as the OOM killer is making progress */
> >  	if (did_some_progress) {
> >  		no_progress_loops = 0;
> > +		/*
> > +		 * This schedule_timeout_*() serves as a guaranteed sleep for
> > +		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > +		 */
> > +		if (!tsk_is_oom_victim(current))
> > +			schedule_timeout_uninterruptible(1);
> >  		goto retry;
> 
> We already do have that sleep for PF_WQ_WORKER in should_reclaim_retry.
> Why do we need it here as well?

Because that path depends on __zone_watermark_ok() == true which is not
guaranteed to be executed.

I consider that this "goto retry;" is a good location for making a short sleep.
Current code is so conditional that there are cases which needlessly retry
without sleeping (e.g. current thread finds an OOM victim at select_bad_process()
and immediately retries allocation attempt rather than giving the OOM victim
CPU resource for releasing memory) or needlessly sleep (e.g. current thread
was selected as an OOM victim but mutex_trylock(&oom_lock) in
__alloc_pages_may_oom() failed).

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-18 10:14   ` Tetsuo Handa
@ 2018-05-18 12:20     ` Michal Hocko
  2018-05-20 15:56       ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-18 12:20 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: rientjes, guro, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Fri 18-05-18 19:14:12, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Sat 12-05-18 23:18:24, Tetsuo Handa wrote:
> > [...]
> > > @@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > >  	/* Retry as long as the OOM killer is making progress */
> > >  	if (did_some_progress) {
> > >  		no_progress_loops = 0;
> > > +		/*
> > > +		 * This schedule_timeout_*() serves as a guaranteed sleep for
> > > +		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > > +		 */
> > > +		if (!tsk_is_oom_victim(current))
> > > +			schedule_timeout_uninterruptible(1);
> > >  		goto retry;
> > 
> > We already do have that sleep for PF_WQ_WORKER in should_reclaim_retry.
> > Why do we need it here as well?
> 
> Because that path depends on __zone_watermark_ok() == true which is not
> guaranteed to be executed.

Is there any reason we cannot do the special cased sleep for
PF_WQ_WORKER in should_reclaim_retry? The current code is complex enough
to make it even more so. If we need a hack for PF_WQ_WORKER case then we
definitely want to have a single place to do so.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-18 12:20     ` Michal Hocko
@ 2018-05-20 15:56       ` Tetsuo Handa
  2018-05-22  6:18         ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-20 15:56 UTC (permalink / raw)
  To: mhocko; +Cc: rientjes, guro, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Fri 18-05-18 19:14:12, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > On Sat 12-05-18 23:18:24, Tetsuo Handa wrote:
> > > [...]
> > > > @@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > > >  	/* Retry as long as the OOM killer is making progress */
> > > >  	if (did_some_progress) {
> > > >  		no_progress_loops = 0;
> > > > +		/*
> > > > +		 * This schedule_timeout_*() serves as a guaranteed sleep for
> > > > +		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > > > +		 */
> > > > +		if (!tsk_is_oom_victim(current))
> > > > +			schedule_timeout_uninterruptible(1);
> > > >  		goto retry;
> > > 
> > > We already do have that sleep for PF_WQ_WORKER in should_reclaim_retry.
> > > Why do we need it here as well?
> > 
> > Because that path depends on __zone_watermark_ok() == true which is not
> > guaranteed to be executed.
> 
> Is there any reason we cannot do the special cased sleep for
> PF_WQ_WORKER in should_reclaim_retry? The current code is complex enough
> to make it even more so. If we need a hack for PF_WQ_WORKER case then we
> definitely want to have a single place to do so.

I don't understand why you are talking about PF_WQ_WORKER case.

This sleep is not only for PF_WQ_WORKER case but also !PF_KTHREAD case.
I added this comment because you suggested simply removing any sleep which
waits for the OOM victim.

Making special cased sleep for PF_WQ_WORKER in should_reclaim_retry() cannot
become a reason to block this patch. You can propose it after this patch is
applied. This patch is for mitigating lockup problem caused by forever holding
oom_lock.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-20 15:56       ` Tetsuo Handa
@ 2018-05-22  6:18         ` Michal Hocko
  2018-05-23 10:24           ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-22  6:18 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: rientjes, guro, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Mon 21-05-18 00:56:05, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 18-05-18 19:14:12, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > On Sat 12-05-18 23:18:24, Tetsuo Handa wrote:
> > > > [...]
> > > > > @@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > > > >  	/* Retry as long as the OOM killer is making progress */
> > > > >  	if (did_some_progress) {
> > > > >  		no_progress_loops = 0;
> > > > > +		/*
> > > > > +		 * This schedule_timeout_*() serves as a guaranteed sleep for
> > > > > +		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > > > > +		 */
> > > > > +		if (!tsk_is_oom_victim(current))
> > > > > +			schedule_timeout_uninterruptible(1);
> > > > >  		goto retry;
> > > > 
> > > > We already do have that sleep for PF_WQ_WORKER in should_reclaim_retry.
> > > > Why do we need it here as well?
> > > 
> > > Because that path depends on __zone_watermark_ok() == true which is not
> > > guaranteed to be executed.
> > 
> > Is there any reason we cannot do the special cased sleep for
> > PF_WQ_WORKER in should_reclaim_retry? The current code is complex enough
> > to make it even more so. If we need a hack for PF_WQ_WORKER case then we
> > definitely want to have a single place to do so.
> 
> I don't understand why you are talking about PF_WQ_WORKER case.

Because that seems to be the reason to have it there as per your
comment.

> This sleep is not only for PF_WQ_WORKER case but also !PF_KTHREAD case.
> I added this comment because you suggested simply removing any sleep which
> waits for the OOM victim.

And now you have made the comment misleading and I suspect it is just
not really needed as well.

> Making special cased sleep for PF_WQ_WORKER in should_reclaim_retry() cannot
> become a reason to block this patch. You can propose it after this patch is
> applied. This patch is for mitigating lockup problem caused by forever holding
> oom_lock.

You are fiddling with other code paths at the same time so I _do_ care.
Spilling random code without a proper explanation is just not going to
fly.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-22  6:18         ` Michal Hocko
@ 2018-05-23 10:24           ` Tetsuo Handa
  2018-05-23 11:57             ` Michal Hocko
  2018-05-29  7:17             ` Michal Hocko
  0 siblings, 2 replies; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-23 10:24 UTC (permalink / raw)
  To: mhocko, guro; +Cc: rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> > I don't understand why you are talking about PF_WQ_WORKER case.
> 
> Because that seems to be the reason to have it there as per your
> comment.

OK. Then, I will fold below change into my patch.

        if (did_some_progress) {
                no_progress_loops = 0;
 +              /*
-+               * This schedule_timeout_*() serves as a guaranteed sleep for
-+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
++               * Try to give the OOM killer/reaper/victims some time for
++               * releasing memory.
 +               */
 +              if (!tsk_is_oom_victim(current))
 +                      schedule_timeout_uninterruptible(1);

But Roman, my patch conflicts with your "mm, oom: cgroup-aware OOM killer" patch
in linux-next. And it seems to me that your patch contains a bug which leads to
premature memory allocation failure explained below.

@@ -1029,6 +1050,7 @@ bool out_of_memory(struct oom_control *oc)
 {
        unsigned long freed = 0;
        enum oom_constraint constraint = CONSTRAINT_NONE;
+       bool delay = false; /* if set, delay next allocation attempt */

        if (oom_killer_disabled)
                return false;
@@ -1073,27 +1095,39 @@ bool out_of_memory(struct oom_control *oc)
            current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
                get_task_struct(current);
-               oc->chosen = current;
+               oc->chosen_task = current;
                oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
                return true;
        }

+       if (mem_cgroup_select_oom_victim(oc)) {

/* mem_cgroup_select_oom_victim() returns true if select_victim_memcg() made
   oc->chosen_memcg != NULL.
   select_victim_memcg() makes oc->chosen_memcg = INFLIGHT_VICTIM if there is
   inflight memcg. But oc->chosen_task remains NULL because it did not call
   oom_evaluate_task(), didn't it? (And if it called oom_evaluate_task(),
   put_task_struct() is missing here.) */

+               if (oom_kill_memcg_victim(oc)) {

/* oom_kill_memcg_victim() returns true if oc->chosen_memcg == INFLIGHT_VICTIM. */

+                       delay = true;
+                       goto out;
+               }
+       }
+
        select_bad_process(oc);
        /* Found nothing?!?! Either we hang forever, or we panic. */
-       if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+       if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
                dump_header(oc, NULL);
                panic("Out of memory and no killable processes...\n");
        }
-       if (oc->chosen && oc->chosen != (void *)-1UL) {
+       if (oc->chosen_task && oc->chosen_task != (void *)-1UL) {
                oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
                                 "Memory cgroup out of memory");
-               /*
-                * Give the killed process a good chance to exit before trying
-                * to allocate memory again.
-                */
-               schedule_timeout_killable(1);
+               delay = true;
        }
-       return !!oc->chosen;
+
+out:
+       /*
+        * Give the killed process a good chance to exit before trying
+        * to allocate memory again.
+        */
+       if (delay)
+               schedule_timeout_killable(1);
+

/* out_of_memory() returns false because oc->chosen_task remains NULL. */

+       return !!oc->chosen_task;
 }

Can we apply my patch prior to your "mm, oom: cgroup-aware OOM killer" patch
(which eliminates "delay" and "out:" from your patch) so that people can easily
backport my patch? Or, do you want to apply a fix (which eliminates "delay" and
"out:" from linux-next) prior to my patch?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-23 10:24           ` Tetsuo Handa
@ 2018-05-23 11:57             ` Michal Hocko
  2018-05-23 13:45               ` Tetsuo Handa
  2018-05-29  7:17             ` Michal Hocko
  1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-23 11:57 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Wed 23-05-18 19:24:48, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > > I don't understand why you are talking about PF_WQ_WORKER case.
> > 
> > Because that seems to be the reason to have it there as per your
> > comment.
> 
> OK. Then, I will fold below change into my patch.
> 
>         if (did_some_progress) {
>                 no_progress_loops = 0;
>  +              /*
> -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> ++               * Try to give the OOM killer/reaper/victims some time for
> ++               * releasing memory.
>  +               */
>  +              if (!tsk_is_oom_victim(current))
>  +                      schedule_timeout_uninterruptible(1);

Do you really need this? You are still fiddling with this path at all? I
see how removing the timeout might be reasonable after recent changes
but why do you insist on adding it outside of the lock.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-23 11:57             ` Michal Hocko
@ 2018-05-23 13:45               ` Tetsuo Handa
  2018-05-23 14:56                 ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-23 13:45 UTC (permalink / raw)
  To: mhocko; +Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Wed 23-05-18 19:24:48, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > > I don't understand why you are talking about PF_WQ_WORKER case.
> > > 
> > > Because that seems to be the reason to have it there as per your
> > > comment.
> > 
> > OK. Then, I will fold below change into my patch.
> > 
> >         if (did_some_progress) {
> >                 no_progress_loops = 0;
> >  +              /*
> > -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> > -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > ++               * Try to give the OOM killer/reaper/victims some time for
> > ++               * releasing memory.
> >  +               */
> >  +              if (!tsk_is_oom_victim(current))
> >  +                      schedule_timeout_uninterruptible(1);
> 
> Do you really need this? You are still fiddling with this path at all? I
> see how removing the timeout might be reasonable after recent changes
> but why do you insist in adding it outside of the lock.

Sigh... We can't remove this sleep without further changes. That's why I added

 * This schedule_timeout_*() serves as a guaranteed sleep for
 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.

so that we won't by error remove this sleep without further changes.

This sleep is not only for waiting for OOM victims. Any thread who is holding
oom_lock needs CPU resources in order to make forward progress.

If oom_notify_list callbacks are registered, this sleep helps the owner of
oom_lock to reclaim memory by processing the callbacks.

If oom_notify_list callbacks did not release memory, this sleep still helps
the owner of oom_lock to check whether there is inflight OOM victims.

If there is no inflight OOM victims, this sleep still helps the owner of
oom_lock to select a new OOM victim and call printk().

If there are already inflight OOM victims, this sleep still helps the OOM
reaper and the OOM victims to release memory.

Printing messages to consoles and reclaiming memory need CPU resources.
More reliable way is to use mutex_lock_killable(&oom_lock) instead of
mutex_trylock(&oom_lock) in __alloc_pages_may_oom(), but I'm giving way
for now. There is no valid reason for removing this sleep now.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-23 13:45               ` Tetsuo Handa
@ 2018-05-23 14:56                 ` Michal Hocko
  2018-05-24 10:51                   ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-23 14:56 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Wed 23-05-18 22:45:20, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Wed 23-05-18 19:24:48, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > > I don't understand why you are talking about PF_WQ_WORKER case.
> > > > 
> > > > Because that seems to be the reason to have it there as per your
> > > > comment.
> > > 
> > > OK. Then, I will fold below change into my patch.
> > > 
> > >         if (did_some_progress) {
> > >                 no_progress_loops = 0;
> > >  +              /*
> > > -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> > > -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > > ++               * Try to give the OOM killer/reaper/victims some time for
> > > ++               * releasing memory.
> > >  +               */
> > >  +              if (!tsk_is_oom_victim(current))
> > >  +                      schedule_timeout_uninterruptible(1);
> > 
> > Do you really need this? You are still fiddling with this path at all? I
> > see how removing the timeout might be reasonable after recent changes
> > but why do you insist in adding it outside of the lock.
> 
> Sigh... We can't remove this sleep without further changes. That's why I added
> 
>  * This schedule_timeout_*() serves as a guaranteed sleep for
>  * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> 
> so that we won't by error remove this sleep without further changes.

Look. I am fed up with this discussion. You are fiddling with the code
and moving hacks around with a lot of hand waving. Rather than trying to
look at the underlying problem. Your patch completely ignores PREEMPT as
I've mentioned in previous versions.

I do admit that the underlying problem is non-trivial to handle and it
requires a deeper consideration. Fair enough. You can spend that time on
the matter and come up with something clever. That would be great. But
moving a sleep around because of some yada yada yada is not a way we
want to treat this code.

I would be OK with removing the sleep from the out_of_memory path based
on your argumentation that we have a _proper_ synchronization with the
exit path now. That would be a patch that has actually a solid
background behind. Is it possible that something would wait longer or
wouldn't preempt etc.? Yes possible but those need to be analyzed and
thought through properly. See the difference from "we may need it because
we've always been doing that and there is here and there that might
happen". This cargo cult way of programming will only grow more and more
hacks nobody can reason about long term.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-23 14:56                 ` Michal Hocko
@ 2018-05-24 10:51                   ` Tetsuo Handa
  2018-05-24 11:50                     ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-24 10:51 UTC (permalink / raw)
  To: mhocko; +Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> Look. I am fed up with this discussion. You are fiddling with the code
> and moving hacks around with a lot of hand waving. Rather than trying to
> look at the underlying problem. Your patch completely ignores PREEMPT as
> I've mentioned in previous versions.

I'm not ignoring PREEMPT. To fix this OOM lockup problem properly, as much
effort as fixing Spectre/Meltdown problems will be required. This patch is
a mitigation for regression introduced by fixing CVE-2018-1000200. Nothing
is good with deferring this patch.

> I would be OK with removing the sleep from the out_of_memory path based
> on your argumentation that we have a _proper_ synchronization with the
> exit path now.

Such attempt should be made in a separate patch.

You suggested removing this sleep from my patch without realizing that
we need explicit schedule_timeout_*() for PF_WQ_WORKER threads. My patch
is trying to be as conservative/safe as possible (for easier backport)
while reducing the risk of falling into OOM lockup.

I worry that you are completely overlooking

                char *fmt, ...)
 	 */
 	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
-		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 

part in this patch.

Currently, the short sleep is so random/inconsistent that
schedule_timeout_uninterruptible(1) is called when we failed to grab
oom_lock (even if current thread was already marked as an OOM victim),
schedule_timeout_killable(1) is called when we killed a new OOM victim,
and no sleep at all if we found that there are inflight OOM victims.

This patch centralized the location to call
schedule_timeout_uninterruptible(1) to "goto retry;" path so that
current thread surely yields CPU resource to the owner of oom_lock.

You are free to propose removing this centralized sleep after my change
is applied. Of course, you are responsible for convincing that removing
this centralized sleep (unless PF_WQ_WORKER threads) does not negatively
affect the owner of oom_lock (e.g. a SCHED_IDLE thread who is holding
oom_lock gets blocked longer than mine).

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-24 10:51                   ` Tetsuo Handa
@ 2018-05-24 11:50                     ` Michal Hocko
  2018-05-25  1:17                       ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-24 11:50 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Thu 24-05-18 19:51:24, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > Look. I am fed up with this discussion. You are fiddling with the code
> > and moving hacks around with a lot of hand waving. Rather than trying to
> > look at the underlying problem. Your patch completely ignores PREEMPT as
> > I've mentioned in previous versions.
> 
> I'm not ignoring PREEMPT. To fix this OOM lockup problem properly, as much
> efforts as fixing Spectre/Meltdown problems will be required. This patch is
> a mitigation for regression introduced by fixing CVE-2018-1000200. Nothing
> is good with deferring this patch.
> 
> > I would be OK with removing the sleep from the out_of_memory path based
> > on your argumentation that we have a _proper_ synchronization with the
> > exit path now.
> 
> Such attempt should be made in a separate patch.
> 
> You suggested removing this sleep from my patch without realizing that
> we need explicit schedule_timeout_*() for PF_WQ_WORKER threads.

And that sleep is in should_reclaim_retry. If that is not sufficient it
should be addressed rather than spilling more of that crud all over the
place.

> My patch
> is trying to be as conservative/safe as possible (for easier backport)
> while reducing the risk of falling into OOM lockup.

And it adds more crud
 
> I worry that you are completely overlooking
> 
>                 char *fmt, ...)
>  	 */
>  	if (!mutex_trylock(&oom_lock)) {
>  		*did_some_progress = 1;
> -		schedule_timeout_uninterruptible(1);
>  		return NULL;
>  	}
>  
> 
> part in this patch.

I am not. But it doesn't really make much sense to convince you if you
are not reading what I am writing. I am done with this thread.
Discussion is just a waste of time.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-24 11:50                     ` Michal Hocko
@ 2018-05-25  1:17                       ` Tetsuo Handa
  2018-05-25  8:31                         ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-25  1:17 UTC (permalink / raw)
  To: Michal Hocko
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Thu 24-05-18 19:51:24, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > Look. I am fed up with this discussion. You are fiddling with the code
> > > and moving hacks around with a lot of hand waving. Rather than trying to
> > > look at the underlying problem. Your patch completely ignores PREEMPT as
> > > I've mentioned in previous versions.
> > 
> > I'm not ignoring PREEMPT. To fix this OOM lockup problem properly, as much
> > efforts as fixing Spectre/Meltdown problems will be required. This patch is
> > a mitigation for regression introduced by fixing CVE-2018-1000200. Nothing
> > is good with deferring this patch.
> > 
> > > I would be OK with removing the sleep from the out_of_memory path based
> > > on your argumentation that we have a _proper_ synchronization with the
> > > exit path now.
> > 
> > Such attempt should be made in a separate patch.
> > 
> > You suggested removing this sleep from my patch without realizing that
> > we need explicit schedule_timeout_*() for PF_WQ_WORKER threads.
> 
> And that sleep is in should_reclaim_retry. If that is not sufficient it
> should be addressed rather than spilling more of that crud all over the
> place.

Then, please show me (by writing a patch yourself) how to tell whether
we should sleep there. What I can come up is shown below.

-@@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
-       /* Retry as long as the OOM killer is making progress */
-       if (did_some_progress) {
-               no_progress_loops = 0;
-+              /*
-+               * This schedule_timeout_*() serves as a guaranteed sleep for
-+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
-+               */
-+              if (!tsk_is_oom_victim(current))
-+                      schedule_timeout_uninterruptible(1);
-               goto retry;
-       }
+@@ -3927,6 +3926,14 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+               (*no_progress_loops)++;

+       /*
++       * We do a short sleep here if the OOM killer/reaper/victims are
++       * holding oom_lock, in order to try to give them some CPU resources
++       * for releasing memory.
++       */
++      if (mutex_is_locked(&oom_lock) && !tsk_is_oom_victim(current))
++              schedule_timeout_uninterruptible(1);
++
++      /*
+        * Make sure we converge to OOM if we cannot make any progress
+        * several times in the row.
+        */

As far as I know, whether a domain which the current thread belongs to is
already OOM is not known as of should_reclaim_retry(). Therefore, sleeping
there can become a pointless delay if the domain which the current thread
belongs to and the domain which the owner of oom_lock (it can be a random
thread inside out_of_memory() or exit_mmap()) belongs to differs.

But you insist sleeping there means that you don't care about such
pointless delay?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-25  1:17                       ` Tetsuo Handa
@ 2018-05-25  8:31                         ` Michal Hocko
  2018-05-25 10:57                           ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-25  8:31 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Fri 25-05-18 10:17:42, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Thu 24-05-18 19:51:24, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > Look. I am fed up with this discussion. You are fiddling with the code
> > > > and moving hacks around with a lot of hand waving. Rather than trying to
> > > > look at the underlying problem. Your patch completely ignores PREEMPT as
> > > > I've mentioned in previous versions.
> > > 
> > > I'm not ignoring PREEMPT. To fix this OOM lockup problem properly, as much
> > > efforts as fixing Spectre/Meltdown problems will be required. This patch is
> > > a mitigation for regression introduced by fixing CVE-2018-1000200. Nothing
> > > is good with deferring this patch.
> > > 
> > > > I would be OK with removing the sleep from the out_of_memory path based
> > > > on your argumentation that we have a _proper_ synchronization with the
> > > > exit path now.
> > > 
> > > Such attempt should be made in a separate patch.
> > > 
> > > You suggested removing this sleep from my patch without realizing that
> > > we need explicit schedule_timeout_*() for PF_WQ_WORKER threads.
> > 
> > And that sleep is in should_reclaim_retry. If that is not sufficient it
> > should be addressed rather than spilling more of that crud all over the
> > place.
> 
> Then, please show me (by writing a patch yourself) how to tell whether
> we should sleep there. What I can come up is shown below.
> 
> -@@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> -       /* Retry as long as the OOM killer is making progress */
> -       if (did_some_progress) {
> -               no_progress_loops = 0;
> -+              /*
> -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> -+               */
> -+              if (!tsk_is_oom_victim(current))
> -+                      schedule_timeout_uninterruptible(1);
> -               goto retry;
> -       }
> +@@ -3927,6 +3926,14 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> +               (*no_progress_loops)++;
> 
> +       /*
> ++       * We do a short sleep here if the OOM killer/reaper/victims are
> ++       * holding oom_lock, in order to try to give them some CPU resources
> ++       * for releasing memory.
> ++       */
> ++      if (mutex_is_locked(&oom_lock) && !tsk_is_oom_victim(current))
> ++              schedule_timeout_uninterruptible(1);
> ++
> ++      /*
> +        * Make sure we converge to OOM if we cannot make any progress
> +        * several times in the row.
> +        */
> 
> As far as I know, whether a domain which the current thread belongs to is
> already OOM is not known as of should_reclaim_retry(). Therefore, sleeping
> there can become a pointless delay if the domain which the current thread
> belongs to and the domain which the owner of oom_lock (it can be a random
> thread inside out_of_memory() or exit_mmap()) belongs to differs.
> 
> But you insist sleeping there means that you don't care about such
> pointless delay?

What is wrong with the following? should_reclaim_retry should be a
natural reschedule point. PF_WQ_WORKER is a special case which needs a
stronger rescheduling policy. Doing that unconditionally seems more
straightforward than depending on a zone being a good candidate for a
further reclaim.

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c6f4008ea55..b01b19d3d596 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3925,6 +3925,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 {
 	struct zone *zone;
 	struct zoneref *z;
+	bool ret = false;
 
 	/*
 	 * Costly allocations might have made a progress but this doesn't mean
@@ -3988,25 +3989,26 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 				}
 			}
 
-			/*
-			 * Memory allocation/reclaim might be called from a WQ
-			 * context and the current implementation of the WQ
-			 * concurrency control doesn't recognize that
-			 * a particular WQ is congested if the worker thread is
-			 * looping without ever sleeping. Therefore we have to
-			 * do a short sleep here rather than calling
-			 * cond_resched().
-			 */
-			if (current->flags & PF_WQ_WORKER)
-				schedule_timeout_uninterruptible(1);
-			else
-				cond_resched();
-
-			return true;
+			ret = true;
+			goto out;
 		}
 	}
 
-	return false;
+out:
+	/*
+	 * Memory allocation/reclaim might be called from a WQ
+	 * context and the current implementation of the WQ
+	 * concurrency control doesn't recognize that
+	 * a particular WQ is congested if the worker thread is
+	 * looping without ever sleeping. Therefore we have to
+	 * do a short sleep here rather than calling
+	 * cond_resched().
+	 */
+	if (current->flags & PF_WQ_WORKER)
+		schedule_timeout_uninterruptible(1);
+	else
+		cond_resched();
+	return ret;
 }
 
 static inline bool
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-25  8:31                         ` Michal Hocko
@ 2018-05-25 10:57                           ` Tetsuo Handa
  2018-05-25 11:42                             ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-25 10:57 UTC (permalink / raw)
  To: mhocko; +Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Fri 25-05-18 10:17:42, Tetsuo Handa wrote:
> > Then, please show me (by writing a patch yourself) how to tell whether
> > we should sleep there. What I can come up is shown below.
> > 
> > -@@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > -       /* Retry as long as the OOM killer is making progress */
> > -       if (did_some_progress) {
> > -               no_progress_loops = 0;
> > -+              /*
> > -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> > -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > -+               */
> > -+              if (!tsk_is_oom_victim(current))
> > -+                      schedule_timeout_uninterruptible(1);
> > -               goto retry;
> > -       }
> > +@@ -3927,6 +3926,14 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > +               (*no_progress_loops)++;
> > 
> > +       /*
> > ++       * We do a short sleep here if the OOM killer/reaper/victims are
> > ++       * holding oom_lock, in order to try to give them some CPU resources
> > ++       * for releasing memory.
> > ++       */
> > ++      if (mutex_is_locked(&oom_lock) && !tsk_is_oom_victim(current))
> > ++              schedule_timeout_uninterruptible(1);
> > ++
> > ++      /*
> > +        * Make sure we converge to OOM if we cannot make any progress
> > +        * several times in the row.
> > +        */
> > 
> > As far as I know, whether a domain which the current thread belongs to is
> > already OOM is not known as of should_reclaim_retry(). Therefore, sleeping
> > there can become a pointless delay if the domain which the current thread
> > belongs to and the domain which the owner of oom_lock (it can be a random
> > thread inside out_of_memory() or exit_mmap()) belongs to differs.
> > 
> > But you insist sleeping there means that you don't care about such
> > pointless delay?
> 
> What is wrong with the following? should_reclaim_retry should be a
> natural reschedule point. PF_WQ_WORKER is a special case which needs a
> stronger rescheduling policy. Doing that unconditionally seems more
> straightforward than depending on a zone being a good candidate for a
> further reclaim.

Where is schedule_timeout_uninterruptible(1) for !PF_KTHREAD threads?
My concern is that cond_resched() might be a too short sleep to give
enough CPU resources to the owner of the oom_lock.

#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif

#ifndef CONFIG_PREEMPT
int __sched _cond_resched(void)
{
	if (should_resched(0)) {
		preempt_schedule_common();
		return 1;
	}
	rcu_all_qs();
	return 0;
}
EXPORT_SYMBOL(_cond_resched);
#endif

#define cond_resched() ({                       \
        ___might_sleep(__FILE__, __LINE__, 0);  \
        _cond_resched();                        \
})

How do you prove that cond_resched() is an appropriate replacement for
schedule_timeout_killable(1) in out_of_memory() and
schedule_timeout_uninterruptible(1) in __alloc_pages_may_oom() ?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-25 10:57                           ` Tetsuo Handa
@ 2018-05-25 11:42                             ` Michal Hocko
  2018-05-25 11:46                               ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-25 11:42 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Fri 25-05-18 19:57:32, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 25-05-18 10:17:42, Tetsuo Handa wrote:
> > > Then, please show me (by writing a patch yourself) how to tell whether
> > > we should sleep there. What I can come up is shown below.
> > > 
> > > -@@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > > -       /* Retry as long as the OOM killer is making progress */
> > > -       if (did_some_progress) {
> > > -               no_progress_loops = 0;
> > > -+              /*
> > > -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> > > -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> > > -+               */
> > > -+              if (!tsk_is_oom_victim(current))
> > > -+                      schedule_timeout_uninterruptible(1);
> > > -               goto retry;
> > > -       }
> > > +@@ -3927,6 +3926,14 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > > +               (*no_progress_loops)++;
> > > 
> > > +       /*
> > > ++       * We do a short sleep here if the OOM killer/reaper/victims are
> > > ++       * holding oom_lock, in order to try to give them some CPU resources
> > > ++       * for releasing memory.
> > > ++       */
> > > ++      if (mutex_is_locked(&oom_lock) && !tsk_is_oom_victim(current))
> > > ++              schedule_timeout_uninterruptible(1);
> > > ++
> > > ++      /*
> > > +        * Make sure we converge to OOM if we cannot make any progress
> > > +        * several times in the row.
> > > +        */
> > > 
> > > As far as I know, whether a domain which the current thread belongs to is
> > > already OOM is not known as of should_reclaim_retry(). Therefore, sleeping
> > > there can become a pointless delay if the domain which the current thread
> > > belongs to and the domain which the owner of oom_lock (it can be a random
> > > thread inside out_of_memory() or exit_mmap()) belongs to differs.
> > > 
> > > But you insist sleeping there means that you don't care about such
> > > pointless delay?
> > 
> > What is wrong with the following? should_reclaim_retry should be a
> > natural reschedule point. PF_WQ_WORKER is a special case which needs a
> > stronger rescheduling policy. Doing that unconditionally seems more
> > straightforward than depending on a zone being a good candidate for a
> > further reclaim.
> 
> Where is schedule_timeout_uninterruptible(1) for !PF_KTHREAD threads?

Re-read what I've said.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-25 11:42                             ` Michal Hocko
@ 2018-05-25 11:46                               ` Tetsuo Handa
  2018-05-28 12:43                                 ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-25 11:46 UTC (permalink / raw)
  To: mhocko; +Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Fri 25-05-18 19:57:32, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > What is wrong with the following? should_reclaim_retry should be a
> > > natural reschedule point. PF_WQ_WORKER is a special case which needs a
> > > stronger rescheduling policy. Doing that unconditionally seems more
> > > straightforward than depending on a zone being a good candidate for a
> > > further reclaim.
> > 
> > Where is schedule_timeout_uninterruptible(1) for !PF_KTHREAD threads?
> 
> Re-read what I've said.

Please show me as a complete patch. Then, I will test your patch.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-25 11:46                               ` Tetsuo Handa
@ 2018-05-28 12:43                                 ` Michal Hocko
  2018-05-28 20:57                                   ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-28 12:43 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Fri 25-05-18 20:46:21, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 25-05-18 19:57:32, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > What is wrong with the following? should_reclaim_retry should be a
> > > > natural reschedule point. PF_WQ_WORKER is a special case which needs a
> > > > stronger rescheduling policy. Doing that unconditionally seems more
> > > > straightforward than depending on a zone being a good candidate for a
> > > > further reclaim.
> > > 
> > > Where is schedule_timeout_uninterruptible(1) for !PF_KTHREAD threads?
> > 
> > Re-read what I've said.
> 
> Please show me as a complete patch. Then, I will test your patch.

So how about we start as simple as the following? If we _really_ need to
touch should_reclaim_retry then it should be done in a separate patch
with some numbers/tracing data backing that story.
---

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-28 12:43                                 ` Michal Hocko
@ 2018-05-28 20:57                                   ` Tetsuo Handa
  2018-05-29  7:17                                     ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-28 20:57 UTC (permalink / raw)
  To: mhocko; +Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

Michal Hocko wrote:
> On Fri 25-05-18 20:46:21, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > On Fri 25-05-18 19:57:32, Tetsuo Handa wrote:
> > > > Michal Hocko wrote:
> > > > > What is wrong with the following? should_reclaim_retry should be a
> > > > > natural reschedule point. PF_WQ_WORKER is a special case which needs a
> > > > > stronger rescheduling policy. Doing that unconditionally seems more
> > > > > straightforward than depending on a zone being a good candidate for a
> > > > > further reclaim.
> > > > 
> > > > Where is schedule_timeout_uninterruptible(1) for !PF_KTHREAD threads?
> > > 
> > > Re-read what I've said.
> > 
> > Please show me as a complete patch. Then, I will test your patch.
> 
> So how about we start as simple as the following? If we _really_ need to
> touch should_reclaim_retry then it should be done in a separate patch
> with some numbers/tracing data backing that story.

This patch is incorrect that it ignores the bug in Roman's
"mm, oom: cgroup-aware OOM killer" patch in linux-next. I suggest applying
this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-23 10:24           ` Tetsuo Handa
  2018-05-23 11:57             ` Michal Hocko
@ 2018-05-29  7:17             ` Michal Hocko
  2018-05-29  8:16               ` Michal Hocko
  1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-29  7:17 UTC (permalink / raw)
  To: guro, Tetsuo Handa
  Cc: rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Wed 23-05-18 19:24:48, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > > I don't understand why you are talking about PF_WQ_WORKER case.
> > 
> > Because that seems to be the reason to have it there as per your
> > comment.
> 
> OK. Then, I will fold below change into my patch.
> 
>         if (did_some_progress) {
>                 no_progress_loops = 0;
>  +              /*
> -+               * This schedule_timeout_*() serves as a guaranteed sleep for
> -+               * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
> ++               * Try to give the OOM killer/reaper/victims some time for
> ++               * releasing memory.
>  +               */
>  +              if (!tsk_is_oom_victim(current))
>  +                      schedule_timeout_uninterruptible(1);
> 
> But Roman, my patch conflicts with your "mm, oom: cgroup-aware OOM killer" patch
> in linux-next. And it seems to me that your patch contains a bug which leads to
> premature memory allocation failure explained below.
> 
> @@ -1029,6 +1050,7 @@ bool out_of_memory(struct oom_control *oc)
>  {
>         unsigned long freed = 0;
>         enum oom_constraint constraint = CONSTRAINT_NONE;
> +       bool delay = false; /* if set, delay next allocation attempt */
> 
>         if (oom_killer_disabled)
>                 return false;
> @@ -1073,27 +1095,39 @@ bool out_of_memory(struct oom_control *oc)
>             current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
>             current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
>                 get_task_struct(current);
> -               oc->chosen = current;
> +               oc->chosen_task = current;
>                 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
>                 return true;
>         }
> 
> +       if (mem_cgroup_select_oom_victim(oc)) {
> 
> /* mem_cgroup_select_oom_victim() returns true if select_victim_memcg() made
>    oc->chosen_memcg != NULL.
>    select_victim_memcg() makes oc->chosen_memcg = INFLIGHT_VICTIM if there is
>    inflight memcg. But oc->chosen_task remains NULL because it did not call
>    oom_evaluate_task(), didn't it? (And if it called oom_evaluate_task(),
>    put_task_struct() is missing here.) */
> 
> +               if (oom_kill_memcg_victim(oc)) {
> 
> /* oom_kill_memcg_victim() returns true if oc->chosen_memcg == INFLIGHT_VICTIM. */
> 
> +                       delay = true;
> +                       goto out;
> +               }
> +       }
> +
>         select_bad_process(oc);
>         /* Found nothing?!?! Either we hang forever, or we panic. */
> -       if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> +       if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
>                 dump_header(oc, NULL);
>                 panic("Out of memory and no killable processes...\n");
>         }
> -       if (oc->chosen && oc->chosen != (void *)-1UL) {
> +       if (oc->chosen_task && oc->chosen_task != (void *)-1UL) {
>                 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
>                                  "Memory cgroup out of memory");
> -               /*
> -                * Give the killed process a good chance to exit before trying
> -                * to allocate memory again.
> -                */
> -               schedule_timeout_killable(1);
> +               delay = true;
>         }
> -       return !!oc->chosen;
> +
> +out:
> +       /*
> +        * Give the killed process a good chance to exit before trying
> +        * to allocate memory again.
> +        */
> +       if (delay)
> +               schedule_timeout_killable(1);
> +
> 
> /* out_of_memory() returns false because oc->chosen_task remains NULL. */
> 
> +       return !!oc->chosen_task;
>  }
> 

What about this fix Roman?
---
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 565e7da55318..fc06af041447 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1174,7 +1174,7 @@ bool out_of_memory(struct oom_control *oc)
 	if (delay)
 		schedule_timeout_killable(1);
 
-	return !!oc->chosen_task;
+	return !!(oc->chosen_task | oc->chosen_memcg);
 }
 
 /*
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-28 20:57                                   ` Tetsuo Handa
@ 2018-05-29  7:17                                     ` Michal Hocko
  2018-05-29 23:07                                       ` Andrew Morton
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-29  7:17 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Tue 29-05-18 05:57:16, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 25-05-18 20:46:21, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > On Fri 25-05-18 19:57:32, Tetsuo Handa wrote:
> > > > > Michal Hocko wrote:
> > > > > > What is wrong with the following? should_reclaim_retry should be a
> > > > > > natural reschedule point. PF_WQ_WORKER is a special case which needs a
> > > > > > stronger rescheduling policy. Doing that unconditionally seems more
> > > > > > straightforward than depending on a zone being a good candidate for a
> > > > > > further reclaim.
> > > > > 
> > > > > Where is schedule_timeout_uninterruptible(1) for !PF_KTHREAD threads?
> > > > 
> > > > Re-read what I've said.
> > > 
> > > Please show me as a complete patch. Then, I will test your patch.
> > 
> > So how about we start as simple as the following? If we _really_ need to
> > touch should_reclaim_retry then it should be done in a separate patch
> > with some numbers/tracing data backing that story.
> 
> This patch is incorrect that it ignores the bug in Roman's
> "mm, oom: cgroup-aware OOM killer" patch in linux-next.

I've expected Roman to comment on that. The fix should be trivial. But
does that prevent from further testing of this patch? Are you actually
using cgroup OOM killer? If not the bug should be a non-issue, right?

> I suggest applying
> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.

Well, I hope the whole pile gets merged in the upcoming merge window
rather than stall even more. This patch however can wait some more.
There is no hurry to merge it right away.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29  7:17             ` Michal Hocko
@ 2018-05-29  8:16               ` Michal Hocko
  2018-05-29 14:33                 ` Tetsuo Handa
  2018-05-31 16:31                 ` [PATCH] mm, memcg, oom: fix pre-mature allocation failures kbuild test robot
  0 siblings, 2 replies; 51+ messages in thread
From: Michal Hocko @ 2018-05-29  8:16 UTC (permalink / raw)
  To: guro, Tetsuo Handa
  Cc: rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

With the full changelog. This can be either folded into the respective
patch or applied on top.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29  8:16               ` Michal Hocko
@ 2018-05-29 14:33                 ` Tetsuo Handa
  2018-05-29 17:18                   ` Michal Hocko
  2018-05-31 16:31                 ` [PATCH] mm, memcg, oom: fix pre-mature allocation failures kbuild test robot
  1 sibling, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-29 14:33 UTC (permalink / raw)
  To: Michal Hocko, guro
  Cc: rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On 2018/05/29 17:16, Michal Hocko wrote:
> With the full changelog. This can be either folded into the respective
> patch or applied on top.
> 
>>From 0bd619e7a68337c97bdaed288e813e96a14ba339 Mon Sep 17 00:00:00 2001
> From: Michal Hocko <mhocko@suse.com>
> Date: Tue, 29 May 2018 10:09:33 +0200
> Subject: [PATCH] mm, memcg, oom: fix pre-mature allocation failures
> 
> Tetsuo has noticed that "mm, oom: cgroup-aware OOM killer" can lead to a
> pre-mature allocation failure if the cgroup aware oom killer is enabled
> and select_victim_memcg doesn't pick up any memcg to kill because there
> is a memcg already being killed. oc->chosen_memcg will become INFLIGHT_VICTIM
> and oom_kill_memcg_victim will bail out early. oc->chosen_task will
> stay NULL, however, and out_of_memory will therefore return false which
> forces __alloc_pages_may_oom to not set did_some_progress and the page
> allocator backs out and fails the allocation.
> U
> Fix this by checking both chosen_task and chosen_memcg in out_of_memory
> and return false only when _both_ are NULL.

I don't like this patch. It is not easy to understand and is fragile to
future changes. Currently the only case !!oc->chosen can become false is that
there was no eligible tasks when SysRq-f was requested or memcg OOM occurred.

	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {

With this patch applied, what happens if
mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc) forgot to set
oc->chosen_memcg to NULL and called select_bad_process(oc) and reached

        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {

but did not trigger panic() because of is_sysrq_oom(oc) || is_memcg_oom(oc)
and reached the last "!!(oc->chosen_task | oc->chosen_memcg)" line?
It will by error return "true" when no eligible tasks found...

Don't make return conditions complicated.
The appropriate fix is to kill "delay" and "goto out;" now! My patch does it!!

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29 14:33                 ` Tetsuo Handa
@ 2018-05-29 17:18                   ` Michal Hocko
  2018-05-29 17:28                     ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-29 17:18 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Tue 29-05-18 23:33:13, Tetsuo Handa wrote:
> On 2018/05/29 17:16, Michal Hocko wrote:
> > With the full changelog. This can be either folded into the respective
> > patch or applied on top.
> > 
> >>From 0bd619e7a68337c97bdaed288e813e96a14ba339 Mon Sep 17 00:00:00 2001
> > From: Michal Hocko <mhocko@suse.com>
> > Date: Tue, 29 May 2018 10:09:33 +0200
> > Subject: [PATCH] mm, memcg, oom: fix pre-mature allocation failures
> > 
> > Tetsuo has noticed that "mm, oom: cgroup-aware OOM killer" can lead to a
> > pre-mature allocation failure if the cgroup aware oom killer is enabled
> > and select_victim_memcg doesn't pick up any memcg to kill because there
> > is a memcg already being killed. oc->chosen_memcg will become INFLIGHT_VICTIM
> > and oom_kill_memcg_victim will bail out early. oc->chosen_task will
> > stay NULL, however, and out_of_memory will therefore return false which
> > forces __alloc_pages_may_oom to not set did_some_progress and the page
> > allocator backs out and fails the allocation.
> > U
> > Fix this by checking both chosen_task and chosen_memcg in out_of_memory
> > and return false only when _both_ are NULL.
> 
> I don't like this patch. It is not easy to understand and is fragile to
> future changes. Currently the only case !!oc->chosen can become false is that
> there was no eligible tasks when SysRq-f was requested or memcg OOM occurred.

Well, the current contract is not easy unfortunately. We have two
different modes of operation. We are either killing whole cgroups or a
task from a cgroup. In any case, the contract says that if we have any
killable entity then at least one of chosen* is set to INFLIGHT_VICTIM.
Other than that one of them has to be !NULL or we have no eligible
killable entity. The return value reflects all these cases.

> 	/* Found nothing?!?! Either we hang forever, or we panic. */
> 	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> 
> With this patch applied, what happens if
> mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc) forgot to set
> oc->chosen_memcg to NULL and called select_bad_process(oc) and reached

Well, this would be a clear bug and breaking the expected contract. I do
not see such a bug. Do you?

>         /* Found nothing?!?! Either we hang forever, or we panic. */
>         if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> 
> but did not trigger panic() because of is_sysrq_oom(oc) || is_memcg_oom(oc)
> and reached the last "!!(oc->chosen_task | oc->chosen_memcg)" line?
> It will by error return "true" when no eligible tasks found...

What are you talking about?

> Don't make return conditions complicated.
> The appropriate fix is to kill "delay" and "goto out;" now! My patch does it!!

I will leave the decision about which fix to take to Roman.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29 17:18                   ` Michal Hocko
@ 2018-05-29 17:28                     ` Michal Hocko
  0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2018-05-29 17:28 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, akpm, torvalds

On Tue 29-05-18 19:18:33, Michal Hocko wrote:
> On Tue 29-05-18 23:33:13, Tetsuo Handa wrote:
> > On 2018/05/29 17:16, Michal Hocko wrote:
> > > With the full changelog. This can be either folded into the respective
> > > patch or applied on top.
> > > 
> > >>From 0bd619e7a68337c97bdaed288e813e96a14ba339 Mon Sep 17 00:00:00 2001
> > > From: Michal Hocko <mhocko@suse.com>
> > > Date: Tue, 29 May 2018 10:09:33 +0200
> > > Subject: [PATCH] mm, memcg, oom: fix pre-mature allocation failures
> > > 
> > > Tetsuo has noticed that "mm, oom: cgroup-aware OOM killer" can lead to a
> > > pre-mature allocation failure if the cgroup aware oom killer is enabled
> > > and select_victim_memcg doesn't pick up any memcg to kill because there
> > > is a memcg already being killed. oc->chosen_memcg will become INFLIGHT_VICTIM
> > > and oom_kill_memcg_victim will bail out early. oc->chosen_task will
> > > stay NULL, however, and out_of_memory will therefore return false which
> > > forces __alloc_pages_may_oom to not set did_some_progress and the page
> > > allocator backs out and fails the allocation.
> > > U
> > > Fix this by checking both chosen_task and chosen_memcg in out_of_memory
> > > and return false only when _both_ are NULL.
> > 
> > I don't like this patch. It is not easy to understand and is fragile to
> > future changes. Currently the only case !!oc->chosen can become false is that
> > there was no eligible tasks when SysRq-f was requested or memcg OOM occurred.
> 
> Well, the current contract is not easy unfortunatelly. We have two
> different modes of operation. We are either killing whole cgroups or a
> task from a cgroup. In any case, the contract says that if we have any
> killable entity then at least one of chosen* is set to INFLIGHT_VICTIM.
> Other than that one of them has to be !NULL or we have no eligible
> killable entity. The return value reflects all these cases.

Btw. if your concern is the readability then we can add a helper and
describe all the above in the comment.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29  7:17                                     ` Michal Hocko
@ 2018-05-29 23:07                                       ` Andrew Morton
  2018-05-31 10:10                                         ` Tetsuo Handa
  2018-06-01 15:28                                         ` Michal Hocko
  0 siblings, 2 replies; 51+ messages in thread
From: Andrew Morton @ 2018-05-29 23:07 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tetsuo Handa, guro, rientjes, hannes, vdavydov.dev, tj, linux-mm,
	torvalds

On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:

> > I suggest applying
> > this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> 
> Well, I hope the whole pile gets merged in the upcoming merge window
> rather than stall even more.

I'm more inclined to drop it all.  David has identified significant
shortcomings and I'm not seeing a way of addressing those shortcomings
in a backward-compatible fashion.  Therefore there is no way forward
at present.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29 23:07                                       ` Andrew Morton
@ 2018-05-31 10:10                                         ` Tetsuo Handa
  2018-05-31 10:44                                           ` Michal Hocko
  2018-06-01 15:28                                         ` Michal Hocko
  1 sibling, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-31 10:10 UTC (permalink / raw)
  To: Andrew Morton, Michal Hocko, torvalds
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm

On 2018/05/30 8:07, Andrew Morton wrote:
> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> 
>>> I suggest applying
>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
>>
>> Well, I hope the whole pile gets merged in the upcoming merge window
>> rather than stall even more.
> 
> I'm more inclined to drop it all.  David has identified significant
> shortcomings and I'm not seeing a way of addressing those shortcomings
> in a backward-compatible fashion.  Therefore there is no way forward
> at present.
> 

Can we apply my patch as-is first? My patch mitigates lockup regression
which should be able to be easily backported to stable kernels. We can
later evaluate whether moving the short sleep to should_reclaim_retry()
has negative impact.

Also we can eliminate the short sleep in Roman's patch before deciding
whether we can merge Roman's patchset in the upcoming merge window.



>From 4b356c742a3f1b720d5b709792fe68b25d800902 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sat, 12 May 2018 12:27:52 +0900
Subject: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.

When I was examining a bug which occurs under CPU + memory pressure, I
observed that a thread which called out_of_memory() can sleep for minutes
at schedule_timeout_killable(1) with oom_lock held when many threads are
doing direct reclaim.

The whole point of the sleep is give the OOM victim some time to exit.
But since commit 27ae357fa82be5ab ("mm, oom: fix concurrent munlock and
oom reaper unmap, v3") changed the OOM victim to wait for oom_lock in order
to close race window at exit_mmap(), the whole point of this sleep is lost
now. We need to make sure that the thread which called out_of_memory() will
release oom_lock shortly. Therefore, this patch brings the sleep to outside
of the OOM path.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 mm/oom_kill.c   | 38 +++++++++++++++++---------------------
 mm/page_alloc.c |  7 ++++++-
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8ba6cb8..23ce67f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -479,6 +479,21 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
+/*
+ * We have to make sure not to cause premature new oom victim selection.
+ *
+ * __alloc_pages_may_oom()     oom_reap_task_mm()/exit_mmap()
+ *   mutex_trylock(&oom_lock)
+ *   get_page_from_freelist(ALLOC_WMARK_HIGH) # fails
+ *                               unmap_page_range() # frees some memory
+ *                               set_bit(MMF_OOM_SKIP)
+ *   out_of_memory()
+ *     select_bad_process()
+ *       test_bit(MMF_OOM_SKIP) # selects new oom victim
+ *   mutex_unlock(&oom_lock)
+ *
+ * Therefore, the callers hold oom_lock when calling this function.
+ */
 void __oom_reap_task_mm(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -523,20 +538,6 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	bool ret = true;
 
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
 	mutex_lock(&oom_lock);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
@@ -1077,15 +1078,9 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen && oc->chosen != (void *)-1UL) {
+	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
-		/*
-		 * Give the killed process a good chance to exit before trying
-		 * to allocate memory again.
-		 */
-		schedule_timeout_killable(1);
-	}
 	return !!oc->chosen;
 }
 
@@ -1111,4 +1106,5 @@ void pagefault_out_of_memory(void)
 		return;
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
+	schedule_timeout_killable(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 905db9d..458ed32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3478,7 +3478,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	 */
 	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
-		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
@@ -4241,6 +4240,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	/* Retry as long as the OOM killer is making progress */
 	if (did_some_progress) {
 		no_progress_loops = 0;
+		/*
+		 * This schedule_timeout_*() serves as a guaranteed sleep for
+		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
+		 */
+		if (!tsk_is_oom_victim(current))
+			schedule_timeout_uninterruptible(1);
 		goto retry;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-31 10:10                                         ` Tetsuo Handa
@ 2018-05-31 10:44                                           ` Michal Hocko
  2018-05-31 15:23                                             ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-31 10:44 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, torvalds, guro, rientjes, hannes, vdavydov.dev,
	tj, linux-mm

On Thu 31-05-18 19:10:48, Tetsuo Handa wrote:
> On 2018/05/30 8:07, Andrew Morton wrote:
> > On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> > 
> >>> I suggest applying
> >>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> >>
> >> Well, I hope the whole pile gets merged in the upcoming merge window
> >> rather than stall even more.
> > 
> > I'm more inclined to drop it all.  David has identified significant
> > shortcomings and I'm not seeing a way of addressing those shortcomings
> > in a backward-compatible fashion.  Therefore there is no way forward
> > at present.
> > 
> 
> Can we apply my patch as-is first?

No. As already explained before. Sprinkling new sleeps without a strong
reason is not acceptable. The issue you are seeing is pretty artificial
and as such doesn't really warrant an immediate fix. We should rather
go with a well-thought-through fix. In other words we should simply drop
the sleep inside the oom_lock for starter unless it causes some really
unexpected behavior change.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-31 10:44                                           ` Michal Hocko
@ 2018-05-31 15:23                                             ` Tetsuo Handa
  2018-05-31 18:47                                               ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-05-31 15:23 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, torvalds, guro, rientjes, hannes, vdavydov.dev,
	tj, linux-mm

On 2018/05/31 19:44, Michal Hocko wrote:
> On Thu 31-05-18 19:10:48, Tetsuo Handa wrote:
>> On 2018/05/30 8:07, Andrew Morton wrote:
>>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
>>>
>>>>> I suggest applying
>>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
>>>>
>>>> Well, I hope the whole pile gets merged in the upcoming merge window
>>>> rather than stall even more.
>>>
>>> I'm more inclined to drop it all.  David has identified significant
>>> shortcomings and I'm not seeing a way of addressing those shortcomings
>>> in a backward-compatible fashion.  Therefore there is no way forward
>>> at present.
>>>
>>
>> Can we apply my patch as-is first?
> 
> No. As already explained before. Sprinkling new sleeps without a strong
> reason is not acceptable. The issue you are seeing is pretty artificial
> and as such doesn're really warrant an immediate fix. We should rather
> go with a well thought trhough fix. In other words we should simply drop
> the sleep inside the oom_lock for starter unless it causes some really
> unexpected behavior change.
> 

The OOM killer did not require schedule_timeout_killable(1) to return
as long as the OOM victim can call __mmput(). But now the OOM killer
requires schedule_timeout_killable(1) to return in order to allow the
OOM victim to call __oom_reap_task_mm(). Thus, this is a regression.

Artificial cannot become the reason to postpone my patch. If we don't care
artificialness/maliciousness, we won't need to care Spectre/Meltdown bugs.

I'm not sprinkling new sleeps. I'm just merging existing sleeps (i.e.
mutex_trylock() case and !mutex_trylock() case) and updating the outdated
comments.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm, memcg, oom: fix pre-mature allocation failures
  2018-05-29  8:16               ` Michal Hocko
  2018-05-29 14:33                 ` Tetsuo Handa
@ 2018-05-31 16:31                 ` kbuild test robot
  1 sibling, 0 replies; 51+ messages in thread
From: kbuild test robot @ 2018-05-31 16:31 UTC (permalink / raw)
  To: Michal Hocko
  Cc: kbuild-all, guro, Tetsuo Handa, rientjes, hannes, vdavydov.dev,
	tj, linux-mm, akpm, torvalds

[-- Attachment #1: Type: text/plain, Size: 4603 bytes --]

Hi Michal,

I love your patch! Yet something to improve:

[auto build test ERROR on mmotm/master]
[also build test ERROR on next-20180531]
[cannot apply to v4.17-rc7]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Michal-Hocko/mm-memcg-oom-fix-pre-mature-allocation-failures/20180531-220120
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: i386-randconfig-x078-201821 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   mm/oom_kill.c: In function 'out_of_memory':
>> mm/oom_kill.c:1157:28: error: invalid operands to binary | (have 'struct task_struct *' and 'struct mem_cgroup *')
     return !!(oc->chosen_task | oc->chosen_memcg);
               ~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~
>> mm/oom_kill.c:1158:1: warning: control reaches end of non-void function [-Wreturn-type]
    }
    ^

vim +1157 mm/oom_kill.c

  1068	
  1069	/**
  1070	 * out_of_memory - kill the "best" process when we run out of memory
  1071	 * @oc: pointer to struct oom_control
  1072	 *
  1073	 * If we run out of memory, we have the choice between either
  1074	 * killing a random task (bad), letting the system crash (worse)
  1075	 * OR try to be smart about which process to kill. Note that we
  1076	 * don't have to be perfect here, we just have to be good.
  1077	 */
  1078	bool out_of_memory(struct oom_control *oc)
  1079	{
  1080		unsigned long freed = 0;
  1081		enum oom_constraint constraint = CONSTRAINT_NONE;
  1082		bool delay = false; /* if set, delay next allocation attempt */
  1083	
  1084		if (oom_killer_disabled)
  1085			return false;
  1086	
  1087		if (!is_memcg_oom(oc)) {
  1088			blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
  1089			if (freed > 0)
  1090				/* Got some memory back in the last second. */
  1091				return true;
  1092		}
  1093	
  1094		/*
  1095		 * If current has a pending SIGKILL or is exiting, then automatically
  1096		 * select it.  The goal is to allow it to allocate so that it may
  1097		 * quickly exit and free its memory.
  1098		 */
  1099		if (task_will_free_mem(current)) {
  1100			mark_oom_victim(current);
  1101			wake_oom_reaper(current);
  1102			return true;
  1103		}
  1104	
  1105		/*
  1106		 * The OOM killer does not compensate for IO-less reclaim.
  1107		 * pagefault_out_of_memory lost its gfp context so we have to
  1108		 * make sure exclude 0 mask - all other users should have at least
  1109		 * ___GFP_DIRECT_RECLAIM to get here.
  1110		 */
  1111		if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
  1112			return true;
  1113	
  1114		/*
  1115		 * Check if there were limitations on the allocation (only relevant for
  1116		 * NUMA and memcg) that may require different handling.
  1117		 */
  1118		constraint = constrained_alloc(oc);
  1119		if (constraint != CONSTRAINT_MEMORY_POLICY)
  1120			oc->nodemask = NULL;
  1121		check_panic_on_oom(oc, constraint);
  1122	
  1123		if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
  1124		    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
  1125		    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
  1126			get_task_struct(current);
  1127			oc->chosen_task = current;
  1128			oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
  1129			return true;
  1130		}
  1131	
  1132		if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
  1133			delay = true;
  1134			goto out;
  1135		}
  1136	
  1137		select_bad_process(oc);
  1138		/* Found nothing?!?! Either we hang forever, or we panic. */
  1139		if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
  1140			dump_header(oc, NULL);
  1141			panic("Out of memory and no killable processes...\n");
  1142		}
  1143		if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
  1144			oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
  1145					 "Memory cgroup out of memory");
  1146			delay = true;
  1147		}
  1148	
  1149	out:
  1150		/*
  1151		 * Give the killed process a good chance to exit before trying
  1152		 * to allocate memory again.
  1153		 */
  1154		if (delay)
  1155			schedule_timeout_killable(1);
  1156	
> 1157		return !!(oc->chosen_task | oc->chosen_memcg);
> 1158	}
  1159	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27001 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-31 15:23                                             ` Tetsuo Handa
@ 2018-05-31 18:47                                               ` Michal Hocko
  2018-06-01  1:21                                                 ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-05-31 18:47 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, torvalds, guro, rientjes, hannes, vdavydov.dev,
	tj, linux-mm

On Fri 01-06-18 00:23:57, Tetsuo Handa wrote:
> On 2018/05/31 19:44, Michal Hocko wrote:
> > On Thu 31-05-18 19:10:48, Tetsuo Handa wrote:
> >> On 2018/05/30 8:07, Andrew Morton wrote:
> >>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> >>>
> >>>>> I suggest applying
> >>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> >>>>
> >>>> Well, I hope the whole pile gets merged in the upcoming merge window
> >>>> rather than stall even more.
> >>>
> >>> I'm more inclined to drop it all.  David has identified significant
> >>> shortcomings and I'm not seeing a way of addressing those shortcomings
> >>> in a backward-compatible fashion.  Therefore there is no way forward
> >>> at present.
> >>>
> >>
> >> Can we apply my patch as-is first?
> > 
> > No. As already explained before. Sprinkling new sleeps without a strong
> > reason is not acceptable. The issue you are seeing is pretty artificial
> > and as such doesn're really warrant an immediate fix. We should rather
> > go with a well thought trhough fix. In other words we should simply drop
> > the sleep inside the oom_lock for starter unless it causes some really
> > unexpected behavior change.
> > 
> 
> The OOM killer did not require schedule_timeout_killable(1) to return
> as long as the OOM victim can call __mmput(). But now the OOM killer
> requires schedule_timeout_killable(1) to return in order to allow the
> OOM victim to call __oom_reap_task_mm(). Thus, this is a regression.
> 
> Artificial cannot become the reason to postpone my patch. If we don't care
> artificialness/maliciousness, we won't need to care Spectre/Meltdown bugs.
> 
> I'm not sprinkling new sleeps. I'm just merging existing sleeps (i.e.
> mutex_trylock() case and !mutex_trylock() case) and updating the outdated
> comments.

Sigh. So what exactly is wrong with going simple and do
http://lkml.kernel.org/r/20180528124313.GC27180@dhcp22.suse.cz ?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-31 18:47                                               ` Michal Hocko
@ 2018-06-01  1:21                                                 ` Tetsuo Handa
  2018-06-01  8:04                                                   ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-06-01  1:21 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, torvalds, guro, rientjes, hannes, vdavydov.dev,
	tj, linux-mm

Michal Hocko wrote:
> On Fri 01-06-18 00:23:57, Tetsuo Handa wrote:
> > On 2018/05/31 19:44, Michal Hocko wrote:
> > > On Thu 31-05-18 19:10:48, Tetsuo Handa wrote:
> > >> On 2018/05/30 8:07, Andrew Morton wrote:
> > >>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> > >>>
> > >>>>> I suggest applying
> > >>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> > >>>>
> > >>>> Well, I hope the whole pile gets merged in the upcoming merge window
> > >>>> rather than stall even more.
> > >>>
> > >>> I'm more inclined to drop it all.  David has identified significant
> > >>> shortcomings and I'm not seeing a way of addressing those shortcomings
> > >>> in a backward-compatible fashion.  Therefore there is no way forward
> > >>> at present.
> > >>>
> > >>
> > >> Can we apply my patch as-is first?
> > > 
> > > No. As already explained before. Sprinkling new sleeps without a strong
> > > reason is not acceptable. The issue you are seeing is pretty artificial
> > > and as such doesn're really warrant an immediate fix. We should rather
> > > go with a well thought trhough fix. In other words we should simply drop
> > > the sleep inside the oom_lock for starter unless it causes some really
> > > unexpected behavior change.
> > > 
> > 
> > The OOM killer did not require schedule_timeout_killable(1) to return
> > as long as the OOM victim can call __mmput(). But now the OOM killer
> > requires schedule_timeout_killable(1) to return in order to allow the
> > OOM victim to call __oom_reap_task_mm(). Thus, this is a regression.
> > 
> > Artificial cannot become the reason to postpone my patch. If we don't care
> > artificialness/maliciousness, we won't need to care Spectre/Meltdown bugs.
> > 
> > I'm not sprinkling new sleeps. I'm just merging existing sleeps (i.e.
> > mutex_trylock() case and !mutex_trylock() case) and updating the outdated
> > comments.
> 
> Sigh. So what exactly is wrong with going simple and do
> http://lkml.kernel.org/r/20180528124313.GC27180@dhcp22.suse.cz ?
> 

Because

  (1) You are trying to apply this fix after Roman's patchset which
      Andrew Morton is more inclined to drop.

  (2) You are making this fix difficult to backport because this
      patch depends on Roman's patchset.

  (3) You are not fixing the bug in Roman's patchset.

  (4) You are not updating the outdated comment in my patch and
      Roman's patchset.

  (5) You are not evaluating the side effect of not sleeping
      outside of the OOM path, despite you said

      > If we _really_ need to touch should_reclaim_retry then
      > it should be done in a separate patch with some numbers/tracing
      > data backing that story.

      and I consider that "whether moving the short sleep to
      should_reclaim_retry() has negative impact" and "whether
      eliminating the short sleep has negative impact" should be
      evaluated in a separate patch.

but I will tolerate below patch if you can accept below patch "as-is"
(because it explicitly notes what actually happens and there might be
unexpected side effect of not sleeping outside of the OOM path).

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-01  1:21                                                 ` Tetsuo Handa
@ 2018-06-01  8:04                                                   ` Michal Hocko
  0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2018-06-01  8:04 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, torvalds, guro, rientjes, hannes, vdavydov.dev,
	tj, linux-mm

On Fri 01-06-18 10:21:13, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 01-06-18 00:23:57, Tetsuo Handa wrote:
> > > On 2018/05/31 19:44, Michal Hocko wrote:
> > > > On Thu 31-05-18 19:10:48, Tetsuo Handa wrote:
> > > >> On 2018/05/30 8:07, Andrew Morton wrote:
> > > >>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> > > >>>
> > > >>>>> I suggest applying
> > > >>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> > > >>>>
> > > >>>> Well, I hope the whole pile gets merged in the upcoming merge window
> > > >>>> rather than stall even more.
> > > >>>
> > > >>> I'm more inclined to drop it all.  David has identified significant
> > > >>> shortcomings and I'm not seeing a way of addressing those shortcomings
> > > >>> in a backward-compatible fashion.  Therefore there is no way forward
> > > >>> at present.
> > > >>>
> > > >>
> > > >> Can we apply my patch as-is first?
> > > > 
> > > > No. As already explained before. Sprinkling new sleeps without a strong
> > > > reason is not acceptable. The issue you are seeing is pretty artificial
> > > > and as such doesn're really warrant an immediate fix. We should rather
> > > > go with a well thought trhough fix. In other words we should simply drop
> > > > the sleep inside the oom_lock for starter unless it causes some really
> > > > unexpected behavior change.
> > > > 
> > > 
> > > The OOM killer did not require schedule_timeout_killable(1) to return
> > > as long as the OOM victim can call __mmput(). But now the OOM killer
> > > requires schedule_timeout_killable(1) to return in order to allow the
> > > OOM victim to call __oom_reap_task_mm(). Thus, this is a regression.
> > > 
> > > Artificial cannot become the reason to postpone my patch. If we don't care
> > > artificialness/maliciousness, we won't need to care Spectre/Meltdown bugs.
> > > 
> > > I'm not sprinkling new sleeps. I'm just merging existing sleeps (i.e.
> > > mutex_trylock() case and !mutex_trylock() case) and updating the outdated
> > > comments.
> > 
> > Sigh. So what exactly is wrong with going simple and do
> > http://lkml.kernel.org/r/20180528124313.GC27180@dhcp22.suse.cz ?
> > 
> 
> Because
> 
>   (1) You are trying to apply this fix after Roman's patchset which
>       Andrew Morton is more inclined to drop.
> 
>   (2) You are making this fix difficult to backport because this
>       patch depends on Roman's patchset.
> 
>   (3) You are not fixing the bug in Roman's patchset.

Sigh. Would you be more happy if this was on top of linus tree? I mean
this is trivial to do so. I have provided _something_ for testing
exactly as you asked for. Considering that the cgroup aware oom killer
shouldn't stand in the way of the global oom killer without a special
configuration I do not see what is the actual problem.

>   (4) You are not updating the outdated comment in my patch and
>       Roman's patchset.

But the comment is not really related to the sleep in any way. This
should be a separate patch AFAICS.

>   (5) You are not evaluating the side effect of not sleeping
>       outside of the OOM path, despite you said

Exactly. There is no point in preserving the status quo if you cannot
reasonably argue of the effect. And I do not see the actual problem to
be honest. If there are any, we can always fix up (note that OOMs are
rare events we do not optimize for) with the proper justification.

>       > If we _really_ need to touch should_reclaim_retry then
>       > it should be done in a separate patch with some numbers/tracing
>       > data backing that story.
> 
>       and I consider that "whether moving the short sleep to
>       should_reclaim_retry() has negative impact" and "whether
>       eliminating the short sleep has negative impact" should be
>       evaluated in a separate patch.
> 
> but I will tolerate below patch if you can accept below patch "as-is"
> (because it explicitly notes what actually happens and there might be
> unexpected side effect of not sleeping outside of the OOM path).

Well, I do not mind. If you really insist then I just do not care enough
to argue. But please note that this very patch breaks your 1, 2, 3 :p
So if you are really serious you should probably apply and test the
following on top of Linus tree:

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8ba6cb88cf58..ed9d473c571e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1077,15 +1077,9 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen && oc->chosen != (void *)-1UL) {
+	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
-		/*
-		 * Give the killed process a good chance to exit before trying
-		 * to allocate memory again.
-		 */
-		schedule_timeout_killable(1);
-	}
 	return !!oc->chosen;
 }
 
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-05-29 23:07                                       ` Andrew Morton
  2018-05-31 10:10                                         ` Tetsuo Handa
@ 2018-06-01 15:28                                         ` Michal Hocko
  2018-06-01 21:11                                           ` Andrew Morton
  1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-06-01 15:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tetsuo Handa, guro, rientjes, hannes, vdavydov.dev, tj, linux-mm,
	torvalds

On Tue 29-05-18 16:07:00, Andrew Morton wrote:
> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> 
> > > I suggest applying
> > > this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> > 
> > Well, I hope the whole pile gets merged in the upcoming merge window
> > rather than stall even more.
> 
> I'm more inclined to drop it all.  David has identified significant
> shortcomings and I'm not seeing a way of addressing those shortcomings
> in a backward-compatible fashion.  Therefore there is no way forward
> at present.

Well, I thought we have argued about those "shortcomings" back and forth
and expressed that they are not really a problem for workloads which are
going to use the feature. The backward compatibility has been explained
as well AFAICT. Anyway if this is your position on the matter then I
just give up. I've tried to do my best to review the feature (as !author
nor the end user) and I cannot do really much more. I find it quite sad
though to be honest.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-01 15:28                                         ` Michal Hocko
@ 2018-06-01 21:11                                           ` Andrew Morton
  2018-06-04  7:04                                             ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Morton @ 2018-06-01 21:11 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tetsuo Handa, guro, rientjes, hannes, vdavydov.dev, tj, linux-mm,
	torvalds

On Fri, 1 Jun 2018 17:28:01 +0200 Michal Hocko <mhocko@kernel.org> wrote:

> On Tue 29-05-18 16:07:00, Andrew Morton wrote:
> > On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> > 
> > > > I suggest applying
> > > > this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> > > 
> > > Well, I hope the whole pile gets merged in the upcoming merge window
> > > rather than stall even more.
> > 
> > I'm more inclined to drop it all.  David has identified significant
> > shortcomings and I'm not seeing a way of addressing those shortcomings
> > in a backward-compatible fashion.  Therefore there is no way forward
> > at present.
> 
> Well, I thought we have argued about those "shortcomings" back and forth
> and expressed that they are not really a problem for workloads which are
> going to use the feature. The backward compatibility has been explained
> as well AFAICT.

Feel free to re-explain.  It's the only way we'll get there.

David has proposed an alternative patchset.  IIRC Roman gave that a
one-line positive response but I don't think it has seen a lot of
attention?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-01 21:11                                           ` Andrew Morton
@ 2018-06-04  7:04                                             ` Michal Hocko
  2018-06-04 10:41                                               ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-06-04  7:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tetsuo Handa, guro, rientjes, hannes, vdavydov.dev, tj, linux-mm,
	torvalds

On Fri 01-06-18 14:11:10, Andrew Morton wrote:
> On Fri, 1 Jun 2018 17:28:01 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> 
> > On Tue 29-05-18 16:07:00, Andrew Morton wrote:
> > > On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> > > 
> > > > > I suggest applying
> > > > > this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> > > > 
> > > > Well, I hope the whole pile gets merged in the upcoming merge window
> > > > rather than stall even more.
> > > 
> > > I'm more inclined to drop it all.  David has identified significant
> > > shortcomings and I'm not seeing a way of addressing those shortcomings
> > > in a backward-compatible fashion.  Therefore there is no way forward
> > > at present.
> > 
> > Well, I thought we have argued about those "shortcomings" back and forth
> > and expressed that they are not really a problem for workloads which are
> > going to use the feature. The backward compatibility has been explained
> > as well AFAICT.
> 
> Feel free to re-explain.  It's the only way we'll get there.

OK, I will go and add my points to the last version of the patchset.

> David has proposed an alternative patchset.  IIRC Roman gave that a
> one-line positive response but I don't think it has seen a lot of
> attention?

I plan to go and revisit that. My preliminary feedback is that a more
generic policy API is really tricky and the patchset has many holes
there. But I will come with a more specific feedback in the respective
thread.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-04  7:04                                             ` Michal Hocko
@ 2018-06-04 10:41                                               ` Tetsuo Handa
  2018-06-04 11:22                                                 ` Michal Hocko
  2018-06-06  9:02                                                 ` David Rientjes
  0 siblings, 2 replies; 51+ messages in thread
From: Tetsuo Handa @ 2018-06-04 10:41 UTC (permalink / raw)
  To: Michal Hocko, Andrew Morton
  Cc: guro, rientjes, hannes, vdavydov.dev, tj, linux-mm, torvalds

On 2018/06/04 16:04, Michal Hocko wrote:
> On Fri 01-06-18 14:11:10, Andrew Morton wrote:
>> On Fri, 1 Jun 2018 17:28:01 +0200 Michal Hocko <mhocko@kernel.org> wrote:
>>
>>> On Tue 29-05-18 16:07:00, Andrew Morton wrote:
>>>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
>>>>
>>>>>> I suggest applying
>>>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
>>>>>
>>>>> Well, I hope the whole pile gets merged in the upcoming merge window
>>>>> rather than stall even more.
>>>>
>>>> I'm more inclined to drop it all.  David has identified significant
>>>> shortcomings and I'm not seeing a way of addressing those shortcomings
>>>> in a backward-compatible fashion.  Therefore there is no way forward
>>>> at present.
>>>
>>> Well, I thought we have argued about those "shortcomings" back and forth
>>> and expressed that they are not really a problem for workloads which are
>>> going to use the feature. The backward compatibility has been explained
>>> as well AFAICT.
>>
>> Feel free to re-explain.  It's the only way we'll get there.
> 
> OK, I will go and my points to the last version of the patchset.
> 
>> David has proposed an alternative patchset.  IIRC Roman gave that a
>> one-line positive response but I don't think it has seen a lot of
>> attention?
> 
> I plan to go and revisit that. My preliminary feedback is that a more
> generic policy API is really tricky and the patchset has many holes
> there. But I will come with a more specific feedback in the respective
> thread.
> 
Is current version of "mm, oom: cgroup-aware OOM killer" patchset going to be
dropped for now? I want to know which state should I use for baseline for my patch.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-04 10:41                                               ` Tetsuo Handa
@ 2018-06-04 11:22                                                 ` Michal Hocko
  2018-06-04 11:30                                                   ` Tetsuo Handa
  2018-06-06  9:02                                                 ` David Rientjes
  1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-06-04 11:22 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, guro, rientjes, hannes, vdavydov.dev, tj,
	linux-mm, torvalds

On Mon 04-06-18 19:41:01, Tetsuo Handa wrote:
> On 2018/06/04 16:04, Michal Hocko wrote:
> > On Fri 01-06-18 14:11:10, Andrew Morton wrote:
> >> On Fri, 1 Jun 2018 17:28:01 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> >>
> >>> On Tue 29-05-18 16:07:00, Andrew Morton wrote:
> >>>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> >>>>
> >>>>>> I suggest applying
> >>>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
> >>>>>
> >>>>> Well, I hope the whole pile gets merged in the upcoming merge window
> >>>>> rather than stall even more.
> >>>>
> >>>> I'm more inclined to drop it all.  David has identified significant
> >>>> shortcomings and I'm not seeing a way of addressing those shortcomings
> >>>> in a backward-compatible fashion.  Therefore there is no way forward
> >>>> at present.
> >>>
> >>> Well, I thought we have argued about those "shortcomings" back and forth
> >>> and expressed that they are not really a problem for workloads which are
> >>> going to use the feature. The backward compatibility has been explained
> >>> as well AFAICT.
> >>
> >> Feel free to re-explain.  It's the only way we'll get there.
> > 
> > OK, I will go and my points to the last version of the patchset.
> > 
> >> David has proposed an alternative patchset.  IIRC Roman gave that a
> >> one-line positive response but I don't think it has seen a lot of
> >> attention?
> > 
> > I plan to go and revisit that. My preliminary feedback is that a more
> > generic policy API is really tricky and the patchset has many holes
> > there. But I will come with a more specific feedback in the respective
> > thread.
> > 
> Is current version of "mm, oom: cgroup-aware OOM killer" patchset going to be
> dropped for now? I want to know which state should I use for baseline for my patch.

Is it that urgent that it cannot wait until after the merge window when
thing should settle down?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-04 11:22                                                 ` Michal Hocko
@ 2018-06-04 11:30                                                   ` Tetsuo Handa
  0 siblings, 0 replies; 51+ messages in thread
From: Tetsuo Handa @ 2018-06-04 11:30 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, guro, rientjes, hannes, vdavydov.dev, tj,
	linux-mm, torvalds

On 2018/06/04 20:22, Michal Hocko wrote:
> On Mon 04-06-18 19:41:01, Tetsuo Handa wrote:
>> On 2018/06/04 16:04, Michal Hocko wrote:
>>> On Fri 01-06-18 14:11:10, Andrew Morton wrote:
>>>> On Fri, 1 Jun 2018 17:28:01 +0200 Michal Hocko <mhocko@kernel.org> wrote:
>>>>
>>>>> On Tue 29-05-18 16:07:00, Andrew Morton wrote:
>>>>>> On Tue, 29 May 2018 09:17:41 +0200 Michal Hocko <mhocko@kernel.org> wrote:
>>>>>>
>>>>>>>> I suggest applying
>>>>>>>> this patch first, and then fix "mm, oom: cgroup-aware OOM killer" patch.
>>>>>>>
>>>>>>> Well, I hope the whole pile gets merged in the upcoming merge window
>>>>>>> rather than stall even more.
>>>>>>
>>>>>> I'm more inclined to drop it all.  David has identified significant
>>>>>> shortcomings and I'm not seeing a way of addressing those shortcomings
>>>>>> in a backward-compatible fashion.  Therefore there is no way forward
>>>>>> at present.
>>>>>
>>>>> Well, I thought we have argued about those "shortcomings" back and forth
>>>>> and expressed that they are not really a problem for workloads which are
>>>>> going to use the feature. The backward compatibility has been explained
>>>>> as well AFAICT.
>>>>
>>>> Feel free to re-explain.  It's the only way we'll get there.
>>>
>>> OK, I will go and my points to the last version of the patchset.
>>>
>>>> David has proposed an alternative patchset.  IIRC Roman gave that a
>>>> one-line positive response but I don't think it has seen a lot of
>>>> attention?
>>>
>>> I plan to go and revisit that. My preliminary feedback is that a more
>>> generic policy API is really tricky and the patchset has many holes
>>> there. But I will come with a more specific feedback in the respective
>>> thread.
>>>
>> Is current version of "mm, oom: cgroup-aware OOM killer" patchset going to be
>> dropped for now? I want to know which state should I use for baseline for my patch.
> 
> Is it that urgent that it cannot wait until after the merge window when
> thing should settle down?
> 
Yes, for it is a regression fix which I expected to be in time for 4.17.
I want to apply it before OOM killer code gets complicated by cgroup-aware OOM killer.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-04 10:41                                               ` Tetsuo Handa
  2018-06-04 11:22                                                 ` Michal Hocko
@ 2018-06-06  9:02                                                 ` David Rientjes
  2018-06-06 13:37                                                   ` Tetsuo Handa
  1 sibling, 1 reply; 51+ messages in thread
From: David Rientjes @ 2018-06-06  9:02 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Michal Hocko, Andrew Morton, guro, hannes, vdavydov.dev, tj,
	linux-mm, torvalds

On Mon, 4 Jun 2018, Tetsuo Handa wrote:

> Is current version of "mm, oom: cgroup-aware OOM killer" patchset going to be
> dropped for now? I want to know which state should I use for baseline for my patch.
> 

My patchset to fix the issues with regard to the cgroup-aware oom killer 
to fix its calculations (current version in -mm is completely buggy for 
oom_score_adj, fixed in my patch 4/6), its context based errors 
(discounting mempolicy oom kills, fixed in my patch 6/6) and make it 
generally useful beyond highly specialized usecases in a backwards 
compatible way was posted on March 22 at 
https://marc.info/?l=linux-kernel&m=152175564104466.

The base patchset is seemingly abandoned in -mm, unfortunately, so I think 
all oom killer patches should be based on Linus's tree.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-06  9:02                                                 ` David Rientjes
@ 2018-06-06 13:37                                                   ` Tetsuo Handa
  2018-06-06 18:44                                                     ` David Rientjes
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-06-06 13:37 UTC (permalink / raw)
  To: David Rientjes
  Cc: Michal Hocko, Andrew Morton, guro, hannes, vdavydov.dev, tj,
	linux-mm, torvalds

On 2018/06/06 18:02, David Rientjes wrote:
> On Mon, 4 Jun 2018, Tetsuo Handa wrote:
> 
>> Is current version of "mm, oom: cgroup-aware OOM killer" patchset going to be
>> dropped for now? I want to know which state should I use for baseline for my patch.
>>
> 
> My patchset to fix the issues with regard to the cgroup-aware oom killer 
> to fix its calculations (current version in -mm is completely buggy for 
> oom_score_adj, fixed in my patch 4/6), its context based errors 
> (discounting mempolicy oom kills, fixed in my patch 6/6) and make it 
> generally useful beyond highly specialized usecases in a backwards 
> compatible way was posted on March 22 at 
> https://marc.info/?l=linux-kernel&m=152175564104466.
> 
> The base patchset is seemingly abandoned in -mm, unfortunately, so I think 
> all oom killer patches should be based on Linus's tree.
> 
OK. I will use linux.git as a base.

By the way, does "[RFC] Getting rid of INFLIGHT_VICTIM" simplify or break
your cgroup-aware oom killer? If it simplifies your work, I'd like to apply
it as well.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-06-06 13:37                                                   ` Tetsuo Handa
@ 2018-06-06 18:44                                                     ` David Rientjes
  0 siblings, 0 replies; 51+ messages in thread
From: David Rientjes @ 2018-06-06 18:44 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Michal Hocko, Andrew Morton, guro, hannes, vdavydov.dev, tj,
	linux-mm, torvalds

On Wed, 6 Jun 2018, Tetsuo Handa wrote:

> OK. I will use linux.git as a base.
> 
> By the way, does "[RFC] Getting rid of INFLIGHT_VICTIM" simplify or break
> your cgroup-aware oom killer? If it simplifies your work, I'd like to apply
> it as well.
> 

I think it impacts the proposal to allow the oom reaper to operate over 
several different mm's in its list without processing one, waiting to give 
up, removing it, and moving on to the next one.  It doesn't impact the 
cgroup-aware oom killer extension that I made other than trivial patch 
conflicts.  I think if we can iterate over the oom reaper list to 
determine inflight victims it's simpler.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-03-22 11:45 ` Michal Hocko
@ 2018-03-22 13:16   ` Tetsuo Handa
  0 siblings, 0 replies; 51+ messages in thread
From: Tetsuo Handa @ 2018-03-22 13:16 UTC (permalink / raw)
  To: mhocko; +Cc: linux-mm, rientjes, hannes, guro, tj, vdavydov.dev

Michal Hocko wrote:
> On Thu 22-03-18 19:51:56, Tetsuo Handa wrote:
> [...]
> > The whole point of the sleep is to give the OOM victim some time to exit.
> 
> Yes, and that is why we sleep under the lock because that would rule all
> other potential out_of_memory callers from jumping in.

As long as there is !MMF_OOM_SKIP mm, jumping in does not cause problems.

But since this patch did not remove mutex_lock() from the OOM reaper,
nobody can jump in until the OOM reaper completes the first reclaim attempt.
And since it is likely that mutex_trylock() by the OOM reaper succeeds,
somebody unlikely finds !MMF_OOM_SKIP mm when it jumped in.

> 
> > However, the sleep can prevent contending allocating paths from hitting
> > the OOM path again even if the OOM victim was able to exit. We need to
> > make sure that the thread which called out_of_memory() will release
> > oom_lock shortly. Thus, this patch brings the sleep to outside of the OOM
> > path. Since the OOM reaper waits for the oom_lock, this patch unlikely
> > allows contending allocating paths to hit the OOM path earlier than now.
> 
> The sleep outside of the lock doesn't make much sense to me. It is
> basically contradicting its original purpose. If we do want to throttle
> direct reclaimers then OK but this patch is not the way how to do that.
> 
> If you really believe that the sleep is more harmful than useful, then
> fair enough, I would rather see it removed than shuffled all over
> outside the lock. 

Yes, I do believe that the sleep with oom_lock held is more harmful than useful.
Please remove the sleep (but be careful not to lose the guaranteed sleep for
PF_WQ_WORKER).

> 
> So
> Nacked-by: Michal Hocko <mhocko@suse.com>
> -- 
> Michal Hocko
> SUSE Labs
> 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-03-22 10:51 [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held Tetsuo Handa
@ 2018-03-22 11:45 ` Michal Hocko
  2018-03-22 13:16   ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-03-22 11:45 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: linux-mm, David Rientjes, Johannes Weiner, Roman Gushchin,
	Tejun Heo, Vladimir Davydov

On Thu 22-03-18 19:51:56, Tetsuo Handa wrote:
[...]
> The whole point of the sleep is to give the OOM victim some time to exit.

Yes, and that is why we sleep under the lock because that would rule all
other potential out_of_memory callers from jumping in.

> However, the sleep can prevent contending allocating paths from hitting
> the OOM path again even if the OOM victim was able to exit. We need to
> make sure that the thread which called out_of_memory() will release
> oom_lock shortly. Thus, this patch brings the sleep to outside of the OOM
> path. Since the OOM reaper waits for the oom_lock, this patch unlikely
> allows contending allocating paths to hit the OOM path earlier than now.

The sleep outside of the lock doesn't make much sense to me. It is
basically contradicting its original purpose. If we do want to throttle
direct reclaimers then OK but this patch is not the way how to do that.

If you really believe that the sleep is more harmful than useful, then
fair enough, I would rather see it removed than shuffled all over
outside the lock. 

So
Nacked-by: Michal Hocko <mhocko@suse.com>
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
@ 2018-03-22 10:51 Tetsuo Handa
  2018-03-22 11:45 ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-03-22 10:51 UTC (permalink / raw)
  To: linux-mm
  Cc: Tetsuo Handa, David Rientjes, Johannes Weiner, Michal Hocko,
	Roman Gushchin, Tejun Heo, Vladimir Davydov

When I was examining a bug which occurs under CPU + memory pressure, I
observed that a thread which called out_of_memory() can sleep for minutes
at schedule_timeout_killable(1) with oom_lock held when many threads are
doing direct reclaim.

--------------------
[  163.357628] b.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null), order=0, oom_score_adj=0
[  163.360946] CPU: 0 PID: 554 Comm: b.out Not tainted 4.15.0-rc8+ #216
(...snipped...)
[  163.470193] Out of memory: Kill process 548 (b.out) score 6 or sacrifice child
[  163.471813] Killed process 1191 (b.out) total-vm:2108kB, anon-rss:60kB, file-rss:4kB, shmem-rss:0kB
(...snipped...)
[  248.016033] sysrq: SysRq : Show State
(...snipped...)
[  249.625720] b.out           R  running task        0   554    538 0x00000004
[  249.627778] Call Trace:
[  249.628513]  __schedule+0x142/0x4b2
[  249.629394]  schedule+0x27/0x70
[  249.630114]  schedule_timeout+0xd1/0x160
[  249.631029]  ? oom_kill_process+0x396/0x400
[  249.632039]  ? __next_timer_interrupt+0xc0/0xc0
[  249.633087]  schedule_timeout_killable+0x15/0x20
[  249.634097]  out_of_memory+0xea/0x270
[  249.634901]  __alloc_pages_nodemask+0x715/0x880
[  249.635920]  handle_mm_fault+0x538/0xe40
[  249.636888]  ? __enqueue_entity+0x63/0x70
[  249.637787]  ? set_next_entity+0x4b/0x80
[  249.638687]  __do_page_fault+0x199/0x430
[  249.639535]  ? vmalloc_sync_all+0x180/0x180
[  249.640452]  do_page_fault+0x1a/0x1e
[  249.641283]  common_exception+0x82/0x8a
(...snipped...)
[  462.676366] oom_reaper: reaped process 1191 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
--------------------

--------------------
[  269.985819] b.out invoked oom-killer: gfp_mask=0x14200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0
[  269.988570] CPU: 0 PID: 9079 Comm: b.out Not tainted 4.15.0-rc8+ #217
(...snipped...)
[  270.073050] Out of memory: Kill process 914 (b.out) score 9 or sacrifice child
[  270.074660] Killed process 2208 (b.out) total-vm:2108kB, anon-rss:64kB, file-rss:4kB, shmem-rss:0kB
[  297.562824] sysrq: SysRq : Show State
(...snipped...)
[  471.716610] b.out           R  running task        0  9079   7400 0x00000000
[  471.718203] Call Trace:
[  471.718784]  __schedule+0x142/0x4b2
[  471.719577]  schedule+0x27/0x70
[  471.720294]  schedule_timeout+0xd1/0x160
[  471.721207]  ? oom_kill_process+0x396/0x400
[  471.722151]  ? __next_timer_interrupt+0xc0/0xc0
[  471.723215]  schedule_timeout_killable+0x15/0x20
[  471.724350]  out_of_memory+0xea/0x270
[  471.725201]  __alloc_pages_nodemask+0x715/0x880
[  471.726238]  ? radix_tree_lookup_slot+0x1f/0x50
[  471.727253]  filemap_fault+0x346/0x510
[  471.728120]  ? filemap_map_pages+0x245/0x2d0
[  471.729105]  ? unlock_page+0x30/0x30
[  471.729987]  __xfs_filemap_fault.isra.18+0x2d/0xb0
[  471.731488]  ? unlock_page+0x30/0x30
[  471.732364]  xfs_filemap_fault+0xa/0x10
[  471.733260]  __do_fault+0x11/0x30
[  471.734033]  handle_mm_fault+0x8e8/0xe40
[  471.735200]  __do_page_fault+0x199/0x430
[  471.736163]  ? common_exception+0x82/0x8a
[  471.737102]  ? vmalloc_sync_all+0x180/0x180
[  471.738061]  do_page_fault+0x1a/0x1e
[  471.738881]  common_exception+0x82/0x8a
(...snipped...)
[  566.969400] oom_reaper: reaped process 2208 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
--------------------

The whole point of the sleep is to give the OOM victim some time to exit.
However, the sleep can prevent contending allocating paths from hitting
the OOM path again even if the OOM victim was able to exit. We need to
make sure that the thread which called out_of_memory() will release
oom_lock shortly. Thus, this patch brings the sleep to outside of the OOM
path. Since the OOM reaper waits for the oom_lock, this patch unlikely
allows contending allocating paths to hit the OOM path earlier than now.

For __alloc_pages_may_oom() case, this patch uses uninterruptible sleep
than killable sleep because fatal_signal_pending() threads won't be able
to use memory reserves unless tsk_is_oom_victim() becomes true.

Even with this patch, cond_resched() from printk() or CONFIG_PREEMPT=y can
allow other contending allocating paths to disturb the owner of oom_lock.
According to [1], long term Michal wants to focus on making the OOM
context not preemptible. But we could not make progress in that direction
for two years. Making the OOM context non preemptible should be addressed
by future patches.

[1] http://lkml.kernel.org/r/20160304160519.GG31257@dhcp22.suse.cz

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 mm/oom_kill.c   | 39 +++++++++++++--------------------------
 mm/page_alloc.c |  7 ++++++-
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dfd3705..dcdb642 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -487,18 +487,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	bool ret = true;
 
 	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * __oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
+	 * We have to make sure not to cause premature new oom victim selection:
+	 * __alloc_pages_may_oom()     __oom_reap_task_mm()
+	 *   mutex_trylock(&oom_lock)
+	 *   get_page_from_freelist(ALLOC_WMARK_HIGH) # fails
+	 *                               unmap_page_range() # frees some memory
+	 *                               set_bit(MMF_OOM_SKIP)
+	 *   out_of_memory()
+	 *     select_bad_process()
+	 *       test_bit(MMF_OOM_SKIP) # selects new oom victim
+	 *   mutex_unlock(&oom_lock)
 	 */
 	mutex_lock(&oom_lock);
 
@@ -1074,7 +1072,6 @@ bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-	bool delay = false; /* if set, delay next allocation attempt */
 
 	if (oom_killer_disabled)
 		return false;
@@ -1124,10 +1121,8 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
-	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
-		delay = true;
+	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc))
 		goto out;
-	}
 
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
@@ -1135,20 +1130,11 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
+	if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
-		delay = true;
-	}
 
 out:
-	/*
-	 * Give the killed process a good chance to exit before trying
-	 * to allocate memory again.
-	 */
-	if (delay)
-		schedule_timeout_killable(1);
-
 	return !!oc->chosen_task;
 }
 
@@ -1174,4 +1160,5 @@ void pagefault_out_of_memory(void)
 		return;
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
+	schedule_timeout_killable(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2244366..44b8eba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3475,7 +3475,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	 */
 	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
-		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
@@ -4238,6 +4237,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	/* Retry as long as the OOM killer is making progress */
 	if (did_some_progress) {
 		no_progress_loops = 0;
+		/*
+		 * This schedule_timeout_*() serves as a guaranteed sleep for
+		 * PF_WQ_WORKER threads when __zone_watermark_ok() == false.
+		 */
+		if (!tsk_is_oom_victim(current))
+			schedule_timeout_uninterruptible(1);
 		goto retry;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-01-24 13:28       ` Tetsuo Handa
@ 2018-02-13 11:58         ` Tetsuo Handa
  0 siblings, 0 replies; 51+ messages in thread
From: Tetsuo Handa @ 2018-02-13 11:58 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, rientjes, hannes, guro, tj, vdavydov.dev, torvalds

Michal, if you are busy, can I test my version until you get time?

Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Tue 23-01-18 21:07:03, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > To be completely honest, I am not in love with this
> > > > schedule_timeout_uninterruptible(1). It is an ugly hack. It used to be
> > > > much more important in the past when the oom victim test was too
> > > > fragile. I strongly suspect that it is not needed these days so rather
> > > > than moving the sleep around I would try to remove it altogether.
> > > 
> > > But this schedule_timeout_uninterruptible(1) serves as a guaranteed
> > > sleep for PF_WQ_WORKER threads
> > > ( http://lkml.kernel.org/r/20170830064019.mfihbeu3mm5ygcrb@dhcp22.suse.cz ).
> > > 
> > >     > If we are under memory pressure, __zone_watermark_ok() can return false.
> > >     > If __zone_watermark_ok() == false, when is schedule_timeout_*() called explicitly?
> > > 
> > >     If all zones fail with the watermark check then we should hit the oom
> > >     path and sleep there. We do not do so for all cases though.
> > > 
> > > Thus, you cannot simply remove it.
> > 
> > Then I would rather make should_reclaim_retry more robust.
> 
> I'm OK with that if we can remove schedule_timeout_*() with oom_lock held.
> 
> > 
> > > > Also, your changelog silently skips over some important details. The
> > > > system must be really overloaded when a short sleep can take minutes.
> > > 
> > > Yes, the system was really overloaded, for I was testing below reproducer
> > > on a x86_32 kernel.
> > [...]
> > > > I would strongly suspect that such an overloaded system doesn't need
> > > > a short sleep to hold the oom lock for too long. All you need is to be
> > > > preempted. So this patch doesn't really solve any _real_ problem.
> > > 
> > > Preemption makes the OOM situation much worse. The only way to avoid all OOM
> > > lockups caused by lack of CPU resource is to replace mutex_trylock(&oom_lock)
> > > in __alloc_pages_may_oom() with mutex_lock(&oom_lock) (or similar) in order to
> > > guarantee that all threads waiting for the OOM killer/reaper to make forward
> > > progress shall give enough CPU resource.
> > 
> > And how exactly does that help when the page allocator gets preempted by
> > somebody not doing any allocation?
> 
> The page allocator is not responsible for wasting CPU resource for something
> other than memory allocation request. Wasting CPU resource due to unable to
> allow the OOM killer/reaper to make forward progress is the page allocator's
> bug.
> 
> There are currently ways to artificially choke the OOM killer/reaper (e.g. let
> a SCHED_IDLE priority thread which is allowed to run on only one specific CPU
> invoke the OOM killer). To mitigate it, offloading the OOM killer to a dedicated
> kernel thread (like the OOM reaper) which has reasonable scheduling priority and
> is allowed to run on any idle CPU will help. But such enhancement is out of scope
for this patch. This patch is intended to avoid sleeping for minutes at
> schedule_timeout_killable(1) with oom_lock held which can be reproduced without
> using SCHED_IDLE priority and/or runnable CPU restrictions.
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-01-23 12:42     ` Michal Hocko
@ 2018-01-24 13:28       ` Tetsuo Handa
  2018-02-13 11:58         ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-01-24 13:28 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, rientjes, hannes, guro, tj, vdavydov.dev, torvalds

Michal Hocko wrote:
> On Tue 23-01-18 21:07:03, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > To be completely honest, I am not in love with this
> > > schedule_timeout_uninterruptible(1). It is an ugly hack. It used to be
> > > much more important in the past when the oom victim test was too
> > > fragile. I strongly suspect that it is not needed these days so rather
> > > than moving the sleep around I would try to remove it altogether.
> > 
> > But this schedule_timeout_uninterruptible(1) serves as a guaranteed
> > sleep for PF_WQ_WORKER threads
> > ( http://lkml.kernel.org/r/20170830064019.mfihbeu3mm5ygcrb@dhcp22.suse.cz ).
> > 
> >     > If we are under memory pressure, __zone_watermark_ok() can return false.
> >     > If __zone_watermark_ok() == false, when is schedule_timeout_*() called explicitly?
> > 
> >     If all zones fail with the watermark check then we should hit the oom
> >     path and sleep there. We do not do so for all cases though.
> > 
> > Thus, you cannot simply remove it.
> 
> Then I would rather make should_reclaim_retry more robust.

I'm OK with that if we can remove schedule_timeout_*() with oom_lock held.

> 
> > > Also, your changelog silently skips over some important details. The
> > > system must be really overloaded when a short sleep can take minutes.
> > 
> > Yes, the system was really overloaded, for I was testing below reproducer
> > on a x86_32 kernel.
> [...]
> > > I would strongly suspect that such an overloaded system doesn't need
> > > a short sleep to hold the oom lock for too long. All you need is to be
> > > preempted. So this patch doesn't really solve any _real_ problem.
> > 
> > Preemption makes the OOM situation much worse. The only way to avoid all OOM
> > lockups caused by lack of CPU resource is to replace mutex_trylock(&oom_lock)
> > in __alloc_pages_may_oom() with mutex_lock(&oom_lock) (or similar) in order to
> > guarantee that all threads waiting for the OOM killer/reaper to make forward
> > progress shall give enough CPU resource.
> 
> And how exactly does that help when the page allocator gets preempted by
> somebody not doing any allocation?

The page allocator is not responsible for wasting CPU resource for something
other than memory allocation request. Wasting CPU resource due to unable to
allow the OOM killer/reaper to make forward progress is the page allocator's
bug.

There are currently ways to artificially choke the OOM killer/reaper (e.g. let
a SCHED_IDLE priority thread which is allowed to run on only one specific CPU
invoke the OOM killer). To mitigate it, offloading the OOM killer to a dedicated
kernel thread (like the OOM reaper) which has reasonable scheduling priority and
is allowed to run on any idle CPU will help. But such enhancement is out of scope
for this patch. This patch is intended to avoid sleeping for minutes at
schedule_timeout_killable(1) with oom_lock held which can be reproduced without
using SCHED_IDLE priority and/or runnable CPU restrictions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-01-23 12:07   ` Tetsuo Handa
@ 2018-01-23 12:42     ` Michal Hocko
  2018-01-24 13:28       ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-01-23 12:42 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: akpm, linux-mm, rientjes, hannes, guro, tj, vdavydov.dev, torvalds

On Tue 23-01-18 21:07:03, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Mon 22-01-18 22:46:22, Tetsuo Handa wrote:
> > > When I was examining a bug which occurs under CPU + memory pressure, I
> > > observed that a thread which called out_of_memory() can sleep for minutes
> > > at schedule_timeout_killable(1) with oom_lock held when many threads are
> > > doing direct reclaim.
> > > 
> > > --------------------
> > > [  163.357628] b.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null), order=0, oom_score_adj=0
> > > [  163.360946] CPU: 0 PID: 554 Comm: b.out Not tainted 4.15.0-rc8+ #216
> > > (...snipped...)
> > > [  163.470193] Out of memory: Kill process 548 (b.out) score 6 or sacrifice child
> > > [  163.471813] Killed process 1191 (b.out) total-vm:2108kB, anon-rss:60kB, file-rss:4kB, shmem-rss:0kB
> > > (...snipped...)
> > > [  248.016033] sysrq: SysRq : Show State
> > > (...snipped...)
> > > [  249.625720] b.out           R  running task        0   554    538 0x00000004
> > > [  249.627778] Call Trace:
> > > [  249.628513]  __schedule+0x142/0x4b2
> > > [  249.629394]  schedule+0x27/0x70
> > > [  249.630114]  schedule_timeout+0xd1/0x160
> > > [  249.631029]  ? oom_kill_process+0x396/0x400
> > > [  249.632039]  ? __next_timer_interrupt+0xc0/0xc0
> > > [  249.633087]  schedule_timeout_killable+0x15/0x20
> > > [  249.634097]  out_of_memory+0xea/0x270
> > > [  249.634901]  __alloc_pages_nodemask+0x715/0x880
> > > [  249.635920]  handle_mm_fault+0x538/0xe40
> > > [  249.636888]  ? __enqueue_entity+0x63/0x70
> > > [  249.637787]  ? set_next_entity+0x4b/0x80
> > > [  249.638687]  __do_page_fault+0x199/0x430
> > > [  249.639535]  ? vmalloc_sync_all+0x180/0x180
> > > [  249.640452]  do_page_fault+0x1a/0x1e
> > > [  249.641283]  common_exception+0x82/0x8a
> > > (...snipped...)
> > > [  462.676366] oom_reaper: reaped process 1191 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> > > --------------------
> > > 
> > > --------------------
> > > [  269.985819] b.out invoked oom-killer: gfp_mask=0x14200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0
> > > [  269.988570] CPU: 0 PID: 9079 Comm: b.out Not tainted 4.15.0-rc8+ #217
> > > (...snipped...)
> > > [  270.073050] Out of memory: Kill process 914 (b.out) score 9 or sacrifice child
> > > [  270.074660] Killed process 2208 (b.out) total-vm:2108kB, anon-rss:64kB, file-rss:4kB, shmem-rss:0kB
> > > [  297.562824] sysrq: SysRq : Show State
> > > (...snipped...)
> > > [  471.716610] b.out           R  running task        0  9079   7400 0x00000000
> > > [  471.718203] Call Trace:
> > > [  471.718784]  __schedule+0x142/0x4b2
> > > [  471.719577]  schedule+0x27/0x70
> > > [  471.720294]  schedule_timeout+0xd1/0x160
> > > [  471.721207]  ? oom_kill_process+0x396/0x400
> > > [  471.722151]  ? __next_timer_interrupt+0xc0/0xc0
> > > [  471.723215]  schedule_timeout_killable+0x15/0x20
> > > [  471.724350]  out_of_memory+0xea/0x270
> > > [  471.725201]  __alloc_pages_nodemask+0x715/0x880
> > > [  471.726238]  ? radix_tree_lookup_slot+0x1f/0x50
> > > [  471.727253]  filemap_fault+0x346/0x510
> > > [  471.728120]  ? filemap_map_pages+0x245/0x2d0
> > > [  471.729105]  ? unlock_page+0x30/0x30
> > > [  471.729987]  __xfs_filemap_fault.isra.18+0x2d/0xb0
> > > [  471.731488]  ? unlock_page+0x30/0x30
> > > [  471.732364]  xfs_filemap_fault+0xa/0x10
> > > [  471.733260]  __do_fault+0x11/0x30
> > > [  471.734033]  handle_mm_fault+0x8e8/0xe40
> > > [  471.735200]  __do_page_fault+0x199/0x430
> > > [  471.736163]  ? common_exception+0x82/0x8a
> > > [  471.737102]  ? vmalloc_sync_all+0x180/0x180
> > > [  471.738061]  do_page_fault+0x1a/0x1e
> > > [  471.738881]  common_exception+0x82/0x8a
> > > (...snipped...)
> > > [  566.969400] oom_reaper: reaped process 2208 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> > > --------------------
> > > 
> > > Allowing the OOM reaper to start reclaiming memory without waiting for
> > > the oom_lock is not sufficient if the OOM reaper did not reclaim enough
> > > memory. We need to make sure that the thread which called out_of_memory()
> > > will release oom_lock shortly. Thus, this patch brings the short sleep
> > > to outside of the OOM killer.
> > 
> > And why does sleeping outside of the lock make any sense? The whole
> > point of the sleep is to give the victim some time to exit and if we
> > sleep outside of the lock then contending allocating paths hit the oom
> > path early.
> 
> Why does subsequent threads call out_of_memory() again without giving
> the OOM victim some time to exit matter? MMF_OOM_SKIP test will prevent
> subsequent out_of_memory() calls from selecting next OOM victims.

Yes, it is possible that the _current_ code doesn't need this as I've
written below.

> > To be completely honest, I am not in love with this
> > schedule_timeout_uninterruptible(1). It is an ugly hack. It used to be
> > much more important in the past when the oom victim test was too
> > fragile. I strongly suspect that it is not needed these days so rather
> > than moving the sleep around I would try to remove it altogether.
> 
> But this schedule_timeout_uninterruptible(1) serves as a guaranteed
> sleep for PF_WQ_WORKER threads
> ( http://lkml.kernel.org/r/20170830064019.mfihbeu3mm5ygcrb@dhcp22.suse.cz ).
> 
>     > If we are under memory pressure, __zone_watermark_ok() can return false.
>     > If __zone_watermark_ok() == false, when is schedule_timeout_*() called explicitly?
> 
>     If all zones fail with the watermark check then we should hit the oom
>     path and sleep there. We do not do so for all cases though.
> 
> Thus, you cannot simply remove it.

Then I would rather make should_reclaim_retry more robust.

> > Also, your changelog silently skips over some important details. The
> > system must be really overloaded when a short sleep can take minutes.
> 
> Yes, the system was really overloaded, for I was testing below reproducer
> on a x86_32 kernel.
[...]
> > I would strongly suspect that such an overloaded system doesn't need
> > a short sleep to hold the oom lock for too long. All you need is to be
> > preempted. So this patch doesn't really solve any _real_ problem.
> 
> Preemption makes the OOM situation much worse. The only way to avoid all OOM
> lockups caused by lack of CPU resource is to replace mutex_trylock(&oom_lock)
> in __alloc_pages_may_oom() with mutex_lock(&oom_lock) (or similar) in order to
> guarantee that all threads waiting for the OOM killer/reaper to make forward
> progress shall give enough CPU resource.

And how exactly does that help when the page allocator gets preempted by
somebody not doing any allocation?
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-01-23  8:38 ` Michal Hocko
@ 2018-01-23 12:07   ` Tetsuo Handa
  2018-01-23 12:42     ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-01-23 12:07 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, rientjes, hannes, guro, tj, vdavydov.dev, torvalds

Michal Hocko wrote:
> On Mon 22-01-18 22:46:22, Tetsuo Handa wrote:
> > When I was examining a bug which occurs under CPU + memory pressure, I
> > observed that a thread which called out_of_memory() can sleep for minutes
> > at schedule_timeout_killable(1) with oom_lock held when many threads are
> > doing direct reclaim.
> > 
> > --------------------
> > [  163.357628] b.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null), order=0, oom_score_adj=0
> > [  163.360946] CPU: 0 PID: 554 Comm: b.out Not tainted 4.15.0-rc8+ #216
> > (...snipped...)
> > [  163.470193] Out of memory: Kill process 548 (b.out) score 6 or sacrifice child
> > [  163.471813] Killed process 1191 (b.out) total-vm:2108kB, anon-rss:60kB, file-rss:4kB, shmem-rss:0kB
> > (...snipped...)
> > [  248.016033] sysrq: SysRq : Show State
> > (...snipped...)
> > [  249.625720] b.out           R  running task        0   554    538 0x00000004
> > [  249.627778] Call Trace:
> > [  249.628513]  __schedule+0x142/0x4b2
> > [  249.629394]  schedule+0x27/0x70
> > [  249.630114]  schedule_timeout+0xd1/0x160
> > [  249.631029]  ? oom_kill_process+0x396/0x400
> > [  249.632039]  ? __next_timer_interrupt+0xc0/0xc0
> > [  249.633087]  schedule_timeout_killable+0x15/0x20
> > [  249.634097]  out_of_memory+0xea/0x270
> > [  249.634901]  __alloc_pages_nodemask+0x715/0x880
> > [  249.635920]  handle_mm_fault+0x538/0xe40
> > [  249.636888]  ? __enqueue_entity+0x63/0x70
> > [  249.637787]  ? set_next_entity+0x4b/0x80
> > [  249.638687]  __do_page_fault+0x199/0x430
> > [  249.639535]  ? vmalloc_sync_all+0x180/0x180
> > [  249.640452]  do_page_fault+0x1a/0x1e
> > [  249.641283]  common_exception+0x82/0x8a
> > (...snipped...)
> > [  462.676366] oom_reaper: reaped process 1191 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> > --------------------
> > 
> > --------------------
> > [  269.985819] b.out invoked oom-killer: gfp_mask=0x14200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0
> > [  269.988570] CPU: 0 PID: 9079 Comm: b.out Not tainted 4.15.0-rc8+ #217
> > (...snipped...)
> > [  270.073050] Out of memory: Kill process 914 (b.out) score 9 or sacrifice child
> > [  270.074660] Killed process 2208 (b.out) total-vm:2108kB, anon-rss:64kB, file-rss:4kB, shmem-rss:0kB
> > [  297.562824] sysrq: SysRq : Show State
> > (...snipped...)
> > [  471.716610] b.out           R  running task        0  9079   7400 0x00000000
> > [  471.718203] Call Trace:
> > [  471.718784]  __schedule+0x142/0x4b2
> > [  471.719577]  schedule+0x27/0x70
> > [  471.720294]  schedule_timeout+0xd1/0x160
> > [  471.721207]  ? oom_kill_process+0x396/0x400
> > [  471.722151]  ? __next_timer_interrupt+0xc0/0xc0
> > [  471.723215]  schedule_timeout_killable+0x15/0x20
> > [  471.724350]  out_of_memory+0xea/0x270
> > [  471.725201]  __alloc_pages_nodemask+0x715/0x880
> > [  471.726238]  ? radix_tree_lookup_slot+0x1f/0x50
> > [  471.727253]  filemap_fault+0x346/0x510
> > [  471.728120]  ? filemap_map_pages+0x245/0x2d0
> > [  471.729105]  ? unlock_page+0x30/0x30
> > [  471.729987]  __xfs_filemap_fault.isra.18+0x2d/0xb0
> > [  471.731488]  ? unlock_page+0x30/0x30
> > [  471.732364]  xfs_filemap_fault+0xa/0x10
> > [  471.733260]  __do_fault+0x11/0x30
> > [  471.734033]  handle_mm_fault+0x8e8/0xe40
> > [  471.735200]  __do_page_fault+0x199/0x430
> > [  471.736163]  ? common_exception+0x82/0x8a
> > [  471.737102]  ? vmalloc_sync_all+0x180/0x180
> > [  471.738061]  do_page_fault+0x1a/0x1e
> > [  471.738881]  common_exception+0x82/0x8a
> > (...snipped...)
> > [  566.969400] oom_reaper: reaped process 2208 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> > --------------------
> > 
> > Allowing the OOM reaper to start reclaiming memory without waiting for
> > the oom_lock is not sufficient if the OOM reaper did not reclaim enough
> > memory. We need to make sure that the thread which called out_of_memory()
> > will release oom_lock shortly. Thus, this patch brings the short sleep
> > to outside of the OOM killer.
> 
> And why does sleeping outside of the lock make any sense? The whole
> point of the sleep is to give the victim some time to exit and if we
> sleep outside of the lock then contending allocating paths hit the oom
> path early.

Why does it matter that subsequent threads call out_of_memory() again
without giving the OOM victim some time to exit? The MMF_OOM_SKIP test
will prevent subsequent out_of_memory() calls from selecting next OOM
victims.

> 
> To be completely honest, I am not in love with this
> schedule_timeout_uninterruptible(1). It is an ugly hack. It used to be
> much more important in the past when the oom victim test was too
> fragile. I strongly suspect that it is not needed these days so rather
> than moving the sleep around I would try to remove it altogether.

But this schedule_timeout_uninterruptible(1) serves as a guaranteed
sleep for PF_WQ_WORKER threads
( http://lkml.kernel.org/r/20170830064019.mfihbeu3mm5ygcrb@dhcp22.suse.cz ).

    > If we are under memory pressure, __zone_watermark_ok() can return false.
    > If __zone_watermark_ok() == false, when is schedule_timeout_*() called explicitly?

    If all zones fail with the watermark check then we should hit the oom
    path and sleep there. We do not do so for all cases though.

Thus, you cannot simply remove it.

> 
> Also, your changelog silently skips over some important details. The
> system must be really overloaded when a short sleep can take minutes.

Yes, the system was really overloaded, for I was testing below reproducer
on a x86_32 kernel.

----------------------------------------
#define _FILE_OFFSET_BITS 64
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>

/*
 * Reproducer worker: generate combined CPU + memory + I/O pressure.
 *
 * Forks 24 children that each append zero-filled blocks to "file" for
 * as long as writes succeed, then the parent grows an anonymous buffer
 * via realloc() until allocation fails, and finally touches one byte
 * per 4096-byte page so the whole allocation is actually faulted in,
 * driving the system into the OOM path.
 */
static void run(void)
{
	unsigned long long size;
	char *buf = NULL;
	unsigned long long i;
	/* Spawn 24 writer children to keep I/O and page-cache reclaim busy. */
	for (i = 0; i < 24; i++) {
		if (fork() == 0) {
			static char buf[4096];
			/* NOTE(review): fd is deliberately unchecked; if open()
			 * fails, write() returns -1 and the loop exits at once. */
			const int fd = open("file", O_CREAT | O_WRONLY | O_APPEND, 0600);
			while (write(fd, buf, sizeof(buf)) == sizeof(buf));
			close(fd);
			_exit(0);
		}
	}
	/* Grow the buffer in 1MB steps until realloc() fails (512GB cap). */
	for (size = 1048576; size < 512ULL * (1 << 30); size += 1048576) {
		char *cp = realloc(buf, size);
		if (!cp) {
			size -= 1048576; /* back off to the last successful size */
			break;
		}
		buf = cp;
	}
	/* Touch one byte per page so every page is faulted in. */
	for (i = 0; i < size; i += 4096)
		buf[i] = 0;
	_exit(0);
} 

/*
 * Entry point: with no extra argument, loop forever forking children
 * that re-exec this binary with a dummy argument; each re-executed
 * instance (argc != 1) runs the pressure generator in run().  The
 * endless fork+exec keeps replacing OOM-killed hogs so the memory
 * pressure never lets up.
 */
int main(int argc, char *argv[])
{
	if (argc != 1)
		run(); /* re-executed instance: apply the pressure */
	else
		while (1)
			if (fork() == 0)
				execlp(argv[0], argv[0], "", NULL);
	return 0;
}
----------------------------------------

> I would strongly suspect that such an overloaded system doesn't need
> a short sleep to hold the oom lock for too long. All you need is to be
> preempted. So this patch doesn't really solve any _real_ problem.

Preemption makes the OOM situation much worse. The only way to avoid all OOM
lockups caused by lack of CPU resource is to replace mutex_trylock(&oom_lock)
in __alloc_pages_may_oom() with mutex_lock(&oom_lock) (or similar) in order to
guarantee that all threads waiting for the OOM killer/reaper to make forward
progress shall give enough CPU resource.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
  2018-01-22 13:46 Tetsuo Handa
@ 2018-01-23  8:38 ` Michal Hocko
  2018-01-23 12:07   ` Tetsuo Handa
  0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2018-01-23  8:38 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: akpm, linux-mm, David Rientjes, Johannes Weiner, Roman Gushchin,
	Tejun Heo, Vladimir Davydov

On Mon 22-01-18 22:46:22, Tetsuo Handa wrote:
> When I was examining a bug which occurs under CPU + memory pressure, I
> observed that a thread which called out_of_memory() can sleep for minutes
> at schedule_timeout_killable(1) with oom_lock held when many threads are
> doing direct reclaim.
> 
> --------------------
> [  163.357628] b.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null), order=0, oom_score_adj=0
> [  163.360946] CPU: 0 PID: 554 Comm: b.out Not tainted 4.15.0-rc8+ #216
> (...snipped...)
> [  163.470193] Out of memory: Kill process 548 (b.out) score 6 or sacrifice child
> [  163.471813] Killed process 1191 (b.out) total-vm:2108kB, anon-rss:60kB, file-rss:4kB, shmem-rss:0kB
> (...snipped...)
> [  248.016033] sysrq: SysRq : Show State
> (...snipped...)
> [  249.625720] b.out           R  running task        0   554    538 0x00000004
> [  249.627778] Call Trace:
> [  249.628513]  __schedule+0x142/0x4b2
> [  249.629394]  schedule+0x27/0x70
> [  249.630114]  schedule_timeout+0xd1/0x160
> [  249.631029]  ? oom_kill_process+0x396/0x400
> [  249.632039]  ? __next_timer_interrupt+0xc0/0xc0
> [  249.633087]  schedule_timeout_killable+0x15/0x20
> [  249.634097]  out_of_memory+0xea/0x270
> [  249.634901]  __alloc_pages_nodemask+0x715/0x880
> [  249.635920]  handle_mm_fault+0x538/0xe40
> [  249.636888]  ? __enqueue_entity+0x63/0x70
> [  249.637787]  ? set_next_entity+0x4b/0x80
> [  249.638687]  __do_page_fault+0x199/0x430
> [  249.639535]  ? vmalloc_sync_all+0x180/0x180
> [  249.640452]  do_page_fault+0x1a/0x1e
> [  249.641283]  common_exception+0x82/0x8a
> (...snipped...)
> [  462.676366] oom_reaper: reaped process 1191 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> --------------------
> 
> --------------------
> [  269.985819] b.out invoked oom-killer: gfp_mask=0x14200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0
> [  269.988570] CPU: 0 PID: 9079 Comm: b.out Not tainted 4.15.0-rc8+ #217
> (...snipped...)
> [  270.073050] Out of memory: Kill process 914 (b.out) score 9 or sacrifice child
> [  270.074660] Killed process 2208 (b.out) total-vm:2108kB, anon-rss:64kB, file-rss:4kB, shmem-rss:0kB
> [  297.562824] sysrq: SysRq : Show State
> (...snipped...)
> [  471.716610] b.out           R  running task        0  9079   7400 0x00000000
> [  471.718203] Call Trace:
> [  471.718784]  __schedule+0x142/0x4b2
> [  471.719577]  schedule+0x27/0x70
> [  471.720294]  schedule_timeout+0xd1/0x160
> [  471.721207]  ? oom_kill_process+0x396/0x400
> [  471.722151]  ? __next_timer_interrupt+0xc0/0xc0
> [  471.723215]  schedule_timeout_killable+0x15/0x20
> [  471.724350]  out_of_memory+0xea/0x270
> [  471.725201]  __alloc_pages_nodemask+0x715/0x880
> [  471.726238]  ? radix_tree_lookup_slot+0x1f/0x50
> [  471.727253]  filemap_fault+0x346/0x510
> [  471.728120]  ? filemap_map_pages+0x245/0x2d0
> [  471.729105]  ? unlock_page+0x30/0x30
> [  471.729987]  __xfs_filemap_fault.isra.18+0x2d/0xb0
> [  471.731488]  ? unlock_page+0x30/0x30
> [  471.732364]  xfs_filemap_fault+0xa/0x10
> [  471.733260]  __do_fault+0x11/0x30
> [  471.734033]  handle_mm_fault+0x8e8/0xe40
> [  471.735200]  __do_page_fault+0x199/0x430
> [  471.736163]  ? common_exception+0x82/0x8a
> [  471.737102]  ? vmalloc_sync_all+0x180/0x180
> [  471.738061]  do_page_fault+0x1a/0x1e
> [  471.738881]  common_exception+0x82/0x8a
> (...snipped...)
> [  566.969400] oom_reaper: reaped process 2208 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> --------------------
> 
> Allowing the OOM reaper to start reclaiming memory without waiting for
> the oom_lock is not sufficient if the OOM reaper did not reclaim enough
> memory. We need to make sure that the thread which called out_of_memory()
> will release oom_lock shortly. Thus, this patch brings the short sleep
> to outside of the OOM killer.

And why does sleeping outside of the lock make any sense? The whole
point of the sleep is to give the victim some time to exit and if we
sleep outside of the lock then contending allocating paths hit the oom
path early.

To be completely honest, I am not in love with this
schedule_timeout_uninterruptible(1). It is an ugly hack. It used to be
much more important in the past when the oom victim test was too
fragile. I strongly suspect that it is not needed these days so rather
than moving the sleep around I would try to remove it altogether.

Also, your changelog silently skips over some important details. The
system must be really overloaded when a short sleep can take minutes.
I would strongly suspect that such an overloaded system doesn't need
a short sleep to hold the oom lock for too long. All you need is to be
preempted. So this patch doesn't really solve any _real_ problem.

[...]
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held.
@ 2018-01-22 13:46 Tetsuo Handa
  2018-01-23  8:38 ` Michal Hocko
  0 siblings, 1 reply; 51+ messages in thread
From: Tetsuo Handa @ 2018-01-22 13:46 UTC (permalink / raw)
  To: akpm
  Cc: linux-mm, Tetsuo Handa, David Rientjes, Johannes Weiner,
	Michal Hocko, Roman Gushchin, Tejun Heo, Vladimir Davydov

When I was examining a bug which occurs under CPU + memory pressure, I
observed that a thread which called out_of_memory() can sleep for minutes
at schedule_timeout_killable(1) with oom_lock held when many threads are
doing direct reclaim.

--------------------
[  163.357628] b.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null), order=0, oom_score_adj=0
[  163.360946] CPU: 0 PID: 554 Comm: b.out Not tainted 4.15.0-rc8+ #216
(...snipped...)
[  163.470193] Out of memory: Kill process 548 (b.out) score 6 or sacrifice child
[  163.471813] Killed process 1191 (b.out) total-vm:2108kB, anon-rss:60kB, file-rss:4kB, shmem-rss:0kB
(...snipped...)
[  248.016033] sysrq: SysRq : Show State
(...snipped...)
[  249.625720] b.out           R  running task        0   554    538 0x00000004
[  249.627778] Call Trace:
[  249.628513]  __schedule+0x142/0x4b2
[  249.629394]  schedule+0x27/0x70
[  249.630114]  schedule_timeout+0xd1/0x160
[  249.631029]  ? oom_kill_process+0x396/0x400
[  249.632039]  ? __next_timer_interrupt+0xc0/0xc0
[  249.633087]  schedule_timeout_killable+0x15/0x20
[  249.634097]  out_of_memory+0xea/0x270
[  249.634901]  __alloc_pages_nodemask+0x715/0x880
[  249.635920]  handle_mm_fault+0x538/0xe40
[  249.636888]  ? __enqueue_entity+0x63/0x70
[  249.637787]  ? set_next_entity+0x4b/0x80
[  249.638687]  __do_page_fault+0x199/0x430
[  249.639535]  ? vmalloc_sync_all+0x180/0x180
[  249.640452]  do_page_fault+0x1a/0x1e
[  249.641283]  common_exception+0x82/0x8a
(...snipped...)
[  462.676366] oom_reaper: reaped process 1191 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
--------------------

--------------------
[  269.985819] b.out invoked oom-killer: gfp_mask=0x14200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0
[  269.988570] CPU: 0 PID: 9079 Comm: b.out Not tainted 4.15.0-rc8+ #217
(...snipped...)
[  270.073050] Out of memory: Kill process 914 (b.out) score 9 or sacrifice child
[  270.074660] Killed process 2208 (b.out) total-vm:2108kB, anon-rss:64kB, file-rss:4kB, shmem-rss:0kB
[  297.562824] sysrq: SysRq : Show State
(...snipped...)
[  471.716610] b.out           R  running task        0  9079   7400 0x00000000
[  471.718203] Call Trace:
[  471.718784]  __schedule+0x142/0x4b2
[  471.719577]  schedule+0x27/0x70
[  471.720294]  schedule_timeout+0xd1/0x160
[  471.721207]  ? oom_kill_process+0x396/0x400
[  471.722151]  ? __next_timer_interrupt+0xc0/0xc0
[  471.723215]  schedule_timeout_killable+0x15/0x20
[  471.724350]  out_of_memory+0xea/0x270
[  471.725201]  __alloc_pages_nodemask+0x715/0x880
[  471.726238]  ? radix_tree_lookup_slot+0x1f/0x50
[  471.727253]  filemap_fault+0x346/0x510
[  471.728120]  ? filemap_map_pages+0x245/0x2d0
[  471.729105]  ? unlock_page+0x30/0x30
[  471.729987]  __xfs_filemap_fault.isra.18+0x2d/0xb0
[  471.731488]  ? unlock_page+0x30/0x30
[  471.732364]  xfs_filemap_fault+0xa/0x10
[  471.733260]  __do_fault+0x11/0x30
[  471.734033]  handle_mm_fault+0x8e8/0xe40
[  471.735200]  __do_page_fault+0x199/0x430
[  471.736163]  ? common_exception+0x82/0x8a
[  471.737102]  ? vmalloc_sync_all+0x180/0x180
[  471.738061]  do_page_fault+0x1a/0x1e
[  471.738881]  common_exception+0x82/0x8a
(...snipped...)
[  566.969400] oom_reaper: reaped process 2208 (b.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
--------------------

Allowing the OOM reaper to start reclaiming memory without waiting for
the oom_lock is not sufficient if the OOM reaper did not reclaim enough
memory. We need to make sure that the thread which called out_of_memory()
will release oom_lock shortly. Thus, this patch brings the short sleep
to outside of the OOM killer.

For __alloc_pages_may_oom() case, this patch uses uninterruptible sleep
than killable sleep because fatal_signal_pending() threads won't be able
to use memory reserves unless tsk_is_oom_victim() becomes true.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 mm/oom_kill.c   | 18 +++---------------
 mm/page_alloc.c |  3 ++-
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8219001..47212442 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1078,7 +1078,6 @@ bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-	bool delay = false; /* if set, delay next allocation attempt */
 
 	if (oom_killer_disabled)
 		return false;
@@ -1128,10 +1127,8 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
-	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
-		delay = true;
+	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc))
 		goto out;
-	}
 
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
@@ -1139,20 +1136,10 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
+	if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
-		delay = true;
-	}
-
 out:
-	/*
-	 * Give the killed process a good chance to exit before trying
-	 * to allocate memory again.
-	 */
-	if (delay)
-		schedule_timeout_killable(1);
-
 	return !!oc->chosen_task;
 }
 
@@ -1178,4 +1165,5 @@ void pagefault_out_of_memory(void)
 		return;
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
+	schedule_timeout_killable(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4093728..e93bff1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3355,7 +3355,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	 */
 	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
-		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
@@ -4116,6 +4115,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	/* Retry as long as the OOM killer is making progress */
 	if (did_some_progress) {
 		no_progress_loops = 0;
+		if (!tsk_is_oom_victim(current))
+			schedule_timeout_uninterruptible(1);
 		goto retry;
 	}
 
-- 
1.8.3.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2018-06-06 18:45 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-12 14:18 [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held Tetsuo Handa
2018-05-15  9:16 ` Michal Hocko
2018-05-18 10:14   ` Tetsuo Handa
2018-05-18 12:20     ` Michal Hocko
2018-05-20 15:56       ` Tetsuo Handa
2018-05-22  6:18         ` Michal Hocko
2018-05-23 10:24           ` Tetsuo Handa
2018-05-23 11:57             ` Michal Hocko
2018-05-23 13:45               ` Tetsuo Handa
2018-05-23 14:56                 ` Michal Hocko
2018-05-24 10:51                   ` Tetsuo Handa
2018-05-24 11:50                     ` Michal Hocko
2018-05-25  1:17                       ` Tetsuo Handa
2018-05-25  8:31                         ` Michal Hocko
2018-05-25 10:57                           ` Tetsuo Handa
2018-05-25 11:42                             ` Michal Hocko
2018-05-25 11:46                               ` Tetsuo Handa
2018-05-28 12:43                                 ` Michal Hocko
2018-05-28 20:57                                   ` Tetsuo Handa
2018-05-29  7:17                                     ` Michal Hocko
2018-05-29 23:07                                       ` Andrew Morton
2018-05-31 10:10                                         ` Tetsuo Handa
2018-05-31 10:44                                           ` Michal Hocko
2018-05-31 15:23                                             ` Tetsuo Handa
2018-05-31 18:47                                               ` Michal Hocko
2018-06-01  1:21                                                 ` Tetsuo Handa
2018-06-01  8:04                                                   ` Michal Hocko
2018-06-01 15:28                                         ` Michal Hocko
2018-06-01 21:11                                           ` Andrew Morton
2018-06-04  7:04                                             ` Michal Hocko
2018-06-04 10:41                                               ` Tetsuo Handa
2018-06-04 11:22                                                 ` Michal Hocko
2018-06-04 11:30                                                   ` Tetsuo Handa
2018-06-06  9:02                                                 ` David Rientjes
2018-06-06 13:37                                                   ` Tetsuo Handa
2018-06-06 18:44                                                     ` David Rientjes
2018-05-29  7:17             ` Michal Hocko
2018-05-29  8:16               ` Michal Hocko
2018-05-29 14:33                 ` Tetsuo Handa
2018-05-29 17:18                   ` Michal Hocko
2018-05-29 17:28                     ` Michal Hocko
2018-05-31 16:31                 ` [PATCH] mm, memcg, oom: fix pre-mature allocation failures kbuild test robot
  -- strict thread matches above, loose matches on Subject: below --
2018-03-22 10:51 [PATCH] mm,oom: Don't call schedule_timeout_killable() with oom_lock held Tetsuo Handa
2018-03-22 11:45 ` Michal Hocko
2018-03-22 13:16   ` Tetsuo Handa
2018-01-22 13:46 Tetsuo Handa
2018-01-23  8:38 ` Michal Hocko
2018-01-23 12:07   ` Tetsuo Handa
2018-01-23 12:42     ` Michal Hocko
2018-01-24 13:28       ` Tetsuo Handa
2018-02-13 11:58         ` Tetsuo Handa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.