From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
To: mgorman@suse.de, mhocko@kernel.org, viro@ZenIV.linux.org.uk
Cc: linux-mm@kvack.org, hannes@cmpxchg.org, linux-kernel@vger.kernel.org
Subject: Re: [RFC PATCH 1/2] mm, vmscan: account the number of isolated pages per zone
Date: Sat, 21 Jan 2017 16:42:42 +0900
Message-ID: <201701211642.JBC39590.SFtVJHMFOLFOQO@I-love.SAKURA.ne.jp>
In-Reply-To: <201701202227.GCC13598.OHJMSQFVOtFOLF@I-love.SAKURA.ne.jp>

Tetsuo Handa wrote:
> And I think that there is a different problem if I tune the reproducer
> as shown below (i.e., increase the buffer size passed to write()/fsync()
> from 4096).
> 
> ----------
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> 
> int main(int argc, char *argv[])
> {
> 	static char buffer[10485760] = { }; /* or 1048576 */
> 	char *buf = NULL;
> 	unsigned long size;
> 	unsigned long i;
> 	for (i = 0; i < 1024; i++) {
> 		if (fork() == 0) {
> 			/* Make this child a preferred OOM-kill victim. */
> 			int fd = open("/proc/self/oom_score_adj", O_WRONLY);
> 			write(fd, "1000", 4);
> 			close(fd);
> 			sleep(1);
> 			/* Keep dirtying the page cache with large writes. */
> 			snprintf(buffer, sizeof(buffer), "/tmp/file.%u", getpid());
> 			fd = open(buffer, O_WRONLY | O_CREAT | O_APPEND, 0600);
> 			while (write(fd, buffer, sizeof(buffer)) == sizeof(buffer))
> 				fsync(fd);
> 			_exit(0);
> 		}
> 	}
> 	/* Reserve as much virtual memory as overcommit will grant. */
> 	for (size = 1048576; size < 512UL * (1 << 30); size <<= 1) {
> 		char *cp = realloc(buf, size);
> 		if (!cp) {
> 			size >>= 1;
> 			break;
> 		}
> 		buf = cp;
> 	}
> 	sleep(2);
> 	/* Will cause OOM due to overcommit */
> 	for (i = 0; i < size; i += 4096)
> 		buf[i] = 0;
> 	pause();
> 	return 0;
> }
> ----------
> 
> The above reproducer sometimes kills all OOM-killable processes, and the
> system finally panics. I guess that somebody is abusing TIF_MEMDIE for
> needless allocations, to the level where GFP_ATOMIC allocations start
> failing.
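
For reference, an OOM victim's access to the reserves comes from the
no-watermark path of the page allocator. A simplified sketch (not the
exact code; see gfp_pfmemalloc_allowed() and __alloc_pages_slowpath()
in this tree for the full set of conditions):

----------------------------------------
/* Sketch: how a TIF_MEMDIE task is allowed to dip into memory reserves. */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
	if (gfp_mask & __GFP_NOMEMALLOC)
		return false;			/* caller opted out of reserves */
	if (gfp_mask & __GFP_MEMALLOC)
		return true;			/* caller explicitly opted in */
	if (!in_interrupt() &&
	    ((current->flags & PF_MEMALLOC) ||
	     test_thread_flag(TIF_MEMDIE)))	/* the OOM victim case */
		return true;
	return false;
}

/* ...which __alloc_pages_slowpath() turns into: */
	if (gfp_pfmemalloc_allowed(gfp_mask))
		alloc_flags = ALLOC_NO_WATERMARKS;	/* ignore all watermarks */
----------------------------------------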

I tracked down who is abusing TIF_MEMDIE using the patch below, which
reports (with a stack trace) every successful allocation that was granted
ALLOC_NO_WATERMARKS while the allocating task had TIF_MEMDIE set.

----------------------------------------
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea088e1..d9ac53d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3038,7 +3038,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+	if (1 || (gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
 	    debug_guardpage_minorder() > 0)
 		return;
 
@@ -3573,6 +3573,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int no_progress_loops = 0;
 	unsigned long alloc_start = jiffies;
 	unsigned int stall_timeout = 10 * HZ;
+	bool victim = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3656,8 +3657,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, ac);
 
-	if (gfp_pfmemalloc_allowed(gfp_mask))
+	if (gfp_pfmemalloc_allowed(gfp_mask)) {
 		alloc_flags = ALLOC_NO_WATERMARKS;
+		victim = test_thread_flag(TIF_MEMDIE);
+	}
 
 	/*
 	 * Reset the zonelist iterators if memory policies can be ignored.
@@ -3790,6 +3793,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	warn_alloc(gfp_mask, ac->nodemask,
 			"page allocation failure: order:%u", order);
 got_pg:
+	if (page && victim) {
+		pr_warn("%s(%u): TIF_MEMDIE allocation: order=%d mode=%#x(%pGg)\n",
+			current->comm, current->pid, order, gfp_mask, &gfp_mask);
+		dump_stack();
+	}
 	return page;
 }
 
----------------------------------------

And I got a flood of traces like the one shown below. It seems that, even
after being OOM-killed, a writer keeps consuming memory reserves until the
whole size passed to the write() request has been stored into the page
cache; with the 10MB buffer above, that is up to 2560 order-0 page-cache
pages allocated per write() call.

Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20170121.txt.xz .
----------------------------------------
[  202.306077] a.out(9789): TIF_MEMDIE allocation: order=0 mode=0x1c2004a(GFP_NOFS|__GFP_HIGHMEM|__GFP_HARDWALL|__GFP_MOVABLE|__GFP_WRITE)
[  202.309832] CPU: 0 PID: 9789 Comm: a.out Not tainted 4.10.0-rc4-next-20170120+ #492
[  202.312323] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[  202.315429] Call Trace:
[  202.316902]  dump_stack+0x85/0xc9
[  202.318810]  __alloc_pages_slowpath+0xa99/0xd7c
[  202.320697]  ? node_dirty_ok+0xef/0x130
[  202.322454]  __alloc_pages_nodemask+0x436/0x4d0
[  202.324506]  alloc_pages_current+0x97/0x1b0
[  202.326397]  __page_cache_alloc+0x15d/0x1a0          mm/filemap.c:728
[  202.328209]  pagecache_get_page+0x5a/0x2b0           mm/filemap.c:1331
[  202.329989]  grab_cache_page_write_begin+0x23/0x40   mm/filemap.c:2773
[  202.331905]  iomap_write_begin+0x50/0xd0             fs/iomap.c:118
[  202.333641]  iomap_write_actor+0xb5/0x1a0            fs/iomap.c:190
[  202.335377]  ? iomap_write_end+0x80/0x80             fs/iomap.c:150
[  202.337090]  iomap_apply+0xb3/0x130                  fs/iomap.c:79
[  202.338721]  iomap_file_buffered_write+0x68/0xa0     fs/iomap.c:243
[  202.340613]  ? iomap_write_end+0x80/0x80
[  202.342471]  xfs_file_buffered_aio_write+0x132/0x390 [xfs]
[  202.344501]  ? remove_wait_queue+0x59/0x60
[  202.346261]  xfs_file_write_iter+0x90/0x130 [xfs]
[  202.348082]  __vfs_write+0xe5/0x140
[  202.349743]  vfs_write+0xc7/0x1f0
[  202.351214]  ? syscall_trace_enter+0x1d0/0x380
[  202.353155]  SyS_write+0x58/0xc0
[  202.354628]  do_syscall_64+0x6c/0x200
[  202.356100]  entry_SYSCALL64_slow_path+0x25/0x25
----------------------------------------

Do we need to allow access to memory reserves for this allocation?
Or should the caller check for SIGKILL rather than keep iterating the loop?
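
If I remember correctly, generic_perform_write() already bails out of its
copy loop once a fatal signal is pending. A sketch of what the second
option might look like for the iomap path (illustrative only, not tested;
the exact loop body in iomap_write_actor() may differ):

----------------------------------------
 	do {
 		...
+		/* Untested sketch: stop filling the page cache once killed. */
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
 		status = iomap_write_begin(inode, pos, bytes, flags,
 					   &page, iomap);
 		...
 	} while (iov_iter_count(i) && length);
----------------------------------------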

Thread overview:
2017-01-18 13:44 [RFC PATCH 0/2] fix unbounded too_many_isolated Michal Hocko
2017-01-18 13:44 ` [RFC PATCH 1/2] mm, vmscan: account the number of isolated pages per zone Michal Hocko
2017-01-18 14:46   ` Mel Gorman
2017-01-18 15:15     ` Michal Hocko
2017-01-18 15:54       ` Mel Gorman
2017-01-18 16:17         ` Michal Hocko
2017-01-18 17:00           ` Mel Gorman
2017-01-18 17:29             ` Michal Hocko
2017-01-19 10:07               ` Mel Gorman
2017-01-19 11:23                 ` Michal Hocko
2017-01-19 13:11                   ` Mel Gorman
2017-01-20 13:27                     ` Tetsuo Handa
2017-01-21  7:42                       ` Tetsuo Handa [this message]
2017-01-25 10:15                         ` Michal Hocko
2017-01-25 10:19                           ` Christoph Hellwig
2017-01-25 10:46                             ` Michal Hocko
2017-01-25 11:09                               ` Tetsuo Handa
2017-01-25 13:00                                 ` Michal Hocko
2017-01-27 14:49                                   ` Michal Hocko
2017-01-28 15:27                                     ` Tetsuo Handa
2017-01-30  8:55                                       ` Michal Hocko
2017-02-02 10:14                                         ` Michal Hocko
2017-02-03 10:57                                           ` Tetsuo Handa
2017-02-03 14:41                                             ` Michal Hocko
2017-02-03 14:50                                             ` Michal Hocko
2017-02-03 17:24                                               ` Brian Foster
2017-02-06  6:29                                                 ` Tetsuo Handa
2017-02-06 14:35                                                   ` Brian Foster
2017-02-06 14:42                                                     ` Michal Hocko
2017-02-06 15:47                                                       ` Brian Foster
2017-02-07 10:30                                                     ` Tetsuo Handa
2017-02-07 16:54                                                       ` Brian Foster
2017-02-03 14:55                                             ` Michal Hocko
2017-02-05 10:43                                               ` Tetsuo Handa
2017-02-06 10:34                                                 ` Michal Hocko
2017-02-06 10:39                                                 ` Michal Hocko
2017-02-07 21:12                                                   ` Michal Hocko
2017-02-08  9:24                                                     ` Peter Zijlstra
2017-02-21  9:40                                             ` Michal Hocko
2017-02-21 14:35                                               ` Tetsuo Handa
2017-02-21 15:53                                                 ` Michal Hocko
2017-02-22  2:02                                                   ` Tetsuo Handa
2017-02-22  7:54                                                     ` Michal Hocko
2017-02-26  6:30                                                       ` Tetsuo Handa
2017-01-31 11:58                                   ` Michal Hocko
2017-01-31 12:51                                     ` Christoph Hellwig
2017-01-31 13:21                                       ` Michal Hocko
2017-01-25 10:33                           ` [RFC PATCH 1/2] mm, vmscan: account the number of isolated pages per zone Tetsuo Handa
2017-01-25 12:34                             ` Michal Hocko
2017-01-25 13:13                               ` [RFC PATCH 1/2] mm, vmscan: account the number of isolated pages per zone Tetsuo Handa
2017-01-25  9:53                       ` Michal Hocko
2017-01-20  6:42                 ` Hillf Danton
2017-01-20  9:25                   ` Mel Gorman
2017-01-18 13:44 ` [RFC PATCH 2/2] mm, vmscan: do not loop on too_many_isolated for ever Michal Hocko
2017-01-18 14:50   ` Mel Gorman
