* [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
@ 2017-11-25 10:52 Tetsuo Handa
  2017-11-25 10:52 ` [PATCH 2/3] mm,oom: Use ALLOC_OOM for OOM victim's last second allocation Tetsuo Handa
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Tetsuo Handa @ 2017-11-25 10:52 UTC (permalink / raw)
  To: akpm
  Cc: linux-mm, Tetsuo Handa, Andrea Arcangeli, Johannes Weiner, Michal Hocko

Since selecting an OOM victim can take quite some time and the OOM
situation might be resolved in the meantime, a last second allocation
attempt made after selecting an OOM victim can sometimes succeed.

Therefore, this patch moves the last second allocation attempt to after
selecting an OOM victim. This is expected to considerably reduce the
time window for potentially premature OOM killing.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Suggested-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/oom.h | 13 +++++++++++++
 mm/oom_kill.c       | 14 ++++++++++++++
 mm/page_alloc.c     | 44 ++++++++++++++++++++++++++------------------
 3 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d8..27cd36b 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -14,6 +14,8 @@
 struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
+struct alloc_context;
+struct page;
 
 /*
  * Details of the page allocation that triggered the oom killer that are used to
@@ -38,6 +40,15 @@ struct oom_control {
 	 */
 	const int order;
 
+	/* Context for really last second allocation attempt. */
+	const struct alloc_context *ac;
+	/*
+	 * Set by the OOM killer if ac != NULL and last second allocation
+	 * attempt succeeded. If ac != NULL, the caller must check for
+	 * page != NULL.
+	 */
+	struct page *page;
+
 	/* Used by oom implementation, do not set */
 	unsigned long totalpages;
 	struct task_struct *chosen;
@@ -102,6 +113,8 @@ extern unsigned long oom_badness(struct task_struct *p,
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern struct page *alloc_pages_before_oomkill(const struct oom_control *oc);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c957be3..348ec5a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1061,6 +1061,9 @@ bool out_of_memory(struct oom_control *oc)
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+		oc->page = alloc_pages_before_oomkill(oc);
+		if (oc->page)
+			return true;
 		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
@@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
 	}
 
 	select_bad_process(oc);
+	/*
+	 * Try really last second allocation attempt after we selected an OOM
+	 * victim, for somebody might have managed to free memory while we were
+	 * selecting an OOM victim which can take quite some time.
+	 */
+	oc->page = alloc_pages_before_oomkill(oc);
+	if (oc->page) {
+		if (oc->chosen && oc->chosen != (void *)-1UL)
+			put_task_struct(oc->chosen);
+		return true;
+	}
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
 		dump_header(oc, NULL);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48b5b01..7fa95ea 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3325,8 +3325,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		.memcg = NULL,
 		.gfp_mask = gfp_mask,
 		.order = order,
+		.ac = ac,
 	};
-	struct page *page;
+	struct page *page = NULL;
 
 	*did_some_progress = 0;
 
@@ -3340,19 +3341,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		return NULL;
 	}
 
-	/*
-	 * Go through the zonelist yet one more time, keep very high watermark
-	 * here, this is only to catch a parallel oom killing, we must fail if
-	 * we're still under heavy pressure. But make sure that this reclaim
-	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
-	 * allocation which will never fail due to oom_lock already held.
-	 */
-	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
-				      ~__GFP_DIRECT_RECLAIM, order,
-				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
-	if (page)
-		goto out;
-
 	/* Coredumps can quickly deplete all memory reserves */
 	if (current->flags & PF_DUMPCORE)
 		goto out;
@@ -3387,16 +3375,18 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		goto out;
 
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+	if (out_of_memory(&oc)) {
+		*did_some_progress = 1;
+		page = oc.page;
+	} else if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 		*did_some_progress = 1;
 
 		/*
 		 * Help non-failing allocations by giving them access to memory
 		 * reserves
 		 */
-		if (gfp_mask & __GFP_NOFAIL)
-			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
-					ALLOC_NO_WATERMARKS, ac);
+		page = __alloc_pages_cpuset_fallback(gfp_mask, order,
+						     ALLOC_NO_WATERMARKS, ac);
 	}
 out:
 	mutex_unlock(&oom_lock);
@@ -4156,6 +4146,24 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return page;
 }
 
+struct page *alloc_pages_before_oomkill(const struct oom_control *oc)
+{
+	/*
+	 * Go through the zonelist yet one more time, keep very high watermark
+	 * here, this is only to catch a parallel oom killing, we must fail if
+	 * we're still under heavy pressure. But make sure that this reclaim
+	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+	 * allocation which will never fail due to oom_lock already held.
+	 */
+	int alloc_flags = ALLOC_CPUSET | ALLOC_WMARK_HIGH;
+	gfp_t gfp_mask = oc->gfp_mask | __GFP_HARDWALL;
+
+	if (!oc->ac)
+		return NULL;
+	gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+	return get_page_from_freelist(gfp_mask, oc->order, alloc_flags, oc->ac);
+}
+
 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 		int preferred_nid, nodemask_t *nodemask,
 		struct alloc_context *ac, gfp_t *alloc_mask,
-- 
1.8.3.1


* [PATCH 2/3] mm,oom: Use ALLOC_OOM for OOM victim's last second allocation.
  2017-11-25 10:52 [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer Tetsuo Handa
@ 2017-11-25 10:52 ` Tetsuo Handa
  2017-11-25 10:52 ` [PATCH 3/3] mm,oom: Remove oom_lock serialization from the OOM reaper Tetsuo Handa
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 17+ messages in thread
From: Tetsuo Handa @ 2017-11-25 10:52 UTC (permalink / raw)
  To: akpm
  Cc: linux-mm, Tetsuo Handa, David Rientjes, Manish Jaggi,
	Michal Hocko, Oleg Nesterov, Vladimir Davydov

Manish Jaggi noticed that running the LTP oom01/oom02 tests on a system
with a high core count causes random kernel panics when the OOM killer
selects a victim which consumed memory in a way the OOM reaper cannot
help with [1]. Since commit 696453e66630ad45 ("mm, oom:
task_will_free_mem should skip oom_reaped tasks") changed
task_will_free_mem(current) in out_of_memory() to return false as soon
as MMF_OOM_SKIP is set, many threads sharing the victim's mm were not
able to try allocating from memory reserves after the OOM reaper gave up
reclaiming memory.

Therefore, this patch allows OOM victims to use the ALLOC_OOM watermark
for the last second allocation attempt.

[1] http://lkml.kernel.org/r/e6c83a26-1d59-4afd-55cf-04e58bdde188@caviumnetworks.com

Fixes: 696453e66630ad45 ("mm, oom: task_will_free_mem should skip oom_reaped tasks")
Reported-by: Manish Jaggi <mjaggi@caviumnetworks.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7fa95ea..dbaff7f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4154,13 +4154,19 @@ struct page *alloc_pages_before_oomkill(const struct oom_control *oc)
 	 * we're still under heavy pressure. But make sure that this reclaim
 	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
 	 * allocation which will never fail due to oom_lock already held.
+	 * Also, make sure that OOM victims can try ALLOC_OOM watermark
+	 * in case they haven't tried ALLOC_OOM watermark.
 	 */
 	int alloc_flags = ALLOC_CPUSET | ALLOC_WMARK_HIGH;
 	gfp_t gfp_mask = oc->gfp_mask | __GFP_HARDWALL;
+	int reserve_flags;
 
 	if (!oc->ac)
 		return NULL;
 	gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+	if (reserve_flags)
+		alloc_flags = reserve_flags;
 	return get_page_from_freelist(gfp_mask, oc->order, alloc_flags, oc->ac);
 }
 
-- 
1.8.3.1


* [PATCH 3/3] mm,oom: Remove oom_lock serialization from the OOM reaper.
  2017-11-25 10:52 [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer Tetsuo Handa
  2017-11-25 10:52 ` [PATCH 2/3] mm,oom: Use ALLOC_OOM for OOM victim's last second allocation Tetsuo Handa
@ 2017-11-25 10:52 ` Tetsuo Handa
  2017-11-28 13:04 ` [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer Michal Hocko
  2017-12-01 14:33 ` Johannes Weiner
  3 siblings, 0 replies; 17+ messages in thread
From: Tetsuo Handa @ 2017-11-25 10:52 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm, Tetsuo Handa, Michal Hocko

Since "mm,oom: Move last second allocation to inside the OOM killer."
changed to do last second allocation attempt after confirming that there
is no OOM victim's mm without MMF_OOM_SKIP set, we no longer need to
block the OOM reaper using oom_lock. This patch should allow start
reclaiming earlier than now.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 mm/oom_kill.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 348ec5a..3b0d0fe 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -491,22 +491,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	bool ret = true;
 
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * __oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
-	mutex_lock(&oom_lock);
-
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
 		trace_skip_task_reaping(tsk->pid);
@@ -580,7 +564,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
 	trace_finish_task_reaping(tsk->pid);
 unlock_oom:
-	mutex_unlock(&oom_lock);
 	return ret;
 }
 
-- 
1.8.3.1


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-11-25 10:52 [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer Tetsuo Handa
  2017-11-25 10:52 ` [PATCH 2/3] mm,oom: Use ALLOC_OOM for OOM victim's last second allocation Tetsuo Handa
  2017-11-25 10:52 ` [PATCH 3/3] mm,oom: Remove oom_lock serialization from the OOM reaper Tetsuo Handa
@ 2017-11-28 13:04 ` Michal Hocko
  2017-12-01 14:33 ` Johannes Weiner
  3 siblings, 0 replies; 17+ messages in thread
From: Michal Hocko @ 2017-11-28 13:04 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, Andrea Arcangeli, Johannes Weiner

On Sat 25-11-17 19:52:47, Tetsuo Handa wrote:
> Since selecting an OOM victim can take quite some time and the OOM
> situation might be resolved meanwhile, sometimes doing last second
> allocation attempt after selecting an OOM victim can succeed.
> 
> Therefore, this patch moves last second allocation attempt to after
> selecting an OOM victim. This patch is expected to reduce the time
> window for potentially pre-mature OOM killing considerably.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  include/linux/oom.h | 13 +++++++++++++
>  mm/oom_kill.c       | 14 ++++++++++++++
>  mm/page_alloc.c     | 44 ++++++++++++++++++++++++++------------------
>  3 files changed, 53 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 01c91d8..27cd36b 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -14,6 +14,8 @@
>  struct notifier_block;
>  struct mem_cgroup;
>  struct task_struct;
> +struct alloc_context;
> +struct page;
>  
>  /*
>   * Details of the page allocation that triggered the oom killer that are used to
> @@ -38,6 +40,15 @@ struct oom_control {
>  	 */
>  	const int order;
>  
> +	/* Context for really last second allocation attempt. */
> +	const struct alloc_context *ac;
> +	/*
> +	 * Set by the OOM killer if ac != NULL and last second allocation
> +	 * attempt succeeded. If ac != NULL, the caller must check for
> +	 * page != NULL.
> +	 */
> +	struct page *page;
> +
>  	/* Used by oom implementation, do not set */
>  	unsigned long totalpages;
>  	struct task_struct *chosen;
> @@ -102,6 +113,8 @@ extern unsigned long oom_badness(struct task_struct *p,
>  
>  extern struct task_struct *find_lock_task_mm(struct task_struct *p);
>  
> +extern struct page *alloc_pages_before_oomkill(const struct oom_control *oc);
> +
>  /* sysctls */
>  extern int sysctl_oom_dump_tasks;
>  extern int sysctl_oom_kill_allocating_task;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index c957be3..348ec5a 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -1061,6 +1061,9 @@ bool out_of_memory(struct oom_control *oc)
>  	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
>  	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
>  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
> +		oc->page = alloc_pages_before_oomkill(oc);
> +		if (oc->page)
> +			return true;
>  		get_task_struct(current);
>  		oc->chosen = current;
>  		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
> @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
>  	}
>  
>  	select_bad_process(oc);
> +	/*
> +	 * Try really last second allocation attempt after we selected an OOM
> +	 * victim, for somebody might have managed to free memory while we were
> +	 * selecting an OOM victim which can take quite some time.
> +	 */
> +	oc->page = alloc_pages_before_oomkill(oc);
> +	if (oc->page) {
> +		if (oc->chosen && oc->chosen != (void *)-1UL)
> +			put_task_struct(oc->chosen);
> +		return true;
> +	}
>  	/* Found nothing?!?! Either we hang forever, or we panic. */
>  	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
>  		dump_header(oc, NULL);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 48b5b01..7fa95ea 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3325,8 +3325,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
>  		.memcg = NULL,
>  		.gfp_mask = gfp_mask,
>  		.order = order,
> +		.ac = ac,
>  	};
> -	struct page *page;
> +	struct page *page = NULL;
>  
>  	*did_some_progress = 0;
>  
> @@ -3340,19 +3341,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
>  		return NULL;
>  	}
>  
> -	/*
> -	 * Go through the zonelist yet one more time, keep very high watermark
> -	 * here, this is only to catch a parallel oom killing, we must fail if
> -	 * we're still under heavy pressure. But make sure that this reclaim
> -	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
> -	 * allocation which will never fail due to oom_lock already held.
> -	 */
> -	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
> -				      ~__GFP_DIRECT_RECLAIM, order,
> -				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
> -	if (page)
> -		goto out;
> -
>  	/* Coredumps can quickly deplete all memory reserves */
>  	if (current->flags & PF_DUMPCORE)
>  		goto out;
> @@ -3387,16 +3375,18 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
>  		goto out;
>  
>  	/* Exhausted what can be done so it's blamo time */
> -	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
> +	if (out_of_memory(&oc)) {
> +		*did_some_progress = 1;
> +		page = oc.page;
> +	} else if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
>  		*did_some_progress = 1;
>  
>  		/*
>  		 * Help non-failing allocations by giving them access to memory
>  		 * reserves
>  		 */
> -		if (gfp_mask & __GFP_NOFAIL)
> -			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
> -					ALLOC_NO_WATERMARKS, ac);
> +		page = __alloc_pages_cpuset_fallback(gfp_mask, order,
> +						     ALLOC_NO_WATERMARKS, ac);
>  	}
>  out:
>  	mutex_unlock(&oom_lock);
> @@ -4156,6 +4146,24 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
>  	return page;
>  }
>  
> +struct page *alloc_pages_before_oomkill(const struct oom_control *oc)
> +{
> +	/*
> +	 * Go through the zonelist yet one more time, keep very high watermark
> +	 * here, this is only to catch a parallel oom killing, we must fail if
> +	 * we're still under heavy pressure. But make sure that this reclaim
> +	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
> +	 * allocation which will never fail due to oom_lock already held.
> +	 */
> +	int alloc_flags = ALLOC_CPUSET | ALLOC_WMARK_HIGH;
> +	gfp_t gfp_mask = oc->gfp_mask | __GFP_HARDWALL;
> +
> +	if (!oc->ac)
> +		return NULL;
> +	gfp_mask &= ~__GFP_DIRECT_RECLAIM;
> +	return get_page_from_freelist(gfp_mask, oc->order, alloc_flags, oc->ac);
> +}
> +
>  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  		int preferred_nid, nodemask_t *nodemask,
>  		struct alloc_context *ac, gfp_t *alloc_mask,
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-11-25 10:52 [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer Tetsuo Handa
                   ` (2 preceding siblings ...)
  2017-11-28 13:04 ` [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer Michal Hocko
@ 2017-12-01 14:33 ` Johannes Weiner
  2017-12-01 14:46   ` Michal Hocko
  3 siblings, 1 reply; 17+ messages in thread
From: Johannes Weiner @ 2017-12-01 14:33 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, Andrea Arcangeli, Michal Hocko

On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
>  	}
>  
>  	select_bad_process(oc);
> +	/*
> +	 * Try really last second allocation attempt after we selected an OOM
> +	 * victim, for somebody might have managed to free memory while we were
> +	 * selecting an OOM victim which can take quite some time.

Somebody might free some memory right after this attempt fails. OOM
can always be a temporary state that resolves on its own.

What keeps us from declaring OOM prematurely is the fact that we
already scanned the entire LRU list without success, not last second
or last-last second, or REALLY last-last-last-second allocations.

Nacked-by: Johannes Weiner <hannes@cmpxchg.org>


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 14:33 ` Johannes Weiner
@ 2017-12-01 14:46   ` Michal Hocko
  2017-12-01 14:56     ` Johannes Weiner
  0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2017-12-01 14:46 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> >  	}
> >  
> >  	select_bad_process(oc);
> > +	/*
> > +	 * Try really last second allocation attempt after we selected an OOM
> > +	 * victim, for somebody might have managed to free memory while we were
> > +	 * selecting an OOM victim which can take quite some time.
> 
> Somebody might free some memory right after this attempt fails. OOM
> can always be a temporary state that resolves on its own.
> 
> What keeps us from declaring OOM prematurely is the fact that we
> already scanned the entire LRU list without success, not last second
> or last-last second, or REALLY last-last-last-second allocations.

You are right that this is inherently racy. The point here is, however,
that the race window between the last check and the kill can be _huge_!
Another argument is that the allocator itself could have changed its
allocation capabilities - e.g. it could have become the OOM victim
itself since the last attempt, and a last second allocation could
reflect that fact.

So this is a pragmatic way to reduce weird corner cases while the
overall complexity doesn't grow too much.

> Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 14:46   ` Michal Hocko
@ 2017-12-01 14:56     ` Johannes Weiner
  2017-12-01 15:17       ` Michal Hocko
  0 siblings, 1 reply; 17+ messages in thread
From: Johannes Weiner @ 2017-12-01 14:56 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Fri, Dec 01, 2017 at 03:46:34PM +0100, Michal Hocko wrote:
> On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> > On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> > >  	}
> > >  
> > >  	select_bad_process(oc);
> > > +	/*
> > > +	 * Try really last second allocation attempt after we selected an OOM
> > > +	 * victim, for somebody might have managed to free memory while we were
> > > +	 * selecting an OOM victim which can take quite some time.
> > 
> > Somebody might free some memory right after this attempt fails. OOM
> > can always be a temporary state that resolves on its own.
> > 
> > What keeps us from declaring OOM prematurely is the fact that we
> > already scanned the entire LRU list without success, not last second
> > or last-last second, or REALLY last-last-last-second allocations.
> 
> You are right that this is inherently racy. The point here is, however,
> that the race window between the last check and the kill can be _huge_!

My point is that it's irrelevant. We already sampled the entire LRU
list; compared to that, the delay before the kill is immaterial.

> Another argument is that the allocator itself could have changed its
> allocation capabilities - e.g. become the OOM victim itself since the
> last time it the allocator could have reflected that fact.

Can you outline how this would happen exactly?

> > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 14:56     ` Johannes Weiner
@ 2017-12-01 15:17       ` Michal Hocko
  2017-12-01 15:57         ` Johannes Weiner
  0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2017-12-01 15:17 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Fri 01-12-17 14:56:38, Johannes Weiner wrote:
> On Fri, Dec 01, 2017 at 03:46:34PM +0100, Michal Hocko wrote:
> > On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> > > On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > > > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> > > >  	}
> > > >  
> > > >  	select_bad_process(oc);
> > > > +	/*
> > > > +	 * Try really last second allocation attempt after we selected an OOM
> > > > +	 * victim, for somebody might have managed to free memory while we were
> > > > +	 * selecting an OOM victim which can take quite some time.
> > > 
> > > Somebody might free some memory right after this attempt fails. OOM
> > > can always be a temporary state that resolves on its own.
> > > 
> > > What keeps us from declaring OOM prematurely is the fact that we
> > > already scanned the entire LRU list without success, not last second
> > > or last-last second, or REALLY last-last-last-second allocations.
> > 
> > You are right that this is inherently racy. The point here is, however,
> > that the race window between the last check and the kill can be _huge_!
> 
> My point is that it's irrelevant. We already sampled the entire LRU
> list; compared to that, the delay before the kill is immaterial.

Well, I would disagree. I have seen OOM reports with free memory.
Closer debugging showed that an existing process was on the way out and
the oom victim selection took way too long and fired after a large
process had managed to exit. There were different hacks^Wheuristics to
cover those cases but they turned out to just cause different corner
cases. Moving the existing last moment allocation after a potentially
very time consuming action is a relatively cheap and safe measure to
cover those cases without any negative side effects I can think of.

Anyway, if the delay is immaterial then the existing last-retry is
even more pointless because it is executed right _after_ we gave up
reclaim retries. Compare that to the select_bad_process time window. And
really, that can take quite a lot of time. Especially in weird priority
inversion situations.

> > Another argument is that the allocator itself could have changed its
> > allocation capabilities - e.g. become the OOM victim itself since the
> > last time it the allocator could have reflected that fact.
> 
> Can you outline how this would happen exactly?

http://lkml.kernel.org/r/20171101135855.bqg2kuj6ao2cicqi@dhcp22.suse.cz

As I tried to explain, the workload is really pathological, but this
(resp. the follow up based on this patch) is only a moderately ugly
workaround given that it actually can help.

> > > Nacked-by: Johannes Weiner <hannes@cmpxchg.org>

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 15:17       ` Michal Hocko
@ 2017-12-01 15:57         ` Johannes Weiner
  2017-12-01 16:38           ` Michal Hocko
  2017-12-01 16:52           ` Tetsuo Handa
  0 siblings, 2 replies; 17+ messages in thread
From: Johannes Weiner @ 2017-12-01 15:57 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Fri, Dec 01, 2017 at 04:17:15PM +0100, Michal Hocko wrote:
> On Fri 01-12-17 14:56:38, Johannes Weiner wrote:
> > On Fri, Dec 01, 2017 at 03:46:34PM +0100, Michal Hocko wrote:
> > > On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> > > > On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > > > > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> > > > >  	}
> > > > >  
> > > > >  	select_bad_process(oc);
> > > > > +	/*
> > > > > +	 * Try really last second allocation attempt after we selected an OOM
> > > > > +	 * victim, for somebody might have managed to free memory while we were
> > > > > +	 * selecting an OOM victim which can take quite some time.
> > > > 
> > > > Somebody might free some memory right after this attempt fails. OOM
> > > > can always be a temporary state that resolves on its own.
> > > > 
> > > > What keeps us from declaring OOM prematurely is the fact that we
> > > > already scanned the entire LRU list without success, not last second
> > > > or last-last second, or REALLY last-last-last-second allocations.
> > > 
> > > You are right that this is inherently racy. The point here is, however,
> > > that the race window between the last check and the kill can be _huge_!
> > 
> > My point is that it's irrelevant. We already sampled the entire LRU
> > list; compared to that, the delay before the kill is immaterial.
> 
> Well, I would disagree. I have seen OOM reports with a free memory.
> Closer debugging shown that an existing process was on the way out and
> the oom victim selection took way too long and fired after a large
> process manage. There were different hacks^Wheuristics to cover those
> cases but they turned out to just cause different corner cases. Moving
> the existing last moment allocation after a potentially very time
> consuming action is relatively cheap and safe measure to cover those
> cases without any negative side effects I can think of.

An existing process can exit right after you pull the trigger. How big
is *that* race window? By this logic you could add a sleep(5) before
the last-second allocation because it would increase the likelihood of
somebody else exiting voluntarily.

This patch is making the time it takes to select a victim an integral
part of OOM semantics. Think about it: if somebody later speeds up the
OOM selection process, they shrink the window in which somebody could
volunteer memory for the last-second allocation. By optimizing that
code, you're probabilistically increasing the rate of OOM kills.

A guaranteed 5 second window would in fact be better behavior.

This is bananas. I'm sticking with my nak.

> > > Another argument is that the allocator itself could have changed its
> > > allocation capabilities - e.g. become the OOM victim itself since the
> > > last time it the allocator could have reflected that fact.
> > 
> > Can you outline how this would happen exactly?
> 
> http://lkml.kernel.org/r/20171101135855.bqg2kuj6ao2cicqi@dhcp22.suse.cz
> 
> As I try to explain the workload is really pathological but this (resp.
> the follow up based on this patch) as a workaround is moderately ugly
> wrt. it actually can help.

That's not a real case which matters. It's really unfortunate how much
churn the OOM killer has been seeing based on artificial stress tests.


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 15:57         ` Johannes Weiner
@ 2017-12-01 16:38           ` Michal Hocko
  2017-12-05 10:46             ` Johannes Weiner
  2017-12-01 16:52           ` Tetsuo Handa
  1 sibling, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2017-12-01 16:38 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Fri 01-12-17 15:57:11, Johannes Weiner wrote:
> On Fri, Dec 01, 2017 at 04:17:15PM +0100, Michal Hocko wrote:
> > On Fri 01-12-17 14:56:38, Johannes Weiner wrote:
> > > On Fri, Dec 01, 2017 at 03:46:34PM +0100, Michal Hocko wrote:
> > > > On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> > > > > On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > > > > > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> > > > > >  	}
> > > > > >  
> > > > > >  	select_bad_process(oc);
> > > > > > +	/*
> > > > > > +	 * Try really last second allocation attempt after we selected an OOM
> > > > > > +	 * victim, for somebody might have managed to free memory while we were
> > > > > > +	 * selecting an OOM victim which can take quite some time.
> > > > > 
> > > > > Somebody might free some memory right after this attempt fails. OOM
> > > > > can always be a temporary state that resolves on its own.
> > > > > 
> > > > > What keeps us from declaring OOM prematurely is the fact that we
> > > > > already scanned the entire LRU list without success, not last second
> > > > > or last-last second, or REALLY last-last-last-second allocations.
> > > > 
> > > > You are right that this is inherently racy. The point here is, however,
> > > > that the race window between the last check and the kill can be _huge_!
> > > 
> > > My point is that it's irrelevant. We already sampled the entire LRU
> > > list; compared to that, the delay before the kill is immaterial.
> > 
> > Well, I would disagree. I have seen OOM reports with a free memory.
> > Closer debugging shown that an existing process was on the way out and
> > the oom victim selection took way too long and fired after a large
> > process manage. There were different hacks^Wheuristics to cover those
> > cases but they turned out to just cause different corner cases. Moving
> > the existing last moment allocation after a potentially very time
> > consuming action is relatively cheap and safe measure to cover those
> > cases without any negative side effects I can think of.
> 
> An existing process can exit right after you pull the trigger. How big
> is *that* race window? By this logic you could add a sleep(5) before
> the last-second allocation because it would increase the likelihood of
> somebody else exiting voluntarily.

Please read what I wrote above again. I am not saying this is _closing_
any race. It does, however, reduce the race window, which I find
generally a good thing. Especially when there are no other negative side
effects.
 
> This patch is making the time it takes to select a victim an integral
> part of OOM semantics. Think about it: if somebody later speeds up the
> OOM selection process, they shrink the window in which somebody could
> volunteer memory for the last-second allocation. By optimizing that
> code, you're probabilistically increasing the rate of OOM kills.
>
> A guaranteed 5 second window would in fact be better behavior.
> 
> This is bananas. I'm sticking with my nak.

So are you saying that the existing last allocation attempt is more
reasonable? I've tried to remove it [1] and you were against that.

All I am trying to say is that _if_ we want to have something like the
last moment allocation after reclaim gave up, then it should happen
closer to the kill, the actual disruptive operation. The current attempt
in __alloc_pages_may_oom makes very little sense to me.

[1] http://lkml.kernel.org/r/1454013603-3682-1-git-send-email-mhocko@kernel.org
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 15:57         ` Johannes Weiner
  2017-12-01 16:38           ` Michal Hocko
@ 2017-12-01 16:52           ` Tetsuo Handa
  1 sibling, 0 replies; 17+ messages in thread
From: Tetsuo Handa @ 2017-12-01 16:52 UTC (permalink / raw)
  To: hannes, mhocko; +Cc: akpm, linux-mm, aarcange

Johannes Weiner wrote:
> On Fri, Dec 01, 2017 at 04:17:15PM +0100, Michal Hocko wrote:
> > On Fri 01-12-17 14:56:38, Johannes Weiner wrote:
> > > On Fri, Dec 01, 2017 at 03:46:34PM +0100, Michal Hocko wrote:
> > > > On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> > > > > On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > > > > > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> > > > > >  	}
> > > > > >  
> > > > > >  	select_bad_process(oc);
> > > > > > +	/*
> > > > > > +	 * Try really last second allocation attempt after we selected an OOM
> > > > > > +	 * victim, for somebody might have managed to free memory while we were
> > > > > > +	 * selecting an OOM victim which can take quite some time.
> > > > > 
> > > > > Somebody might free some memory right after this attempt fails. OOM
> > > > > can always be a temporary state that resolves on its own.

"[PATCH 3/3] mm,oom: Remove oom_lock serialization from the OOM reaper." says
that doing last second allocation attempt after select_bad_process() should
help the OOM reaper to free memory compared to doing last second allocation
before select_bad_process().

> > > > > 
> > > > > What keeps us from declaring OOM prematurely is the fact that we
> > > > > already scanned the entire LRU list without success, not last second
> > > > > or last-last second, or REALLY last-last-last-second allocations.
> > > > 
> > > > You are right that this is inherently racy. The point here is, however,
> > > > that the race window between the last check and the kill can be _huge_!
> > > 
> > > My point is that it's irrelevant. We already sampled the entire LRU
> > > list; compared to that, the delay before the kill is immaterial.
> > 
> > Well, I would disagree. I have seen OOM reports with a free memory.
> > Closer debugging shown that an existing process was on the way out and
> > the oom victim selection took way too long and fired after a large
> > process manage. There were different hacks^Wheuristics to cover those
> > cases but they turned out to just cause different corner cases. Moving
> > the existing last moment allocation after a potentially very time
> > consuming action is relatively cheap and safe measure to cover those
> > cases without any negative side effects I can think of.
> 
> An existing process can exit right after you pull the trigger. How big
> is *that* race window? By this logic you could add a sleep(5) before
> the last-second allocation because it would increase the likelihood of
> somebody else exiting voluntarily.

Sleeping with oom_lock held is bad. Even schedule_timeout_killable(1)
at out_of_memory() can allow the owner of oom_lock to sleep effectively
forever when many threads are hitting mutex_trylock(&oom_lock) at
__alloc_pages_may_oom(), let alone adding sleep(5) before sending
SIGKILL and waking up the OOM reaper.
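
For reference, a rough sketch of the pattern being described, modeled on
the __alloc_pages_slowpath()/__alloc_pages_may_oom() structure (heavily
simplified, not the actual mainline code, and the function name below is
made up for illustration):

/* Every allocating thread loops here until its allocation succeeds. */
static struct page *slowpath_sketch(gfp_t gfp_mask, unsigned int order,
				    const struct alloc_context *ac)
{
	struct page *page;

	for (;;) {
		page = get_page_from_freelist(gfp_mask, order,
					      ALLOC_WMARK_MIN | ALLOC_CPUSET,
					      ac);
		if (page)
			return page;

		/* direct reclaim, compaction, throttling etc. omitted */

		if (!mutex_trylock(&oom_lock)) {
			/*
			 * Waiters never sleep on oom_lock; they go straight
			 * back around the loop, burning the CPU time that
			 * the sleeping lock owner needs to make progress.
			 */
			continue;
		}
		/*
		 * out_of_memory() can call schedule_timeout_killable(1)
		 * while oom_lock is still held - the sleep referred to
		 * above.
		 */
		out_of_memory(&(struct oom_control){ .gfp_mask = gfp_mask,
						     .order = order });
		mutex_unlock(&oom_lock);
	}
}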


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-01 16:38           ` Michal Hocko
@ 2017-12-05 10:46             ` Johannes Weiner
  2017-12-05 13:02               ` Michal Hocko
  0 siblings, 1 reply; 17+ messages in thread
From: Johannes Weiner @ 2017-12-05 10:46 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Fri, Dec 01, 2017 at 05:38:30PM +0100, Michal Hocko wrote:
> On Fri 01-12-17 15:57:11, Johannes Weiner wrote:
> > On Fri, Dec 01, 2017 at 04:17:15PM +0100, Michal Hocko wrote:
> > > On Fri 01-12-17 14:56:38, Johannes Weiner wrote:
> > > > On Fri, Dec 01, 2017 at 03:46:34PM +0100, Michal Hocko wrote:
> > > > > On Fri 01-12-17 14:33:17, Johannes Weiner wrote:
> > > > > > On Sat, Nov 25, 2017 at 07:52:47PM +0900, Tetsuo Handa wrote:
> > > > > > > @@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
> > > > > > >  	}
> > > > > > >  
> > > > > > >  	select_bad_process(oc);
> > > > > > > +	/*
> > > > > > > +	 * Try really last second allocation attempt after we selected an OOM
> > > > > > > +	 * victim, for somebody might have managed to free memory while we were
> > > > > > > +	 * selecting an OOM victim which can take quite some time.
> > > > > > 
> > > > > > Somebody might free some memory right after this attempt fails. OOM
> > > > > > can always be a temporary state that resolves on its own.
> > > > > > 
> > > > > > What keeps us from declaring OOM prematurely is the fact that we
> > > > > > already scanned the entire LRU list without success, not last second
> > > > > > or last-last second, or REALLY last-last-last-second allocations.
> > > > > 
> > > > > You are right that this is inherently racy. The point here is, however,
> > > > > that the race window between the last check and the kill can be _huge_!
> > > > 
> > > > My point is that it's irrelevant. We already sampled the entire LRU
> > > > list; compared to that, the delay before the kill is immaterial.
> > > 
> > > Well, I would disagree. I have seen OOM reports with a free memory.
> > > Closer debugging shown that an existing process was on the way out and
> > > the oom victim selection took way too long and fired after a large
> > > process manage. There were different hacks^Wheuristics to cover those
> > > cases but they turned out to just cause different corner cases. Moving
> > > the existing last moment allocation after a potentially very time
> > > consuming action is relatively cheap and safe measure to cover those
> > > cases without any negative side effects I can think of.
> > 
> > An existing process can exit right after you pull the trigger. How big
> > is *that* race window? By this logic you could add a sleep(5) before
> > the last-second allocation because it would increase the likelihood of
> > somebody else exiting voluntarily.
> 
> Please read what I wrote above again. I am not saying this is _closing_
> the any race. It however reduces the race window which I find generally
> a good thing. Especially when there are no other negative side effects.

Please read what I wrote. OOM conditions are not steady states, so you
are shaving cycles off a race window that is indefinite in size.

> > This patch is making the time it takes to select a victim an integral
> > part of OOM semantics. Think about it: if somebody later speeds up the
> > OOM selection process, they shrink the window in which somebody could
> > volunteer memory for the last-second allocation. By optimizing that
> > code, you're probabilistically increasing the rate of OOM kills.
> >
> > A guaranteed 5 second window would in fact be better behavior.
> > 
> > This is bananas. I'm sticking with my nak.
> 
> So are you saying that the existing last allocation attempt is more
> reasonable? I've tried to remove it [1] and you were against that.
> 
> All I'am trying to tell is that _if_ we want to have something like
> the last moment allocation after reclaim gave up then it should happen
> closer to the killing the actual disruptive operation. The current
> attempt in __alloc_pages_may_oom makes only very little sense to me.

Yes, you claim that, but you're not making a convincing case to me.

That last attempt serializes OOM conditions. It doesn't matter where
it is before the OOM kill as long as it's inside the OOM lock, because
these are the outcomes from the locked section:

	1. It's the first invocation, nothing is on the freelist, no
	task has TIF_MEMDIE set. Choose a victim and kill.

	2. It's the second invocation, the first invocation is still
	active. The trylock fails and we retry.

	3. It's the second invocation, a victim has been dispatched
	but nothing has been freed. TIF_MEMDIE is found, we retry.

	4. It's the second invocation, a victim has died (or been
	reaped) and freed memory. The allocation succeeds.

That's how the OOM state machine works in the presence of multiple
allocating threads, and the code as is makes perfect sense to me.
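
A minimal sketch of that locked section, modeled on the pre-patch
__alloc_pages_may_oom() (the code being removed in the diff above;
abbreviated and simplified, not the exact mainline code, and the
function name is made up), with the four outcomes marked:

static struct page *may_oom_sketch(gfp_t gfp_mask, unsigned int order,
				   const struct alloc_context *ac,
				   unsigned long *did_some_progress)
{
	struct oom_control oc = {
		.memcg = NULL,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page = NULL;

	*did_some_progress = 0;

	/* Outcome 2: another invocation is still active; caller retries. */
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		return NULL;
	}

	/*
	 * Outcome 4: a previously dispatched victim has died (or been
	 * reaped) and freed memory; the high-watermark retry succeeds
	 * and no new kill is issued.
	 */
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH | ALLOC_CPUSET, ac);
	if (page)
		goto out;

	/*
	 * Outcome 1: pick a victim and kill it; or outcome 3: a victim
	 * with TIF_MEMDIE is found inside out_of_memory(), which aborts
	 * and lets the caller retry.
	 */
	if (out_of_memory(&oc))
		*did_some_progress = 1;
out:
	mutex_unlock(&oom_lock);
	return page;
}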

Your argument for moving the allocation attempt closer to the kill is
because the OOM kill is destructive and we don't want it to happen
when something unrelated happens to free memory during the victim
selection. I do understand that.

My argument against doing that is that the OOM kill is destructive and
we want it tied to memory pressure as determined by reclaim, not
random events we don't have control over, so that users can test the
safety of the memory pressure created by their applications before
putting them into production environments.

We'd give up a certain amount of determinism and reproducibility, and
introduce unexpected implementation-defined semantics (currently the
sampling window for pressure is reclaim time, afterwards it would
include OOM victim selection time), in an attempt to probabilistically
reduce OOM kills under severe memory pressure by an unknown factor.

This might sound intriguing when you only focus on the split second
between the last reclaim attempt and when we issue the kill - "hey,
look, here is one individual instance of a kill I could have avoided
by exploiting a race condition."

But it's bad system behavior. For most users OOM kills are extremely
disruptive. Literally the only way to make them any worse is by making
them unpredictable and less reproducible.

I do understand the upsides you're advocating for - although you
haven't quantified them. They're just not worth the downsides.

Hence the nak.


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-05 10:46             ` Johannes Weiner
@ 2017-12-05 13:02               ` Michal Hocko
  2017-12-05 13:17                 ` Tetsuo Handa
  0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2017-12-05 13:02 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Tetsuo Handa, akpm, linux-mm, Andrea Arcangeli

On Tue 05-12-17 10:46:01, Johannes Weiner wrote:
> On Fri, Dec 01, 2017 at 05:38:30PM +0100, Michal Hocko wrote:
[...]
> > So are you saying that the existing last allocation attempt is more
> > reasonable? I've tried to remove it [1] and you were against that.
> > 
> > All I'am trying to tell is that _if_ we want to have something like
> > the last moment allocation after reclaim gave up then it should happen
> > closer to the killing the actual disruptive operation. The current
> > attempt in __alloc_pages_may_oom makes only very little sense to me.
> 
> Yes, you claim that, but you're not making a convincing case to me.
> 
> That last attempt serializes OOM conditions. It doesn't matter where
> it is before the OOM kill as long as it's inside the OOM lock, because
> these are the outcomes from the locked section:
> 
> 	1. It's the first invocation, nothing is on the freelist, no
> 	task has TIF_MEMDIE set. Choose a victim and kill.
> 
> 	2. It's the second invocation, the first invocation is still
> 	active. The trylock fails and we retry.
> 
> 	3. It's the second invocation, a victim has been dispatched
> 	but nothing has been freed. TIF_MEMDIE is found, we retry.
> 
> 	4. It's the second invocation, a victim has died (or been
> 	reaped) and freed memory. The allocation succeeds.
> 
> That's how the OOM state machine works in the presence of multiple
> allocating threads, and the code as is makes perfect sense to me.
> 
> Your argument for moving the allocation attempt closer to the kill is
> because the OOM kill is destructive and we don't want it to happen
> when something unrelated happens to free memory during the victim
> selection. I do understand that.
> 
> My argument against doing that is that the OOM kill is destructive and
> we want it tied to memory pressure as determined by reclaim, not
> random events we don't have control over, so that users can test the
> safety of the memory pressure created by their applications before
> putting them into production environments.
> 
> We'd give up a certain amount of determinism and reproducibility, and
> introduce unexpected implementation-defined semantics (currently the
> sampling window for pressure is reclaim time, afterwards it would
> include OOM victim selection time), in an attempt to probabilistically
> reduce OOM kills under severe memory pressure by an unknown factor.
> 
> This might sound intriguing when you only focus on the split second
> between the last reclaim attempt and when we issue the kill - "hey,
> look, here is one individual instance of a kill I could have avoided
> by exploiting a race condition."
> 
> But it's bad system behavior. For most users OOM kills are extremely
> disruptive. Literally the only way to make them any worse is by making
> them unpredictable and less reproducible.

Thanks for the extended clarification. I understand your concern much
better now. I do not fully agree, though.

The OOM killer has always had that "try to prevent killing a victim"
approach in it, and in some cases it is a good thing. Basically, anytime
there are reasonable chances of forward progress, a saved kill might
save a workload and user data. That is something we really care about
much more than determinism, which is quite limited anyway by the fact
that memory reclaim itself cannot be deterministic because there are way
too many parties interacting on a highly complex system.

On the other hand, we used to have some back-off heuristics which were
promising forward progress yet they had some rough edges and were too
livelock happy. So this is definitely a tricky area.

> I do understand the upsides you're advocating for - although you
> haven't quantified them. They're just not worth the downsides.

OK, fair enough. Let's drop the patch then. There is no _strong_
justification for it and what I've seen as "nice to have" is indeed
really hard to quantify and not really worth merging without a full
consensus.

Thanks!
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-05 13:02               ` Michal Hocko
@ 2017-12-05 13:17                 ` Tetsuo Handa
  2017-12-05 13:42                   ` Michal Hocko
  0 siblings, 1 reply; 17+ messages in thread
From: Tetsuo Handa @ 2017-12-05 13:17 UTC (permalink / raw)
  To: mhocko, hannes; +Cc: akpm, linux-mm, aarcange

Michal Hocko wrote:
> > I do understand the upsides you're advocating for - although you
> > haven't quantified them. They're just not worth the downsides.
> 
> OK, fair enough. Let's drop the patch then. There is no _strong_
> justification for it and what I've seen as "nice to have" is indeed
> really hard to quantify and not really worth merging without a full
> consensus.

Dropping "mm,oom: move last second allocation to inside the OOM killer"
means dropping "mm,oom: remove oom_lock serialization from the OOM reaper"
together, right? The latter patch helped mitigating
schedule_timeout_killable(1) lockup problem though...

Also, what is the alternative for "mm,oom: use ALLOC_OOM for OOM victim's
last second allocation" ? I proposed "mm, oom: task_will_free_mem(current)
should ignore MMF_OOM_SKIP for once." and rejected by you. I also proposed
"mm,oom: Set ->signal->oom_mm to all thread groups sharing the victim's mm."
and rejected by you.


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-05 13:17                 ` Tetsuo Handa
@ 2017-12-05 13:42                   ` Michal Hocko
  2017-12-05 14:07                     ` Tetsuo Handa
  0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2017-12-05 13:42 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: hannes, akpm, linux-mm, aarcange

On Tue 05-12-17 22:17:27, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > > I do understand the upsides you're advocating for - although you
> > > haven't quantified them. They're just not worth the downsides.
> > 
> > OK, fair enough. Let's drop the patch then. There is no _strong_
> > justification for it and what I've seen as "nice to have" is indeed
> > really hard to quantify and not really worth merging without a full
> > consensus.
> 
> Dropping "mm,oom: move last second allocation to inside the OOM killer"
> means dropping "mm,oom: remove oom_lock serialization from the OOM reaper"
> together, right?

No, I believe that we can drop the lock even without this patch. This
will need more investigation though.

> The latter patch helped mitigating
> schedule_timeout_killable(1) lockup problem though...
> 
> Also, what is the alternative for "mm,oom: use ALLOC_OOM for OOM victim's
> last second allocation" ? I proposed "mm, oom: task_will_free_mem(current)
> should ignore MMF_OOM_SKIP for once." and rejected by you. I also proposed
> "mm,oom: Set ->signal->oom_mm to all thread groups sharing the victim's mm."
> and rejected by you.

Yes, and so far I am not really sure we have to care all that much. I
haven't seen any real world workload actually hitting this condition.

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-05 13:42                   ` Michal Hocko
@ 2017-12-05 14:07                     ` Tetsuo Handa
  2017-12-05 14:30                       ` Michal Hocko
  0 siblings, 1 reply; 17+ messages in thread
From: Tetsuo Handa @ 2017-12-05 14:07 UTC (permalink / raw)
  To: mhocko; +Cc: hannes, akpm, linux-mm, aarcange, mjaggi

Michal Hocko wrote:
> On Tue 05-12-17 22:17:27, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > > I do understand the upsides you're advocating for - although you
> > > > haven't quantified them. They're just not worth the downsides.
> > > 
> > > OK, fair enough. Let's drop the patch then. There is no _strong_
> > > justification for it and what I've seen as "nice to have" is indeed
> > > really hard to quantify and not really worth merging without a full
> > > consensus.
> > 
> > Dropping "mm,oom: move last second allocation to inside the OOM killer"
> > means dropping "mm,oom: remove oom_lock serialization from the OOM reaper"
> > together, right?
> 
> No, I believe that we can drop the lock even without this patch. This
> will need more investigation though.

We cannot drop the lock without this patch.

> 
> > The latter patch helped mitigating
> > schedule_timeout_killable(1) lockup problem though...
> > 
> > Also, what is the alternative for "mm,oom: use ALLOC_OOM for OOM victim's
> > last second allocation" ? I proposed "mm, oom: task_will_free_mem(current)
> > should ignore MMF_OOM_SKIP for once." and rejected by you. I also proposed
> > "mm,oom: Set ->signal->oom_mm to all thread groups sharing the victim's mm."
> > and rejected by you.
> 
> Yes, and so far I am not really sure we have to care all that much. I
> haven't seen any real world workload actually hitting this condition.
> 

Somebody will observe what Manish Jaggi observed. Whether the memory was
mlock()ed and/or MAP_SHARED is irrelevant. There is always a possibility
that the OOM reaper fails to reclaim memory due to mmap_sem contention
(resulting in extra OOM kills).


* Re: [PATCH 1/3] mm,oom: Move last second allocation to inside the OOM killer.
  2017-12-05 14:07                     ` Tetsuo Handa
@ 2017-12-05 14:30                       ` Michal Hocko
  0 siblings, 0 replies; 17+ messages in thread
From: Michal Hocko @ 2017-12-05 14:30 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: hannes, akpm, linux-mm, aarcange, mjaggi

On Tue 05-12-17 23:07:53, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Tue 05-12-17 22:17:27, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > > I do understand the upsides you're advocating for - although you
> > > > > haven't quantified them. They're just not worth the downsides.
> > > > 
> > > > OK, fair enough. Let's drop the patch then. There is no _strong_
> > > > justification for it and what I've seen as "nice to have" is indeed
> > > > really hard to quantify and not really worth merging without a full
> > > > consensus.
> > > 
> > > Dropping "mm,oom: move last second allocation to inside the OOM killer"
> > > means dropping "mm,oom: remove oom_lock serialization from the OOM reaper"
> > > together, right?
> > 
> > No, I believe that we can drop the lock even without this patch. This
> > will need more investigation though.
> 
> We cannot drop the lock without this patch.

This should be discussed in the respective thread.
 
> > > The latter patch helped mitigating
> > > schedule_timeout_killable(1) lockup problem though...
> > > 
> > > Also, what is the alternative for "mm,oom: use ALLOC_OOM for OOM victim's
> > > last second allocation" ? I proposed "mm, oom: task_will_free_mem(current)
> > > should ignore MMF_OOM_SKIP for once." and rejected by you. I also proposed
> > > "mm,oom: Set ->signal->oom_mm to all thread groups sharing the victim's mm."
> > > and rejected by you.
> > 
> > Yes, and so far I am not really sure we have to care all that much. I
> > haven't seen any real world workload actually hitting this condition.
> > 
> 
> Somebody will observe what Manish Jaggi observed. OOM with mlock()ed and/or
> MAP_SHARED is irrelevant. There is always possibility that the OOM reaper
> fails to reclaim memory due to mmap_sem contention (and results in extra
> OOM kills).

... and we will try to handle this with due diligence as soon as we see
those reports and see how serious they are.

-- 
Michal Hocko
SUSE Labs

