* [patch 1/2] mm: memcg: handle non-error OOM situations more gracefully
@ 2013-10-08 20:58 ` Johannes Weiner
  0 siblings, 0 replies; 39+ messages in thread
From: Johannes Weiner @ 2013-10-08 20:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Michal Hocko, azurIt, linux-mm, cgroups, linux-kernel

3812c8c ("mm: memcg: do not trap chargers with full callstack on OOM")
assumed that only a few places that can trigger a memcg OOM situation
do not return VM_FAULT_OOM, like optional page cache readahead.  But
there are many more and it's impractical to annotate them all.

First of all, we don't want to invoke the OOM killer when the failed
allocation is gracefully handled, so defer the actual kill to the end
of the fault handling as well.  This simplifies the code quite a bit
as an added bonus.

Second, since a failed allocation might not be the abrupt end of the
fault, the memcg OOM handler needs to remain re-entrant for
subsequent allocation attempts until the fault finishes.  If an allocation is
attempted after the task already OOMed, allow it to bypass the limit
so that it can quickly finish the fault and invoke the OOM killer.
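
For reference, a condensed sketch of how the pieces fit together after
this patch (simplified from the hunks below, not literal kernel code):

	/* handle_mm_fault(), userspace faults only */
	mem_cgroup_oom_enable();
	ret = __handle_mm_fault(mm, vma, address, flags);
	mem_cgroup_oom_disable();
	/*
	 * A failed charge merely recorded the OOM context in
	 * current->memcg_oom.  If the fault still succeeded, drop
	 * that state without killing anything.
	 */
	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
		mem_cgroup_oom_synchronize(false);

	/* pagefault_out_of_memory(), reached on VM_FAULT_OOM */
	if (mem_cgroup_oom_synchronize(true))
		return;	/* memcg OOM completed: killed or waited */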

Reported-by: azurIt <azurit@pobox.sk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: stable@kernel.org
---
 include/linux/memcontrol.h |  50 ++++------------
 include/linux/sched.h      |   7 +--
 mm/filemap.c               |  11 +---
 mm/memcontrol.c            | 139 +++++++++++++++++----------------------------
 mm/memory.c                |  18 ++++--
 mm/oom_kill.c              |   2 +-
 6 files changed, 79 insertions(+), 148 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ecc82b3..b3e7a66 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);
 
-/**
- * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
- * @new: true to enable, false to disable
- *
- * Toggle whether a failed memcg charge should invoke the OOM killer
- * or just return -ENOMEM.  Returns the previous toggle state.
- *
- * NOTE: Any path that enables the OOM killer before charging must
- *       call mem_cgroup_oom_synchronize() afterward to finalize the
- *       OOM handling and clean up.
- */
-static inline bool mem_cgroup_toggle_oom(bool new)
+static inline void mem_cgroup_oom_enable(void)
 {
-	bool old;
-
-	old = current->memcg_oom.may_oom;
-	current->memcg_oom.may_oom = new;
-
-	return old;
+	WARN_ON(current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 1;
 }
 
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
-	bool old = mem_cgroup_toggle_oom(true);
-
-	WARN_ON(old == true);
-}
-
-static inline void mem_cgroup_disable_oom(void)
-{
-	bool old = mem_cgroup_toggle_oom(false);
-
-	WARN_ON(old == false);
+	WARN_ON(!current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-	return p->memcg_oom.in_memcg_oom;
+	return p->memcg_oom.memcg;
 }
 
-bool mem_cgroup_oom_synchronize(void);
+bool mem_cgroup_oom_synchronize(bool wait);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }
 
-static inline bool mem_cgroup_toggle_oom(bool new)
-{
-	return false;
-}
-
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_enable(void)
 {
 }
 
-static inline void mem_cgroup_disable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
 }
 
@@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return false;
 }
 
-static inline bool mem_cgroup_oom_synchronize(void)
+static inline bool mem_cgroup_oom_synchronize(bool wait)
 {
 	return false;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da3..e27baee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1394,11 +1394,10 @@ struct task_struct {
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 	struct memcg_oom_info {
+		struct mem_cgroup *memcg;
+		gfp_t gfp_mask;
+		int order;
 		unsigned int may_oom:1;
-		unsigned int in_memcg_oom:1;
-		unsigned int oom_locked:1;
-		int wakeups;
-		struct mem_cgroup *wait_on_memcg;
 	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4..ae4846f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
-	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already?  Either
-	 * way, try readahead, but disable the memcg OOM killer for it
-	 * as readahead is optional and no errors are propagated up
-	 * the fault stack.  The OOM killer is enabled while trying to
-	 * instantiate the faulting page individually below.
+	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c52ddb..b39dfac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2159,110 +2159,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely.  This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait.  Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation.  Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges.  Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2271,13 +2220,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2286,10 +2247,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2703,6 +2663,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		     || fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
diff --git a/mm/memory.c b/mm/memory.c
index ca00039..867e064 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3863,15 +3863,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * space.  Kernel faults are handled more gracefully.
 	 */
 	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_enable_oom();
+		mem_cgroup_oom_enable();
 
 	ret = __handle_mm_fault(mm, vma, address, flags);
 
-	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_disable_oom();
-
-	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
-		mem_cgroup_oom_synchronize();
+	if (flags & FAULT_FLAG_USER) {
+		mem_cgroup_oom_disable();
+                /*
+                 * The task may have entered a memcg OOM situation but
+                 * if the allocation error was handled gracefully (no
+                 * VM_FAULT_OOM), there is no need to kill anything.
+                 * Just clean up the OOM state peacefully.
+                 */
+                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+                        mem_cgroup_oom_synchronize(false);
+	}
 
 	return ret;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d2..6738c47 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
-	if (mem_cgroup_oom_synchronize())
+	if (mem_cgroup_oom_synchronize(true))
 		return;
 
 	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
-- 
1.8.4


* [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-10-08 20:58 ` Johannes Weiner
@ 2013-10-08 20:58   ` Johannes Weiner
  0 siblings, 0 replies; 39+ messages in thread
From: Johannes Weiner @ 2013-10-08 20:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Michal Hocko, azurIt, linux-mm, cgroups, linux-kernel

Buffer allocation has a very crude indefinite loop around waking the
flusher threads and performing global NOFS direct reclaim because it
can not handle allocation failures.

The most immediate problem with this is that the allocation may fail
due to a memory cgroup limit, where flushers + direct reclaim might
not make any progress towards resolving the situation at all.  Because
unlike the global case, a memory cgroup may not have any cache at all,
only anonymous pages but no swap.  This situation will lead to a
reclaim livelock with insane IO from waking the flushers and thrashing
unrelated filesystem cache in a tight loop.

Use __GFP_NOFAIL allocations for buffers for now.  This makes sure
that any looping happens in the page allocator, which knows how to
orchestrate kswapd, direct reclaim, and the flushers sensibly.  It
also allows memory cgroups to detect allocations that can't handle
failure and will allow them to ultimately bypass the limit if reclaim
can not make progress.
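
The loop in question lives in __getblk_slow(); schematically
(simplified, not the verbatim kernel code) it retries forever on
failure:

	for (;;) {
		bh = __find_get_block(bdev, block, size);
		if (bh)
			return bh;
		if (!grow_buffers(bdev, block, size))
			free_more_memory();	/* wake flushers + global NOFS reclaim */
	}

With __GFP_NOFAIL on the page cache allocation in grow_dev_page(), the
retrying happens inside the page allocator instead, and the allocation
no longer fails back into this improvised reclaim loop.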

Reported-by: azurIt <azurit@pobox.sk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: stable@kernel.org
---
 fs/buffer.c     | 14 ++++++++++++--
 mm/memcontrol.c |  2 ++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 4d74335..6024877 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct buffer_head *bh;
 	sector_t end_block;
 	int ret = 0;		/* Will call free_more_memory() */
+	gfp_t gfp_mask;
 
-	page = find_or_create_page(inode->i_mapping, index,
-		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
+	gfp_mask |= __GFP_MOVABLE;
+	/*
+	 * XXX: __getblk_slow() can not really deal with failure and
+	 * will endlessly loop on improvised global reclaim.  Prefer
+	 * looping in the allocator rather than here, at least that
+	 * code knows what it's doing.
+	 */
+	gfp_mask |= __GFP_NOFAIL;
+
+	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
 	if (!page)
 		return ret;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b39dfac..e233aa1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2764,6 +2764,8 @@ done:
 	return 0;
 nomem:
 	*ptr = NULL;
+	if (gfp_mask & __GFP_NOFAIL)
+		return 0;
 	return -ENOMEM;
 bypass:
 	*ptr = root_mem_cgroup;
-- 
1.8.4


* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-10-08 20:58   ` Johannes Weiner
@ 2013-10-11 20:51     ` Andrew Morton
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2013-10-11 20:51 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Michal Hocko, azurIt, linux-mm, cgroups, linux-kernel

On Tue,  8 Oct 2013 16:58:10 -0400 Johannes Weiner <hannes@cmpxchg.org> wrote:

> Buffer allocation has a very crude indefinite loop around waking the
> flusher threads and performing global NOFS direct reclaim because it
> can not handle allocation failures.
> 
> The most immediate problem with this is that the allocation may fail
> due to a memory cgroup limit, where flushers + direct reclaim might
> not make any progress towards resolving the situation at all.  Because
> unlike the global case, a memory cgroup may not have any cache at all,
> only anonymous pages but no swap.  This situation will lead to a
> reclaim livelock with insane IO from waking the flushers and thrashing
> unrelated filesystem cache in a tight loop.
> 
> Use __GFP_NOFAIL allocations for buffers for now.  This makes sure
> that any looping happens in the page allocator, which knows how to
> orchestrate kswapd, direct reclaim, and the flushers sensibly.  It
> also allows memory cgroups to detect allocations that can't handle
> failure and will allow them to ultimately bypass the limit if reclaim
> can not make progress.
> 
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
>  	struct buffer_head *bh;
>  	sector_t end_block;
>  	int ret = 0;		/* Will call free_more_memory() */
> +	gfp_t gfp_mask;
>  
> -	page = find_or_create_page(inode->i_mapping, index,
> -		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
> +	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> +	gfp_mask |= __GFP_MOVABLE;
> +	/*
> +	 * XXX: __getblk_slow() can not really deal with failure and
> +	 * will endlessly loop on improvised global reclaim.  Prefer
> +	 * looping in the allocator rather than here, at least that
> +	 * code knows what it's doing.
> +	 */
> +	gfp_mask |= __GFP_NOFAIL;

Yup.  When I added GFP_NOFAIL all those years ago there were numerous
open-coded try-forever loops, and GFP_NOFAIL was more a cleanup than
anything else - move the loop into the page allocator, leaving behind a
sentinel which says "this code sucks and should be fixed".  Of course,
nothing has since been fixed :(

So apart from fixing a bug, this patch continues this conversion.  I
can't think why I didn't do it a decade ago!
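
(For illustration, the conversion pattern described above is roughly:

	/* open-coded try-forever loop */
	do {
		page = alloc_page(GFP_KERNEL);
	} while (!page);

becomes

	page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);

with the retry loop now inside the page allocator, and the flag left
behind as the "should be fixed" sentinel.)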

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-10-08 20:58   ` Johannes Weiner
@ 2013-12-04  0:59     ` Andrew Morton
  -1 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2013-12-04  0:59 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Michal Hocko, azurIt, linux-mm, cgroups, linux-kernel,
	Christian Casteyde, Pekka Enberg, Christoph Lameter

On Tue,  8 Oct 2013 16:58:10 -0400 Johannes Weiner <hannes@cmpxchg.org> wrote:

> Buffer allocation has a very crude indefinite loop around waking the
> flusher threads and performing global NOFS direct reclaim because it
> can not handle allocation failures.
> 
> The most immediate problem with this is that the allocation may fail
> due to a memory cgroup limit, where flushers + direct reclaim might
> not make any progress towards resolving the situation at all.  Because
> unlike the global case, a memory cgroup may not have any cache at all,
> only anonymous pages but no swap.  This situation will lead to a
> reclaim livelock with insane IO from waking the flushers and thrashing
> unrelated filesystem cache in a tight loop.
> 
> Use __GFP_NOFAIL allocations for buffers for now.  This makes sure
> that any looping happens in the page allocator, which knows how to
> orchestrate kswapd, direct reclaim, and the flushers sensibly.  It
> also allows memory cgroups to detect allocations that can't handle
> failure and will allow them to ultimately bypass the limit if reclaim
> can not make progress.

Problem.

> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
>  	struct buffer_head *bh;
>  	sector_t end_block;
>  	int ret = 0;		/* Will call free_more_memory() */
> +	gfp_t gfp_mask;
>  
> -	page = find_or_create_page(inode->i_mapping, index,
> -		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
> +	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> +	gfp_mask |= __GFP_MOVABLE;

https://bugzilla.kernel.org/show_bug.cgi?id=65991

WARNING: CPU: 0 PID: 1 at mm/page_alloc.c:1539 get_page_from_freelist+0x8a9/0x8c0()
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc1 #42
Hardware name: Acer Aspire 7750G/JE70_HR, BIOS V1.07 03/02/2011
 0000000000000009 ffff8801c6121650 ffffffff81898d39 0000000000000000
 ffff8801c6121688 ffffffff8107dc43 0000000000000002 0000000000000001
 0000000000284850 0000000000000000 ffff8801cec04680 ffff8801c6121698
Call Trace:
 [<ffffffff81898d39>] dump_stack+0x4e/0x7a
 [<ffffffff8107dc43>] warn_slowpath_common+0x73/0x90
 [<ffffffff8107dd15>] warn_slowpath_null+0x15/0x20
 [<ffffffff81116f69>] get_page_from_freelist+0x8a9/0x8c0
 [<ffffffff81330cdd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
 [<ffffffff81117070>] __alloc_pages_nodemask+0xf0/0x770
 [<ffffffff81330cdd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
 [<ffffffff81156823>] kmemcheck_alloc_shadow+0x53/0xf0
 [<ffffffff81152495>] new_slab+0x345/0x3e0
 [<ffffffff81897712>] __slab_alloc.isra.57+0x215/0x535
 [<ffffffff81328030>] ? __radix_tree_preload+0x60/0xf0
 [<ffffffff811545c8>] kmem_cache_alloc+0x118/0x150
 [<ffffffff81328030>] ? __radix_tree_preload+0x60/0xf0
 [<ffffffff81328030>] __radix_tree_preload+0x60/0xf0
 [<ffffffff81328125>] radix_tree_maybe_preload+0x25/0x30
 [<ffffffff8110faf7>] add_to_page_cache_locked+0x37/0x100
 [<ffffffff8110fbd5>] add_to_page_cache_lru+0x15/0x40
 [<ffffffff8110ff37>] find_or_create_page+0x57/0x90
 [<ffffffff8118e630>] __getblk+0xf0/0x2f0

That __GFP_NOFAIL is getting down into
radix_tree_preload->kmem_cache_alloc() and I expect that in its
boundless stupidity, slab has decided to inappropriately go and use an
unnecessarily massive page size for radix_tree_node_cachep's underlying
memory allocations.  So we end up using GFP_NOFAIL for an order=2 (or
more) allocation, which is unacceptably risky, methinks.

I really really wish slab wouldn't do this.  The benefit is surely very
small and these unnecessary higher-order allocations are quite abusive
of the page allocator.

Can we please make slab stop doing this?

radix_tree_nodes are 560 bytes and the kernel often allocates them in
times of extreme memory stress.  We really really want them to be
backed by order=0 pages.
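
To put rough numbers on that packing argument, a standalone userspace
sketch (4096-byte pages assumed, per-object metadata ignored; the
560-byte size is the radix_tree_node size quoted above):

#include <stdio.h>

int main(void)
{
	const unsigned int object_size = 560;	/* radix_tree_node */
	const unsigned int page_size = 4096;

	for (unsigned int order = 0; order <= 3; order++) {
		unsigned int slab = page_size << order;
		unsigned int objects = slab / object_size;
		unsigned int waste = slab - objects * object_size;

		printf("order %u: %2u objects, %3u bytes wasted (%.1f%%)\n",
		       order, objects, waste, 100.0 * waste / slab);
	}
	return 0;
}

That prints roughly 4.3% waste at orders 0 and 1 versus 0.9% at orders
2 and 3, so the density win from the larger slabs is a few percent at
most for this object size.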


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04  0:59     ` Andrew Morton
  (?)
@ 2013-12-04  1:52       ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-04  1:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Michal Hocko, azurIt, linux-mm, cgroups,
	linux-kernel, Christian Casteyde, Pekka Enberg,
	Christoph Lameter

On Tue, Dec 03, 2013 at 04:59:10PM -0800, Andrew Morton wrote:
> On Tue,  8 Oct 2013 16:58:10 -0400 Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> > Buffer allocation has a very crude indefinite loop around waking the
> > flusher threads and performing global NOFS direct reclaim because it
> > can not handle allocation failures.
> > 
> > The most immediate problem with this is that the allocation may fail
> > due to a memory cgroup limit, where flushers + direct reclaim might
> > not make any progress towards resolving the situation at all.  Because
> > unlike the global case, a memory cgroup may not have any cache at all,
> > only anonymous pages but no swap.  This situation will lead to a
> > reclaim livelock with insane IO from waking the flushers and thrashing
> > unrelated filesystem cache in a tight loop.
> > 
> > Use __GFP_NOFAIL allocations for buffers for now.  This makes sure
> > that any looping happens in the page allocator, which knows how to
> > orchestrate kswapd, direct reclaim, and the flushers sensibly.  It
> > also allows memory cgroups to detect allocations that can't handle
> > failure and will allow them to ultimately bypass the limit if reclaim
> > can not make progress.
> 
> Problem.
> 
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
> >  	struct buffer_head *bh;
> >  	sector_t end_block;
> >  	int ret = 0;		/* Will call free_more_memory() */
> > +	gfp_t gfp_mask;
> >  
> > -	page = find_or_create_page(inode->i_mapping, index,
> > -		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
> > +	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> > +	gfp_mask |= __GFP_MOVABLE;
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=65991
> 
> WARNING: CPU: 0 PID: 1 at mm/page_alloc.c:1539 get_page_from_freelist+0x8a9/0x8c0()
> Modules linked in:
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc1 #42
> Hardware name: Acer Aspire 7750G/JE70_HR, BIOS V1.07 03/02/2011
>  0000000000000009 ffff8801c6121650 ffffffff81898d39 0000000000000000
>  ffff8801c6121688 ffffffff8107dc43 0000000000000002 0000000000000001
>  0000000000284850 0000000000000000 ffff8801cec04680 ffff8801c6121698
> Call Trace:
>  [<ffffffff81898d39>] dump_stack+0x4e/0x7a
>  [<ffffffff8107dc43>] warn_slowpath_common+0x73/0x90
>  [<ffffffff8107dd15>] warn_slowpath_null+0x15/0x20
>  [<ffffffff81116f69>] get_page_from_freelist+0x8a9/0x8c0
>  [<ffffffff81330cdd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
>  [<ffffffff81117070>] __alloc_pages_nodemask+0xf0/0x770
>  [<ffffffff81330cdd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
>  [<ffffffff81156823>] kmemcheck_alloc_shadow+0x53/0xf0
>  [<ffffffff81152495>] new_slab+0x345/0x3e0
>  [<ffffffff81897712>] __slab_alloc.isra.57+0x215/0x535
>  [<ffffffff81328030>] ? __radix_tree_preload+0x60/0xf0
>  [<ffffffff811545c8>] kmem_cache_alloc+0x118/0x150
>  [<ffffffff81328030>] ? __radix_tree_preload+0x60/0xf0
>  [<ffffffff81328030>] __radix_tree_preload+0x60/0xf0
>  [<ffffffff81328125>] radix_tree_maybe_preload+0x25/0x30
>  [<ffffffff8110faf7>] add_to_page_cache_locked+0x37/0x100
>  [<ffffffff8110fbd5>] add_to_page_cache_lru+0x15/0x40
>  [<ffffffff8110ff37>] find_or_create_page+0x57/0x90
>  [<ffffffff8118e630>] __getblk+0xf0/0x2f0
> 
> That __GFP_NOFAIL is getting down into
> radix_tree_preload->kmem_cache_alloc() and I expect that in its
> boundless stupidity, slab has decided to inappropriately go and use an
> unnecessarily massive page size for radix_tree_node_cachep's underlying
> memory allocations.  So we end up using GFP_NOFAIL for an order=2 (or
> more) allocation, which is unacceptably risky, methinks.
> 
> I really really wish slab wouldn't do this.  The benefit is surely very
> small and these unnecessary higher-order allocations are quite abusive
> of the page allocator.
> 
> Can we please make slab stop doing this?
> 
> radix_tree_nodes are 560 bytes and the kernel often allocates them in
> times of extreme memory stress.  We really really want them to be
> backed by order=0 pages.

Hello, Andrew.

Following patch would fix this problem.

Thanks.

-------------------8<------------------------
From 7f21232d1eeffccdbd0f6d79c04d297cf95a713e Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Wed, 4 Dec 2013 10:36:11 +0900
Subject: [PATCH] slub: fix high order page allocation problem with
 __GFP_NOFAIL

SLUB already tries the high-order page allocation with __GFP_NOFAIL
cleared.  But when allocating the shadow page for kmemcheck, it missed
clearing the flag.  This triggers the WARN_ON_ONCE() reported by
Christian Casteyde.

https://bugzilla.kernel.org/show_bug.cgi?id=65991

This patch fixes the situation by using the same allocation flags as the
original allocation.

Reported-by: Christian Casteyde <casteyde.christian@free.fr>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slub.c b/mm/slub.c
index 545a170..3dd28b1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1335,11 +1335,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	page = alloc_slab_page(alloc_gfp, node, oo);
 	if (unlikely(!page)) {
 		oo = s->min;
+		alloc_gfp = flags;
 		/*
 		 * Allocation may have failed due to fragmentation.
 		 * Try a lower order alloc if possible
 		 */
-		page = alloc_slab_page(flags, node, oo);
+		page = alloc_slab_page(alloc_gfp, node, oo);
 
 		if (page)
 			stat(s, ORDER_FALLBACK);
@@ -1349,7 +1350,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
 		int pages = 1 << oo_order(oo);
 
-		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
+		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
 
 		/*
 		 * Objects from caches that have a constructor don't get
-- 
1.7.9.5
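
(For readers following the gfp flow: a condensed sketch of
allocate_slab() around this hunk, simplified and partly inferred from
the diff context above, so the exact flag mangling in mm/slub.c may
differ in detail.)

gfp_t alloc_gfp = flags & ~__GFP_NOFAIL;	/* high-order attempt may fail */

page = alloc_slab_page(alloc_gfp, node, oo);	/* oo = s->oo, order can be >= 2 */
if (unlikely(!page)) {
	oo = s->min;				/* fall back to the minimum order */
	alloc_gfp = flags;			/* restored by the patch above */
	page = alloc_slab_page(alloc_gfp, node, oo);
}

/*
 * Previously the shadow page was allocated with the caller's "flags";
 * when the high-order attempt succeeded, oo was still the high order,
 * so __GFP_NOFAIL could reach an order >= 2 allocation here.  Passing
 * alloc_gfp keeps the shadow allocation consistent with the slab page.
 */
kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);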


^ permalink raw reply related	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04  1:52       ` Joonsoo Kim
  (?)
@ 2013-12-04  2:07         ` Andrew Morton
  -1 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2013-12-04  2:07 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Johannes Weiner, Michal Hocko, azurIt, linux-mm, cgroups,
	linux-kernel, Christian Casteyde, Pekka Enberg,
	Christoph Lameter

On Wed, 4 Dec 2013 10:52:18 +0900 Joonsoo Kim <iamjoonsoo.kim@lge.com> wrote:

> SLUB already try to allocate high order page with clearing __GFP_NOFAIL.
> But, when allocating shadow page for kmemcheck, it missed clearing
> the flag. This trigger WARN_ON_ONCE() reported by Christian Casteyde.
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=65991
> 
> This patch fix this situation by using same allocation flag as original
> allocation.
> 
> Reported-by: Christian Casteyde <casteyde.christian@free.fr>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 545a170..3dd28b1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1335,11 +1335,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  	page = alloc_slab_page(alloc_gfp, node, oo);
>  	if (unlikely(!page)) {
>  		oo = s->min;

What is the value of s->min?  Please tell me it's zero.

> +		alloc_gfp = flags;
>  		/*
>  		 * Allocation may have failed due to fragmentation.
>  		 * Try a lower order alloc if possible
>  		 */
> -		page = alloc_slab_page(flags, node, oo);
> +		page = alloc_slab_page(alloc_gfp, node, oo);
>  
>  		if (page)
>  			stat(s, ORDER_FALLBACK);

This change doesn't actually do anything.

> @@ -1349,7 +1350,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
>  		int pages = 1 << oo_order(oo);
>  
> -		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
> +		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);

That seems reasonable, assuming kmemcheck can handle the allocation
failure.


Still I dislike this practice of using unnecessarily large allocations.
What does it gain us?  Slightly improved object packing density. 
Anything else?


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04  2:07         ` Andrew Morton
@ 2013-12-04  2:42           ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-04  2:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Michal Hocko, azurIt, linux-mm, cgroups,
	linux-kernel, Christian Casteyde, Pekka Enberg,
	Christoph Lameter

On Tue, Dec 03, 2013 at 06:07:17PM -0800, Andrew Morton wrote:
> On Wed, 4 Dec 2013 10:52:18 +0900 Joonsoo Kim <iamjoonsoo.kim@lge.com> wrote:
> 
> > SLUB already try to allocate high order page with clearing __GFP_NOFAIL.
> > But, when allocating shadow page for kmemcheck, it missed clearing
> > the flag. This trigger WARN_ON_ONCE() reported by Christian Casteyde.
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=65991
> > 
> > This patch fix this situation by using same allocation flag as original
> > allocation.
> > 
> > Reported-by: Christian Casteyde <casteyde.christian@free.fr>
> > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > 
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 545a170..3dd28b1 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1335,11 +1335,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> >  	page = alloc_slab_page(alloc_gfp, node, oo);
> >  	if (unlikely(!page)) {
> >  		oo = s->min;
> 
> What is the value of s->min?  Please tell me it's zero.

s->min is calculated from get_order(object size),
so if the object size is less than or equal to PAGE_SIZE, it is zero.
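
A tiny userspace sketch of the get_order() semantics assumed here
(PAGE_SIZE taken to be 4096; the in-kernel implementation differs in
form but gives the same results for the sizes that matter):

#include <stdio.h>

/* Smallest order such that (PAGE_SIZE << order) >= size. */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	while ((4096UL << order) < size)
		order++;
	return order;
}

int main(void)
{
	/* 560 bytes (radix_tree_node), exactly one page, just over one page */
	printf("%u %u %u\n", get_order(560), get_order(4096), get_order(4097));
	/* prints "0 0 1": any object size <= PAGE_SIZE gives a minimum order of 0 */
	return 0;
}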

> 
> > +		alloc_gfp = flags;
> >  		/*
> >  		 * Allocation may have failed due to fragmentation.
> >  		 * Try a lower order alloc if possible
> >  		 */
> > -		page = alloc_slab_page(flags, node, oo);
> > +		page = alloc_slab_page(alloc_gfp, node, oo);
> >  
> >  		if (page)
> >  			stat(s, ORDER_FALLBACK);
> 
> This change doesn't actually do anything.

It sets alloc_gfp to flags, and we use alloc_gfp later.
It means that the fallback attempt, and everything after it, uses the
same flags as the original allocation request.

> 
> > @@ -1349,7 +1350,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> >  		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
> >  		int pages = 1 << oo_order(oo);
> >  
> > -		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
> > +		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
> 
> That seems reasonable, assuming kmemcheck can handle the allocation
> failure.

Yes, I looked at kmemcheck_alloc_shadow(); at a glance, it can handle failure.

> 
> Still I dislike this practice of using unnecessarily large allocations.
> What does it gain us?  Slightly improved object packing density. 
> Anything else?

I have no likes or dislikes of my own here.
Perhaps Christoph can answer that.

Thanks.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04  2:07         ` Andrew Morton
  (?)
@ 2013-12-04 15:17           ` Christoph Lameter
  -1 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-04 15:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Joonsoo Kim, Johannes Weiner, Michal Hocko, azurIt, linux-mm,
	cgroups, linux-kernel, Christian Casteyde, Pekka Enberg

On Tue, 3 Dec 2013, Andrew Morton wrote:

> >  	page = alloc_slab_page(alloc_gfp, node, oo);
> >  	if (unlikely(!page)) {
> >  		oo = s->min;
>
> What is the value of s->min?  Please tell me it's zero.

It usually is.

> > @@ -1349,7 +1350,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> >  		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
> >  		int pages = 1 << oo_order(oo);
> >
> > -		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
> > +		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
>
> That seems reasonable, assuming kmemcheck can handle the allocation
> failure.
>
>
> Still I dislike this practice of using unnecessarily large allocations.
> What does it gain us?  Slightly improved object packing density.
> Anything else?

The fastpath for slub works only within the bounds of a single slab page.
Therefore a larger frame increases the number of allocations possible from
the fastpath without having to use the slowpath, and also reduces the
management overhead in the partial lists.

There is a kernel parameter that can be used to control the maximum order

	slub_max_order

The default is PAGE_ALLOC_COSTLY_ORDER. See also
Documentation/vm/slub.txt.

Booting with slub_max_order=1 will force order 0/1 pages.
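
For example (illustrative commands; this assumes SLUB's sysfs interface
is enabled and the cache name is the usual "radix_tree_node"):

# append to the kernel command line in the bootloader configuration:
... ro quiet slub_max_order=1

# after boot, check the order actually chosen for a given cache:
cat /sys/kernel/slab/radix_tree_node/order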


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04 15:17           ` Christoph Lameter
@ 2013-12-04 16:02             ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-04 16:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Joonsoo Kim, Johannes Weiner, Michal Hocko,
	azurIt, Linux Memory Management List, cgroups, LKML,
	Christian Casteyde, Pekka Enberg

2013/12/5 Christoph Lameter <cl@linux.com>:
> On Tue, 3 Dec 2013, Andrew Morton wrote:
>
>> >     page = alloc_slab_page(alloc_gfp, node, oo);
>> >     if (unlikely(!page)) {
>> >             oo = s->min;
>>
>> What is the value of s->min?  Please tell me it's zero.
>
> It usually is.
>
>> > @@ -1349,7 +1350,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> >             && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
>> >             int pages = 1 << oo_order(oo);
>> >
>> > -           kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
>> > +           kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
>>
>> That seems reasonable, assuming kmemcheck can handle the allocation
>> failure.
>>
>>
>> Still I dislike this practice of using unnecessarily large allocations.
>> What does it gain us?  Slightly improved object packing density.
>> Anything else?
>
> The fastpath for slub works only within the bounds of a single slab page.
> Therefore a larger frame increases the number of allocation possible from
> the fastpath without having to use the slowpath and also reduces the
> management overhead in the partial lists.

Hello Christoph.

Now that we have the cpu partial slabs facility, I think the slowpath isn't
really slow anymore.  And it doesn't increase the management overhead in the
node partial lists much, because of the cpu partial slabs.

On the other hand, a larger frame may cause more slab_lock or cmpxchg
contention if there are parallel freeings.

But I don't know which one is better.  Is the larger frame still better? :)

Thanks.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04 16:02             ` Joonsoo Kim
@ 2013-12-04 16:33               ` Christoph Lameter
  -1 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-04 16:33 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, Joonsoo Kim, Johannes Weiner, Michal Hocko,
	azurIt, Linux Memory Management List, cgroups, LKML,
	Christian Casteyde, Pekka Enberg

On Thu, 5 Dec 2013, Joonsoo Kim wrote:

> Now we have cpu partial slabs facility, so I think that slowpath isn't really
> slow. And it doesn't much increase the management overhead in the node
> partial lists, because of cpu partial slabs.

Well yes that may address some of the issues here.

> And larger frame may cause more slab_lock contention or cmpxchg contention
> if there are parallel freeings.
>
> But, I don't know which one is better. Is larger frame still better? :)

Could you run some tests to figure this one out?  There are also
some situations in which we disable the per-cpu partial pages, though,
e.g. for low latency/realtime.  I posted in-kernel synthetic
benchmarks for slab a while back.  That may be something to start with.



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04 16:33               ` Christoph Lameter
@ 2013-12-05  8:44                 ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-05  8:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, azurIt,
	Linux Memory Management List, cgroups, LKML, Christian Casteyde,
	Pekka Enberg

On Wed, Dec 04, 2013 at 04:33:43PM +0000, Christoph Lameter wrote:
> On Thu, 5 Dec 2013, Joonsoo Kim wrote:
> 
> > Now we have cpu partial slabs facility, so I think that slowpath isn't really
> > slow. And it doesn't much increase the management overhead in the node
> > partial lists, because of cpu partial slabs.
> 
> Well yes that may address some of the issues here.
> 
> > And larger frame may cause more slab_lock contention or cmpxchg contention
> > if there are parallel freeings.
> >
> > But, I don't know which one is better. Is larger frame still better? :)
> 
> Could you run some tests to figure this one out? There are also
> some situations in which we disable the per cpu partial pages though.
> F.e. for low latency/realtime. I posted in kernel synthetic
> benchmarks for slab a while back. That maybe something to start with.

I could try, but my attempt alone probably wouldn't settle this, since my
machine has just 4 cores and normally cannot produce heavy contention.
Anyway, could you tell me where I can find your synthetic benchmarks for slab?

Thanks.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-05  8:44                 ` Joonsoo Kim
@ 2013-12-05 18:50                   ` Christoph Lameter
  -1 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-05 18:50 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, azurIt,
	Linux Memory Management List, cgroups, LKML, Christian Casteyde,
	Pekka Enberg

On Thu, 5 Dec 2013, Joonsoo Kim wrote:

> I could try. But my trial would not figure this out, since my machine has
> just 4 cores which normally cannot produce heavy contention.

I think that is fine for starters. Once we know what to look for we can
find machines to test specific scenarios.

> Anyway, could you tell me where I can find your synthetic benchmarks for slab?

https://lkml.org/lkml/2009/10/13/459

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-05 18:50                   ` Christoph Lameter
@ 2013-12-06  8:57                     ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-06  8:57 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, azurIt,
	Linux Memory Management List, cgroups, LKML, Christian Casteyde,
	Pekka Enberg

On Thu, Dec 05, 2013 at 06:50:50PM +0000, Christoph Lameter wrote:
> On Thu, 5 Dec 2013, Joonsoo Kim wrote:
> 
> > I could try, but my testing alone would not settle this, since my machine
> > has just 4 cores, which normally cannot produce heavy contention.
> 
> I think that is fine for starters. Once we know what to look for we can
> find machines to test specific scenarios.
> 
> > Anyway, could you tell me where I can find your synthetic benchmarks for slab?
> 
> https://lkml.org/lkml/2009/10/13/459

Okay.
Thanks.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-04  1:52       ` Joonsoo Kim
  (?)
@ 2013-12-13  6:58         ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-13  6:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Michal Hocko, azurIt, linux-mm, cgroups,
	linux-kernel, Christian Casteyde, Pekka Enberg,
	Christoph Lameter

On Wed, Dec 04, 2013 at 10:52:18AM +0900, Joonsoo Kim wrote:
> On Tue, Dec 03, 2013 at 04:59:10PM -0800, Andrew Morton wrote:
> > On Tue,  8 Oct 2013 16:58:10 -0400 Johannes Weiner <hannes@cmpxchg.org> wrote:
> > 
> > > Buffer allocation has a very crude indefinite loop around waking the
> > > flusher threads and performing global NOFS direct reclaim because it
> > > can not handle allocation failures.
> > > 
> > > The most immediate problem with this is that the allocation may fail
> > > due to a memory cgroup limit, where flushers + direct reclaim might
> > > not make any progress towards resolving the situation at all.  Because
> > > unlike the global case, a memory cgroup may not have any cache at all,
> > > only anonymous pages but no swap.  This situation will lead to a
> > > reclaim livelock with insane IO from waking the flushers and thrashing
> > > unrelated filesystem cache in a tight loop.
> > > 
> > > Use __GFP_NOFAIL allocations for buffers for now.  This makes sure
> > > that any looping happens in the page allocator, which knows how to
> > > orchestrate kswapd, direct reclaim, and the flushers sensibly.  It
> > > also allows memory cgroups to detect allocations that can't handle
> > > failure and will allow them to ultimately bypass the limit if reclaim
> > > can not make progress.
> > 
> > Problem.
> > 
> > > --- a/fs/buffer.c
> > > +++ b/fs/buffer.c
> > > @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
> > >  	struct buffer_head *bh;
> > >  	sector_t end_block;
> > >  	int ret = 0;		/* Will call free_more_memory() */
> > > +	gfp_t gfp_mask;
> > >  
> > > -	page = find_or_create_page(inode->i_mapping, index,
> > > -		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
> > > +	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
> > > +	gfp_mask |= __GFP_MOVABLE;
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=65991
> > 
> > WARNING: CPU: 0 PID: 1 at mm/page_alloc.c:1539 get_page_from_freelist+0x8a9/0x8c0()
> > Modules linked in:
> > CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc1 #42
> > Hardware name: Acer Aspire 7750G/JE70_HR, BIOS V1.07 03/02/2011
> >  0000000000000009 ffff8801c6121650 ffffffff81898d39 0000000000000000
> >  ffff8801c6121688 ffffffff8107dc43 0000000000000002 0000000000000001
> >  0000000000284850 0000000000000000 ffff8801cec04680 ffff8801c6121698
> > Call Trace:
> >  [<ffffffff81898d39>] dump_stack+0x4e/0x7a
> >  [<ffffffff8107dc43>] warn_slowpath_common+0x73/0x90
> >  [<ffffffff8107dd15>] warn_slowpath_null+0x15/0x20
> >  [<ffffffff81116f69>] get_page_from_freelist+0x8a9/0x8c0
> >  [<ffffffff81330cdd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
> >  [<ffffffff81117070>] __alloc_pages_nodemask+0xf0/0x770
> >  [<ffffffff81330cdd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
> >  [<ffffffff81156823>] kmemcheck_alloc_shadow+0x53/0xf0
> >  [<ffffffff81152495>] new_slab+0x345/0x3e0
> >  [<ffffffff81897712>] __slab_alloc.isra.57+0x215/0x535
> >  [<ffffffff81328030>] ? __radix_tree_preload+0x60/0xf0
> >  [<ffffffff811545c8>] kmem_cache_alloc+0x118/0x150
> >  [<ffffffff81328030>] ? __radix_tree_preload+0x60/0xf0
> >  [<ffffffff81328030>] __radix_tree_preload+0x60/0xf0
> >  [<ffffffff81328125>] radix_tree_maybe_preload+0x25/0x30
> >  [<ffffffff8110faf7>] add_to_page_cache_locked+0x37/0x100
> >  [<ffffffff8110fbd5>] add_to_page_cache_lru+0x15/0x40
> >  [<ffffffff8110ff37>] find_or_create_page+0x57/0x90
> >  [<ffffffff8118e630>] __getblk+0xf0/0x2f0
> > 
> > That __GFP_NOFAIL is getting down into
> > radix_tree_preload->kmem_cache_alloc() and I expect that in its
> > boundless stupidity, slab has decided to inappropriately go and use an
> > unnecessarily massive page size for radix_tree_node_cachep's underlying
> > memory allocations.  So we end up using GFP_NOFAIL for an order=2 (or
> > more) allocation, which is unacceptably risky, methinks.
> > 
> > I really really wish slab wouldn't do this.  The benefit is surely very
> > small and these unnecessary higher-order allocations are quite abusive
> > of the page allocator.
> > 
> > Can we please make slab stop doing this?
> > 
> > radix_tree_nodes are 560 bytes and the kernel often allocates them in
> > times of extreme memory stress.  We really really want them to be
> > backed by order=0 pages.
> 
> Hello, Andrew.
> 
> The following patch should fix this problem.
> 
> Thanks.
> 
> -------------------8<------------------------
> >From 7f21232d1eeffccdbd0f6d79c04d297cf95a713e Mon Sep 17 00:00:00 2001
> From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Date: Wed, 4 Dec 2013 10:36:11 +0900
> Subject: [PATCH] slub: fix high order page allocation problem with
>  __GFP_NOFAIL
> 
> SLUB already tries to allocate high-order pages with __GFP_NOFAIL cleared.
> But when allocating the shadow page for kmemcheck, it missed clearing
> the flag. This triggers the WARN_ON_ONCE() reported by Christian Casteyde.
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=65991
> 
> This patch fixes the situation by using the same allocation flags as the
> original allocation.
> 
> Reported-by: Christian Casteyde <casteyde.christian@free.fr>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 545a170..3dd28b1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1335,11 +1335,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  	page = alloc_slab_page(alloc_gfp, node, oo);
>  	if (unlikely(!page)) {
>  		oo = s->min;
> +		alloc_gfp = flags;
>  		/*
>  		 * Allocation may have failed due to fragmentation.
>  		 * Try a lower order alloc if possible
>  		 */
> -		page = alloc_slab_page(flags, node, oo);
> +		page = alloc_slab_page(alloc_gfp, node, oo);
>  
>  		if (page)
>  			stat(s, ORDER_FALLBACK);
> @@ -1349,7 +1350,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
>  		int pages = 1 << oo_order(oo);
>  
> -		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
> +		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
>  
>  		/*
>  		 * Objects from caches that have a constructor don't get
> -- 
> 1.7.9.5

Hello, Pekka and Christoph.

Could you review this patch?
I think that we should merge it to fix the problem reported by Christian.

Thanks.
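
For reference, the call chain behind the warning quoted above, compressed
into a sketch. This is illustrative only: sketch_grow_dev_page() is an
invented name, the gfp composition follows the grow_dev_page() hunk and
description quoted earlier, and the intermediate calls are summarized in a
comment rather than spelled out.

	#include <linux/gfp.h>
	#include <linux/pagemap.h>

	/* Sketch: the buffer layer's gfp mask travels unchanged into the
	 * radix tree node slab allocation. */
	static struct page *sketch_grow_dev_page(struct address_space *mapping,
						 pgoff_t index)
	{
		gfp_t gfp = (mapping_gfp_mask(mapping) & ~__GFP_FS) |
			    __GFP_MOVABLE | __GFP_NOFAIL;

		/*
		 * find_or_create_page() -> add_to_page_cache_lru()
		 *   -> radix_tree_maybe_preload() -> kmem_cache_alloc()
		 *     -> new_slab() may try an order >= 2 page for
		 *        radix_tree_node_cachep; the speculative high-order
		 *        attempt clears __GFP_NOFAIL, but the kmemcheck
		 *        shadow allocation in the trace does not, which is
		 *        what the patch quoted above addresses.
		 */
		return find_or_create_page(mapping, index, gfp);
	}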

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-13  6:58         ` Joonsoo Kim
@ 2013-12-13 16:40           ` Christoph Lameter
  -1 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-13 16:40 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, azurIt, linux-mm,
	cgroups, linux-kernel, Christian Casteyde, Pekka Enberg

On Fri, 13 Dec 2013, Joonsoo Kim wrote:

> Could you review this patch?
> I think that we should merge it to fix the problem reported by Christian.

I'd be fine with clearing __GFP_NOFAIL but not with using the same flags
as for a higher order alloc. __GFP_NORETRY and __GFP_NOWARN should be left
untouched for the minimal alloc.
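
Spelled out, one way to read the two options being weighed here (the helper
names are invented; "flags" and "alloc_gfp" are the parameter and local
variable of allocate_slab() in the patch quoted earlier in the thread):

	#include <linux/gfp.h>

	/* As in the posted patch: the kmemcheck shadow reuses whatever mask
	 * the slab page itself was finally allocated with (NOFAIL cleared
	 * plus NORETRY|NOWARN on the high-order attempt, the caller's
	 * original flags on the minimal-order fallback). */
	static inline gfp_t shadow_gfp_reuse(gfp_t alloc_gfp)
	{
		return alloc_gfp;
	}

	/* Alternative sketched above: start from the caller's flags and strip
	 * only __GFP_NOFAIL, without inheriting the __GFP_NORETRY and
	 * __GFP_NOWARN that allocate_slab() adds for the speculative
	 * high-order attempt. */
	static inline gfp_t shadow_gfp_strip_nofail(gfp_t flags)
	{
		return flags & ~__GFP_NOFAIL;
	}

Either way, the question is only which mask the kmemcheck shadow and the
minimal-order fallback see; the high-order fast path is unchanged.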


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [patch 2/2] fs: buffer: move allocation failure loop into the allocator
  2013-12-13 16:40           ` Christoph Lameter
  (?)
@ 2013-12-16  8:22             ` Joonsoo Kim
  -1 siblings, 0 replies; 39+ messages in thread
From: Joonsoo Kim @ 2013-12-16  8:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, azurIt, linux-mm,
	cgroups, linux-kernel, Christian Casteyde, Pekka Enberg

On Fri, Dec 13, 2013 at 04:40:58PM +0000, Christoph Lameter wrote:
> On Fri, 13 Dec 2013, Joonsoo Kim wrote:
> 
> > Could you review this patch?
> > I think that we should merge it to fix the problem reported by Christian.
> 
> I'd be fine with clearing __GFP_NOFAIL but not with using the same flags
> as for a higher order alloc. __GFP_NORETRY and __GFP_NOWARN should be left
> untouched for the minimal alloc.

Hello.

So you don't want to add __GFP_NORETRY and __GFP_NOWARN for kmemcheck?
I don't think that is a good idea, since users would see *unexpected*
allocation failures if they enable kmemcheck and SLUB uses different flags
for kmemcheck. That would confuse users who are trying to debug their own problems.

Thanks.

^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2013-12-16  8:22 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-08 20:58 [patch 1/2] mm: memcg: handle non-error OOM situations more gracefully Johannes Weiner
2013-10-08 20:58 ` Johannes Weiner
2013-10-08 20:58 ` Johannes Weiner
2013-10-08 20:58 ` [patch 2/2] fs: buffer: move allocation failure loop into the allocator Johannes Weiner
2013-10-08 20:58   ` Johannes Weiner
2013-10-08 20:58   ` Johannes Weiner
2013-10-11 20:51   ` Andrew Morton
2013-10-11 20:51     ` Andrew Morton
2013-12-04  0:59   ` Andrew Morton
2013-12-04  0:59     ` Andrew Morton
2013-12-04  1:52     ` Joonsoo Kim
2013-12-04  1:52       ` Joonsoo Kim
2013-12-04  1:52       ` Joonsoo Kim
2013-12-04  2:07       ` Andrew Morton
2013-12-04  2:07         ` Andrew Morton
2013-12-04  2:07         ` Andrew Morton
2013-12-04  2:42         ` Joonsoo Kim
2013-12-04  2:42           ` Joonsoo Kim
2013-12-04 15:17         ` Christoph Lameter
2013-12-04 15:17           ` Christoph Lameter
2013-12-04 15:17           ` Christoph Lameter
2013-12-04 16:02           ` Joonsoo Kim
2013-12-04 16:02             ` Joonsoo Kim
2013-12-04 16:33             ` Christoph Lameter
2013-12-04 16:33               ` Christoph Lameter
2013-12-05  8:44               ` Joonsoo Kim
2013-12-05  8:44                 ` Joonsoo Kim
2013-12-05 18:50                 ` Christoph Lameter
2013-12-05 18:50                   ` Christoph Lameter
2013-12-06  8:57                   ` Joonsoo Kim
2013-12-06  8:57                     ` Joonsoo Kim
2013-12-13  6:58       ` Joonsoo Kim
2013-12-13  6:58         ` Joonsoo Kim
2013-12-13  6:58         ` Joonsoo Kim
2013-12-13 16:40         ` Christoph Lameter
2013-12-13 16:40           ` Christoph Lameter
2013-12-16  8:22           ` Joonsoo Kim
2013-12-16  8:22             ` Joonsoo Kim
2013-12-16  8:22             ` Joonsoo Kim
