* [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Sasha Levin @ 2015-03-31 22:11 UTC
  To: linux-kernel
  Cc: mhocko, Sasha Levin, Andrew Morton, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

Freeing pages has become a rather costly operation, especially when multiple debug
options are enabled. This causes hangs when an attempt is made to free a large
number of 0-order pages. Two examples are vfree()ing a large block of memory, and
punching a hole in a shmem filesystem.

To avoid that, move any free operations that involve batching pages into a
list to a workqueue handler, where they can be freed later.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
---
 mm/page_alloc.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5bd9711..812ca75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1586,10 +1586,11 @@ out:
 	local_irq_restore(flags);
 }
 
-/*
- * Free a list of 0-order pages
- */
-void free_hot_cold_page_list(struct list_head *list, bool cold)
+static LIST_HEAD(free_hot_page_list);
+static LIST_HEAD(free_cold_page_list);
+static DEFINE_SPINLOCK(free_page_lock);
+
+static void __free_hot_cold_page_list(struct list_head *list, bool cold)
 {
 	struct page *page, *next;
 
@@ -1599,6 +1600,47 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
 	}
 }
 
+static void free_page_lists_work(struct work_struct *work)
+{
+	LIST_HEAD(hot_pages);
+	LIST_HEAD(cold_pages);
+	unsigned long flags;
+
+	spin_lock_irqsave(&free_page_lock, flags);
+	list_cut_position(&hot_pages, &free_hot_page_list,
+					free_hot_page_list.prev);
+	list_cut_position(&cold_pages, &free_cold_page_list,
+					free_cold_page_list.prev);
+	spin_unlock_irqrestore(&free_page_lock, flags);
+
+	__free_hot_cold_page_list(&hot_pages, false);
+	__free_hot_cold_page_list(&cold_pages, true);
+}
+
+static DECLARE_WORK(free_page_work, free_page_lists_work);
+
+/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, bool cold)
+{
+	unsigned long flags;
+
+	if (unlikely(!keventd_up())) {
+		__free_hot_cold_page_list(list, cold);
+		return;
+	}
+
+	spin_lock_irqsave(&free_page_lock, flags);
+	if (cold)
+		list_splice_tail(list, &free_cold_page_list);
+	else
+		list_splice_tail(list, &free_hot_page_list);
+	spin_unlock_irqrestore(&free_page_lock, flags);
+
+	schedule_work(&free_page_work);
+}
+
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
-- 
1.7.10.4



* [PATCH 2/2] mm: __free_pages batch up 0-order pages for freeing
From: Sasha Levin @ 2015-03-31 22:11 UTC
  To: linux-kernel
  Cc: mhocko, Sasha Levin, Andrew Morton, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

Rather than calling free_hot_cold_page() for every page, batch the pages up in a
list and pass them on to free_hot_cold_page_list(). This lets us defer
them to a workqueue.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
---
 mm/page_alloc.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 812ca75..e58e795 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2997,12 +2997,16 @@ EXPORT_SYMBOL(get_zeroed_page);
 
 void __free_pages(struct page *page, unsigned int order)
 {
+	LIST_HEAD(hot_cold_pages);
+
 	if (put_page_testzero(page)) {
 		if (order == 0)
-			free_hot_cold_page(page, false);
+			list_add(&page->lru, &hot_cold_pages);
 		else
 			__free_pages_ok(page, order);
 	}
+
+	free_hot_cold_page_list(&hot_cold_pages, false);
 }
 
 EXPORT_SYMBOL(__free_pages);
-- 
1.7.10.4



* Re: [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Andrew Morton @ 2015-03-31 22:31 UTC
  To: Sasha Levin
  Cc: linux-kernel, mhocko, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

On Tue, 31 Mar 2015 18:11:32 -0400 Sasha Levin <sasha.levin@oracle.com> wrote:

> Freeing pages has become a rather costly operation, especially when multiple debug
> options are enabled. This causes hangs when an attempt is made to free a large
> number of 0-order pages. Two examples are vfree()ing a large block of memory, and
> punching a hole in a shmem filesystem.
> 
> To avoid that, move any free operations that involve batching pages into a
> list to a workqueue handler, where they can be freed later.

eek.

__free_pages() is going to be a hot path for someone - it has 500+
callsites.

And this patch might cause problems for rt_prio() tasks which run for a
long time, starving out the workqueue thread.  And probably other stuff
I didn't think of...

What whacky debug option is actually causing this?  Full-page poisoning?



Stick a cond_resched() in __vunmap() ;)
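
A minimal sketch of what that suggestion could look like, assuming the
page-freeing loop in __vunmap() (mm/vmalloc.c of this era); as the reply
below points out, it would not help callers in atomic context:

	unsigned int i;

	/* sketch: let a huge vfree() yield between page frees */
	for (i = 0; i < area->nr_pages; i++) {
		struct page *page = area->pages[i];

		BUG_ON(!page);
		__free_page(page);
		cond_resched();
	}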


* Re: [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Sasha Levin @ 2015-03-31 22:39 UTC
  To: Andrew Morton
  Cc: linux-kernel, mhocko, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

On 03/31/2015 06:31 PM, Andrew Morton wrote:
> On Tue, 31 Mar 2015 18:11:32 -0400 Sasha Levin <sasha.levin@oracle.com> wrote:
> 
>> Freeing pages has become a rather costly operation, especially when multiple debug
>> options are enabled. This causes hangs when an attempt is made to free a large
>> number of 0-order pages. Two examples are vfree()ing a large block of memory, and
>> punching a hole in a shmem filesystem.
>> 
>> To avoid that, move any free operations that involve batching pages into a
>> list to a workqueue handler, where they can be freed later.
> eek.
> 
> __free_pages() is going to be a hot path for someone - it has 500+
> callsites.

I guess we could make the workqueue deferral depend on how many pages are going to
get freed?
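
A sketch of that idea, with a hypothetical helper and an arbitrary cutoff
(neither exists in the kernel): defer to the workqueue only when the list is
long enough for synchronous freeing to hurt.

	#define DEFER_FREE_THRESHOLD	512	/* arbitrary cutoff */

	/* true if the list has more than n entries */
	static bool list_longer_than(struct list_head *list, unsigned int n)
	{
		struct list_head *pos;

		list_for_each(pos, list)
			if (n-- == 0)
				return true;
		return false;
	}

	void free_hot_cold_page_list(struct list_head *list, bool cold)
	{
		if (!list_longer_than(list, DEFER_FREE_THRESHOLD)) {
			__free_hot_cold_page_list(list, cold);
			return;
		}
		/* ... splice onto the deferred lists and schedule_work() ... */
	}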

> And this patch might cause problems for rt_prio() tasks which run for a
> long time, starving out the workqueue thread.  And probably other stuff
> I didn't think of...

Give it its own workqueue?
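
A sketch of a dedicated queue for the free_page_work item from patch 1/2; the
queue name and flags are assumptions, but WQ_MEM_RECLAIM would give it a
rescuer thread so freeing keeps making progress under memory pressure:

	static struct workqueue_struct *free_page_wq;

	static int __init free_page_wq_init(void)
	{
		free_page_wq = alloc_workqueue("free_pages", WQ_MEM_RECLAIM, 0);
		return free_page_wq ? 0 : -ENOMEM;
	}
	early_initcall(free_page_wq_init);

	/*
	 * Callers would then use queue_work(free_page_wq, &free_page_work)
	 * instead of schedule_work(&free_page_work).
	 */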

> What whacky debug option is actually causing this?  Full-page poisoning?

I think the winner here is the debug objects checking, which looks for
active objects in pages that get freed.

> Stick a cond_resched() in __vunmap() ;)

If only it was that simple :)

Not only does it get called in atomic context, but the problem is not just the one
thread locking up; it's also the lock dependencies that cause other processes
to lock up. This is the shmem example I mentioned in the commit log.

We have one random process crying about being stuck for two minutes:

[ 2885.711517] INFO: task trinity-c5:7071 blocked for more than 120 seconds.
[ 2885.714534]       Not tainted 4.0.0-rc6-next-20150331-sasha-00036-g29ef5d2 #2108
[ 2885.717519] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2885.719472] trinity-c5      D ffff88011604fc18 26704  7071   9144 0x10000004
[ 2885.721271]  ffff88011604fc18 ffff880127bb3d80 0000000000000001 0000000000000000
[ 2885.722842]  ffff8801291e1588 ffff8801291e1560 ffff880127bb3008 ffff8801f9218000
[ 2885.724431]  ffff880127bb3000 ffff88011604fbf8 ffff880116048000 ffffed0022c09002
[ 2885.726088] Call Trace:
[ 2885.726612] schedule (./arch/x86/include/asm/bitops.h:311 (discriminator 1) kernel/sched/core.c:2827 (discriminator 1))
[ 2885.727523] schedule_preempt_disabled (kernel/sched/core.c:2859)
[ 2885.728639] mutex_lock_nested (kernel/locking/mutex.c:585 kernel/locking/mutex.c:623)
[ 2885.736019] chown_common (fs/open.c:595)
[ 2885.745761] SyS_fchown (fs/open.c:663 fs/open.c:650)
[ 2885.746714] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)
[ 2885.747758] 2 locks held by trinity-c5/7071:
[ 2885.748545] #0: (sb_writers#10){.+.+.+}, at: mnt_want_write_file (fs/namespace.c:445)
[ 2885.751407] #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: chown_common (fs/open.c:595)
[ 2885.755143] Mutex: counter: -1 owner: trinity-c6

While shmem works tirelessly to free up its pages:

[ 2896.340953] trinity-c6      R  running task    27040  6561   9144 0x10000006
[ 2896.342673]  ffff8802e72576a8 ffff8802e7257758 ffffffffabfdd628 003c5e36ef1674fa
[ 2896.344267]  ffff8801533e1588 ffff8801533e1560 ffff8802d3963778 ffff8802ad220000
[ 2896.345824]  ffff8802d3963000 0000000000000000 ffff8802e7250000 ffffed005ce4a002
[ 2896.347286] Call Trace:
[ 2896.347784] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42)
[ 2896.348977] preempt_schedule_common (./arch/x86/include/asm/preempt.h:77 (discriminator 1) kernel/sched/core.c:2867 (discriminator 1))
[ 2896.350279] preempt_schedule (kernel/sched/core.c:2893)
[ 2896.351349] ___preempt_schedule (arch/x86/lib/thunk_64.S:51)
[ 2896.353782] __debug_check_no_obj_freed (lib/debugobjects.c:713)
[ 2896.360001] debug_check_no_obj_freed (lib/debugobjects.c:727)
[ 2896.361574] free_pages_prepare (mm/page_alloc.c:823)
[ 2896.362657] free_hot_cold_page (mm/page_alloc.c:1550)
[ 2896.363735] free_hot_cold_page_list (mm/page_alloc.c:1596 (discriminator 3))
[ 2896.364846] release_pages (mm/swap.c:935)
[ 2896.367979] __pagevec_release (include/linux/pagevec.h:44 mm/swap.c:1013)
[ 2896.369149] shmem_undo_range (include/linux/pagevec.h:69 mm/shmem.c:446)
[ 2896.377070] shmem_truncate_range (mm/shmem.c:541)
[ 2896.378450] shmem_setattr (mm/shmem.c:577)
[ 2896.379556] notify_change (fs/attr.c:270)
[ 2896.382804] do_truncate (fs/open.c:62)
[ 2896.387739] do_sys_ftruncate.constprop.4 (fs/open.c:191)
[ 2896.389450] SyS_ftruncate (fs/open.c:199)
[ 2896.390879] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)


Thanks,
Sasha


* Re: [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Andrew Morton @ 2015-03-31 22:54 UTC
  To: Sasha Levin
  Cc: linux-kernel, mhocko, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

On Tue, 31 Mar 2015 18:39:42 -0400 Sasha Levin <sasha.levin@oracle.com> wrote:

> 
> > Stick a cond_resched() in __vunmap() ;)
> 
> If only it was that simple :)
> 
> Not only does it get called in atomic context, 

Drat.  Who's calling vfree() from non-interrupt, atomic context for
vast regions?

> but the problem is not just the one
> thread locking up; it's also the lock dependencies that cause other processes
> to lock up. This is the shmem example I mentioned in the commit log.
> 
> We have one random process crying about being stuck for two minutes:
> 
> [ 2885.711517] INFO: task trinity-c5:7071 blocked for more than 120 seconds.
> [ 2885.714534]       Not tainted 4.0.0-rc6-next-20150331-sasha-00036-g29ef5d2 #2108
> [ 2885.717519] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 2885.719472] trinity-c5      D ffff88011604fc18 26704  7071   9144 0x10000004
> [ 2885.721271]  ffff88011604fc18 ffff880127bb3d80 0000000000000001 0000000000000000
> [ 2885.722842]  ffff8801291e1588 ffff8801291e1560 ffff880127bb3008 ffff8801f9218000
> [ 2885.724431]  ffff880127bb3000 ffff88011604fbf8 ffff880116048000 ffffed0022c09002
> [ 2885.726088] Call Trace:
> [ 2885.726612] schedule (./arch/x86/include/asm/bitops.h:311 (discriminator 1) kernel/sched/core.c:2827 (discriminator 1))
> [ 2885.727523] schedule_preempt_disabled (kernel/sched/core.c:2859)
> [ 2885.728639] mutex_lock_nested (kernel/locking/mutex.c:585 kernel/locking/mutex.c:623)
> [ 2885.736019] chown_common (fs/open.c:595)
> [ 2885.745761] SyS_fchown (fs/open.c:663 fs/open.c:650)
> [ 2885.746714] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)
> [ 2885.747758] 2 locks held by trinity-c5/7071:
> [ 2885.748545] #0: (sb_writers#10){.+.+.+}, at: mnt_want_write_file (fs/namespace.c:445)
> [ 2885.751407] #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: chown_common (fs/open.c:595)
> [ 2885.755143] Mutex: counter: -1 owner: trinity-c6
> 
> While shmem works tirelessly to free up its pages:
> 
> [ 2896.340953] trinity-c6      R  running task    27040  6561   9144 0x10000006
> [ 2896.342673]  ffff8802e72576a8 ffff8802e7257758 ffffffffabfdd628 003c5e36ef1674fa
> [ 2896.344267]  ffff8801533e1588 ffff8801533e1560 ffff8802d3963778 ffff8802ad220000
> [ 2896.345824]  ffff8802d3963000 0000000000000000 ffff8802e7250000 ffffed005ce4a002
> [ 2896.347286] Call Trace:
> [ 2896.347784] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42)
> [ 2896.348977] preempt_schedule_common (./arch/x86/include/asm/preempt.h:77 (discriminator 1) kernel/sched/core.c:2867 (discriminator 1))
> [ 2896.350279] preempt_schedule (kernel/sched/core.c:2893)
> [ 2896.351349] ___preempt_schedule (arch/x86/lib/thunk_64.S:51)
> [ 2896.353782] __debug_check_no_obj_freed (lib/debugobjects.c:713)
> [ 2896.360001] debug_check_no_obj_freed (lib/debugobjects.c:727)
> [ 2896.361574] free_pages_prepare (mm/page_alloc.c:823)
> [ 2896.362657] free_hot_cold_page (mm/page_alloc.c:1550)
> [ 2896.363735] free_hot_cold_page_list (mm/page_alloc.c:1596 (discriminator 3))
> [ 2896.364846] release_pages (mm/swap.c:935)
> [ 2896.367979] __pagevec_release (include/linux/pagevec.h:44 mm/swap.c:1013)
> [ 2896.369149] shmem_undo_range (include/linux/pagevec.h:69 mm/shmem.c:446)
> [ 2896.377070] shmem_truncate_range (mm/shmem.c:541)
> [ 2896.378450] shmem_setattr (mm/shmem.c:577)
> [ 2896.379556] notify_change (fs/attr.c:270)
> [ 2896.382804] do_truncate (fs/open.c:62)
> [ 2896.387739] do_sys_ftruncate.constprop.4 (fs/open.c:191)
> [ 2896.389450] SyS_ftruncate (fs/open.c:199)
> [ 2896.390879] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)

OK, so shmem_undo_range() is full of cond_resched()s but it's holding
i_mutex for too long.  Hugh, fix your junk!

Rather than mucking with the core page allocator I really do think it
would be better to bodge the offending callers for this problem.

And/or maybe extend the softlockup timeout when crazy debug options are
selected.  You're the only person who this will hurt ;)





* Re: [PATCH 2/2] mm: __free_pages batch up 0-order pages for freeing
From: Rasmus Villemoes @ 2015-04-01 12:48 UTC
  To: Sasha Levin
  Cc: linux-kernel, mhocko, Andrew Morton, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

On Wed, Apr 01 2015, Sasha Levin <sasha.levin@oracle.com> wrote:

> Rather than calling free_hot_cold_page() for every page, batch the pages up in a
> list and pass them on to free_hot_cold_page_list(). This lets us defer
> them to a workqueue.
>
> Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
> ---
>  mm/page_alloc.c |    6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 812ca75..e58e795 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2997,12 +2997,16 @@ EXPORT_SYMBOL(get_zeroed_page);
>  
>  void __free_pages(struct page *page, unsigned int order)
>  {
> +	LIST_HEAD(hot_cold_pages);
> +
>  	if (put_page_testzero(page)) {
>  		if (order == 0)
> -			free_hot_cold_page(page, false);
> +			list_add(&page->lru, &hot_cold_pages);
>  		else
>  			__free_pages_ok(page, order);
>  	}
> +
> +	free_hot_cold_page_list(&hot_cold_pages, false);

Is there a reason to do this function call when the list is empty? In
other words, why can't this just be done inside the if (order == 0)?

Rasmus
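
A sketch of the restructuring being asked about, with both the list and the
call moved under the order == 0 branch so the empty-list call disappears (an
illustration, not a posted revision):

	void __free_pages(struct page *page, unsigned int order)
	{
		if (put_page_testzero(page)) {
			if (order == 0) {
				LIST_HEAD(pages);

				list_add(&page->lru, &pages);
				free_hot_cold_page_list(&pages, false);
			} else {
				__free_pages_ok(page, order);
			}
		}
	}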


* Re: [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Vlastimil Babka @ 2015-04-01 12:57 UTC
  To: Sasha Levin, linux-kernel
  Cc: mhocko, Andrew Morton, Mel Gorman, Johannes Weiner,
	David Rientjes, Joonsoo Kim, open list:MEMORY MANAGEMENT

On 04/01/2015 12:11 AM, Sasha Levin wrote:
> Freeing pages has become a rather costly operation, especially when multiple debug
> options are enabled. This causes hangs when an attempt is made to free a large
> number of 0-order pages. Two examples are vfree()ing a large block of memory, and
> punching a hole in a shmem filesystem.
>
> To avoid that, move any free operations that involve batching pages into a
> list to a workqueue handler, where they can be freed later.

Is there a risk of creating a situation where memory is apparently
missing because the work item hasn't been processed yet? Leading to
allocation failures, needless reclaim, spurious OOMs, etc.? If so, such
situations should probably wait for completion of the work first?

And maybe it shouldn't be used everywhere (as patch 2/2 does) but only 
where it makes sense. Process exits, maybe?
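
A sketch of the "wait for completion" idea, assuming the free_page_work item
from patch 1/2; the helper name is hypothetical, and where to call it from is
exactly the open question:

	/* make pending deferred frees visible before giving up on an allocation */
	static void drain_deferred_free_pages(void)
	{
		flush_work(&free_page_work);	/* sleeps until the handler finishes */
	}

	/*
	 * E.g. the allocator slowpath could call drain_deferred_free_pages()
	 * before declaring OOM.
	 */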

> Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
> ---
>   mm/page_alloc.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 46 insertions(+), 4 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5bd9711..812ca75 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1586,10 +1586,11 @@ out:
>   	local_irq_restore(flags);
>   }
>
> -/*
> - * Free a list of 0-order pages
> - */
> -void free_hot_cold_page_list(struct list_head *list, bool cold)
> +static LIST_HEAD(free_hot_page_list);
> +static LIST_HEAD(free_cold_page_list);
> +static DEFINE_SPINLOCK(free_page_lock);
> +
> +static void __free_hot_cold_page_list(struct list_head *list, bool cold)
>   {
>   	struct page *page, *next;
>
> @@ -1599,6 +1600,47 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
>   	}
>   }
>
> +static void free_page_lists_work(struct work_struct *work)
> +{
> +	LIST_HEAD(hot_pages);
> +	LIST_HEAD(cold_pages);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&free_page_lock, flags);
> +	list_cut_position(&hot_pages, &free_hot_page_list,
> +					free_hot_page_list.prev);
> +	list_cut_position(&cold_pages, &free_cold_page_list,
> +					free_cold_page_list.prev);
> +	spin_unlock_irqrestore(&free_page_lock, flags);
> +
> +	__free_hot_cold_page_list(&hot_pages, false);
> +	__free_hot_cold_page_list(&cold_pages, true);
> +}
> +
> +static DECLARE_WORK(free_page_work, free_page_lists_work);
> +
> +/*
> + * Free a list of 0-order pages
> + */
> +void free_hot_cold_page_list(struct list_head *list, bool cold)
> +{
> +	unsigned long flags;
> +
> +	if (unlikely(!keventd_up())) {
> +		__free_hot_cold_page_list(list, cold);
> +		return;
> +	}
> +
> +	spin_lock_irqsave(&free_page_lock, flags);
> +	if (cold)
> +		list_splice_tail(list, &free_cold_page_list);
> +	else
> +		list_splice_tail(list, &free_hot_page_list);
> +	spin_unlock_irqrestore(&free_page_lock, flags);
> +
> +	schedule_work(&free_page_work);
> +}
> +
>   /*
>    * split_page takes a non-compound higher-order page, and splits it into
>    * n (1<<order) sub-pages: page[0..n]
>


* Re: [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Sasha Levin @ 2015-04-01 13:20 UTC
  To: Andrew Morton
  Cc: linux-kernel, mhocko, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

On 03/31/2015 06:54 PM, Andrew Morton wrote:
> On Tue, 31 Mar 2015 18:39:42 -0400 Sasha Levin <sasha.levin@oracle.com> wrote:
> 
>>
>>> Stick a cond_resched() in __vunmap() ;)
>>
>> If only it was that simple :)
>>
>> Not only does it get called in atomic context, 
> 
> Drat.  Who's calling vfree() from non-interrupt, atomic context for
> vast regions?

I have to admit that I don't have a clue. Michal and I discussed it at LSF/MM, and
he mentioned in his mail on the subject:

On 03/17/2015 04:58 AM, Michal Hocko wrote:
> Hmm, just looked into the git log and it seems that there are/were
> some callers of vfree with spinlock held (e.g. 9265f1d0c759 (GFS2:
> gfs2_dir_get_hash_table(): avoiding deferred vfree() is easy here...))
> and who knows how many others like that we have, so cond_resched() here is
> a no-no.
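
For reference, a sketch of deferring vfree() through a work item, along the
lines of what vfree() itself does for calls from interrupt context; the
helper name and error handling here are assumptions:

	struct deferred_vfree {
		struct work_struct work;
		void *addr;
	};

	static void vfree_work(struct work_struct *w)
	{
		struct deferred_vfree *dv = container_of(w, struct deferred_vfree, work);

		vfree(dv->addr);	/* now running in process context */
		kfree(dv);
	}

	/* hypothetical helper, callable with spinlocks held */
	static int schedule_vfree(void *addr)
	{
		struct deferred_vfree *dv = kmalloc(sizeof(*dv), GFP_ATOMIC);

		if (!dv)
			return -ENOMEM;
		dv->addr = addr;
		INIT_WORK(&dv->work, vfree_work);
		schedule_work(&dv->work);
		return 0;
	}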

>> but the problem is not just the one
>> thread locking up; it's also the lock dependencies that cause other processes
>> to lock up. This is the shmem example I mentioned in the commit log.
>>
>> We have one random process crying about being stuck for two minutes:
>>
>> [ 2885.711517] INFO: task trinity-c5:7071 blocked for more than 120 seconds.
>> [ 2885.714534]       Not tainted 4.0.0-rc6-next-20150331-sasha-00036-g29ef5d2 #2108
>> [ 2885.717519] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 2885.719472] trinity-c5      D ffff88011604fc18 26704  7071   9144 0x10000004
>> [ 2885.721271]  ffff88011604fc18 ffff880127bb3d80 0000000000000001 0000000000000000
>> [ 2885.722842]  ffff8801291e1588 ffff8801291e1560 ffff880127bb3008 ffff8801f9218000
>> [ 2885.724431]  ffff880127bb3000 ffff88011604fbf8 ffff880116048000 ffffed0022c09002
>> [ 2885.726088] Call Trace:
>> [ 2885.726612] schedule (./arch/x86/include/asm/bitops.h:311 (discriminator 1) kernel/sched/core.c:2827 (discriminator 1))
>> [ 2885.727523] schedule_preempt_disabled (kernel/sched/core.c:2859)
>> [ 2885.728639] mutex_lock_nested (kernel/locking/mutex.c:585 kernel/locking/mutex.c:623)
>> [ 2885.736019] chown_common (fs/open.c:595)
>> [ 2885.745761] SyS_fchown (fs/open.c:663 fs/open.c:650)
>> [ 2885.746714] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)
>> [ 2885.747758] 2 locks held by trinity-c5/7071:
>> [ 2885.748545] #0: (sb_writers#10){.+.+.+}, at: mnt_want_write_file (fs/namespace.c:445)
>> [ 2885.751407] #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: chown_common (fs/open.c:595)
>> [ 2885.755143] Mutex: counter: -1 owner: trinity-c6
>>
>> While shmem works tirelessly to free up its pages:
>>
>> [ 2896.340953] trinity-c6      R  running task    27040  6561   9144 0x10000006
>> [ 2896.342673]  ffff8802e72576a8 ffff8802e7257758 ffffffffabfdd628 003c5e36ef1674fa
>> [ 2896.344267]  ffff8801533e1588 ffff8801533e1560 ffff8802d3963778 ffff8802ad220000
>> [ 2896.345824]  ffff8802d3963000 0000000000000000 ffff8802e7250000 ffffed005ce4a002
>> [ 2896.347286] Call Trace:
>> [ 2896.347784] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42)
>> [ 2896.348977] preempt_schedule_common (./arch/x86/include/asm/preempt.h:77 (discriminator 1) kernel/sched/core.c:2867 (discriminator 1))
>> [ 2896.350279] preempt_schedule (kernel/sched/core.c:2893)
>> [ 2896.351349] ___preempt_schedule (arch/x86/lib/thunk_64.S:51)
>> [ 2896.353782] __debug_check_no_obj_freed (lib/debugobjects.c:713)
>> [ 2896.360001] debug_check_no_obj_freed (lib/debugobjects.c:727)
>> [ 2896.361574] free_pages_prepare (mm/page_alloc.c:823)
>> [ 2896.362657] free_hot_cold_page (mm/page_alloc.c:1550)
>> [ 2896.363735] free_hot_cold_page_list (mm/page_alloc.c:1596 (discriminator 3))
>> [ 2896.364846] release_pages (mm/swap.c:935)
>> [ 2896.367979] __pagevec_release (include/linux/pagevec.h:44 mm/swap.c:1013)
>> [ 2896.369149] shmem_undo_range (include/linux/pagevec.h:69 mm/shmem.c:446)
>> [ 2896.377070] shmem_truncate_range (mm/shmem.c:541)
>> [ 2896.378450] shmem_setattr (mm/shmem.c:577)
>> [ 2896.379556] notify_change (fs/attr.c:270)
>> [ 2896.382804] do_truncate (fs/open.c:62)
>> [ 2896.387739] do_sys_ftruncate.constprop.4 (fs/open.c:191)
>> [ 2896.389450] SyS_ftruncate (fs/open.c:199)
>> [ 2896.390879] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)
> 
> OK, so shmem_undo_range() is full of cond_resched()s but it's holding
> i_mutex for too long.  Hugh, fix your junk!
> 
> Rather than mucking with the core page allocator I really do think it
> would be better to bodge the offending callers for this problem.
> 
> And/or maybe extend the softlockup timeout when crazy debug options are
> selected.  You're the only person who this will hurt ;)

Two minutes is too little, but I'm hitting (unrelated) things like the
lru_add_drain_all() hang even with a 20-minute timeout. At some point the
run just stops fuzzing and turns into an attempt to deal with freeing large
chunks of memory :/


Thanks,
Sasha


* Re: [PATCH 1/2] mm: free large amount of 0-order pages in workqueue
From: Sasha Levin @ 2015-04-25 21:51 UTC
  To: Andrew Morton
  Cc: linux-kernel, mhocko, Mel Gorman, Vlastimil Babka,
	Johannes Weiner, David Rientjes, Joonsoo Kim,
	open list:MEMORY MANAGEMENT

On 03/31/2015 06:54 PM, Andrew Morton wrote:
>> [ 2896.340953] trinity-c6      R  running task    27040  6561   9144 0x10000006
>> [ 2896.342673]  ffff8802e72576a8 ffff8802e7257758 ffffffffabfdd628 003c5e36ef1674fa
>> [ 2896.344267]  ffff8801533e1588 ffff8801533e1560 ffff8802d3963778 ffff8802ad220000
>> [ 2896.345824]  ffff8802d3963000 0000000000000000 ffff8802e7250000 ffffed005ce4a002
>> [ 2896.347286] Call Trace:
>> [ 2896.347784] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42)
>> [ 2896.348977] preempt_schedule_common (./arch/x86/include/asm/preempt.h:77 (discriminator 1) kernel/sched/core.c:2867 (discriminator 1))
>> [ 2896.350279] preempt_schedule (kernel/sched/core.c:2893)
>> [ 2896.351349] ___preempt_schedule (arch/x86/lib/thunk_64.S:51)
>> [ 2896.353782] __debug_check_no_obj_freed (lib/debugobjects.c:713)
>> [ 2896.360001] debug_check_no_obj_freed (lib/debugobjects.c:727)
>> [ 2896.361574] free_pages_prepare (mm/page_alloc.c:823)
>> [ 2896.362657] free_hot_cold_page (mm/page_alloc.c:1550)
>> [ 2896.363735] free_hot_cold_page_list (mm/page_alloc.c:1596 (discriminator 3))
>> [ 2896.364846] release_pages (mm/swap.c:935)
>> [ 2896.367979] __pagevec_release (include/linux/pagevec.h:44 mm/swap.c:1013)
>> [ 2896.369149] shmem_undo_range (include/linux/pagevec.h:69 mm/shmem.c:446)
>> [ 2896.377070] shmem_truncate_range (mm/shmem.c:541)
>> [ 2896.378450] shmem_setattr (mm/shmem.c:577)
>> [ 2896.379556] notify_change (fs/attr.c:270)
>> [ 2896.382804] do_truncate (fs/open.c:62)
>> [ 2896.387739] do_sys_ftruncate.constprop.4 (fs/open.c:191)
>> [ 2896.389450] SyS_ftruncate (fs/open.c:199)
>> [ 2896.390879] tracesys_phase2 (arch/x86/kernel/entry_64.S:340)
> OK, so shmem_undo_range() is full of cond_resched()s but it's holding
> i_mutex for too long.  Hugh, fix your junk!

Ping on this one? It's causing lockups on all kernels...


Thanks,
Sasha

