* [patch] mm: skip rebalance of hopeless zones
@ 2010-12-08 15:16 Johannes Weiner
  2010-12-08 18:05 ` Rik van Riel
                   ` (4 more replies)
  0 siblings, 5 replies; 35+ messages in thread
From: Johannes Weiner @ 2010-12-08 15:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, linux-mm

Kswapd tries to rebalance zones persistently until their high
watermarks are restored.

If the amount of unreclaimable pages in a zone makes this impossible
for reclaim, though, kswapd will end up in a busy loop without a
chance of reaching its goal.

This behaviour was observed on a virtual machine with a tiny
Normal-zone that filled up with unreclaimable slab objects.

This patch makes kswapd skip rebalancing on such 'hopeless' zones and
leaves them to direct reclaim.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/mmzone.h |    2 ++
 mm/page_alloc.c        |    4 ++--
 mm/vmscan.c            |   36 ++++++++++++++++++++++++++++--------
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4890662..0cc1d63 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -655,6 +655,8 @@ typedef struct pglist_data {
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
+bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+			 int classzone_idx, int alloc_flags, long free_pages);
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1845a97..c7d2b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1458,8 +1458,8 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags, long free_pages)
+bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+			 int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 42a4859..5623f36 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+static bool zone_needs_scan(struct zone *zone, int order,
+			    unsigned long goal, int classzone_idx)
+{
+	unsigned long free, prospect;
+
+	free = zone_page_state(zone, NR_FREE_PAGES);
+	if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
+		free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+	if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
+		return false;
+	/*
+	 * Ensure that the watermark is at all restorable through
+	 * reclaim.  Otherwise, leave the zone to direct reclaim.
+	 */
+	prospect = free + zone_reclaimable_pages(zone);
+	return prospect >= goal;
+}
+
 /* is kswapd sleeping prematurely? */
 static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 {
@@ -2210,8 +2229,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 		if (zone->all_unreclaimable)
 			continue;
 
-		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-								0, 0))
+		if (zone_needs_scan(zone, order, high_wmark_pages(zone), 0))
 			return 1;
 	}
 
@@ -2282,6 +2300,7 @@ loop_again:
 		 */
 		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
 			struct zone *zone = pgdat->node_zones + i;
+			unsigned long goal;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2297,8 +2316,8 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok_safe(zone, order,
-					high_wmark_pages(zone), 0, 0)) {
+			goal = high_wmark_pages(zone);
+			if (zone_needs_scan(zone, order, goal, 0)) {
 				end_zone = i;
 				break;
 			}
@@ -2323,6 +2342,7 @@ loop_again:
 		 */
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
+			unsigned long goal;
 			int nr_slab;
 
 			if (!populated_zone(zone))
@@ -2339,12 +2359,13 @@ loop_again:
 			 */
 			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
 
+			goal = high_wmark_pages(zone);
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
 			if (!zone_watermark_ok_safe(zone, order,
-					8*high_wmark_pages(zone), end_zone, 0))
+						    8 * goal, end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2373,8 +2394,7 @@ loop_again:
 				compact_zone_order(zone, sc.order, sc.gfp_mask,
 							false);
 
-			if (!zone_watermark_ok_safe(zone, order,
-					high_wmark_pages(zone), end_zone, 0)) {
+			if (zone_needs_scan(zone, order, goal, end_zone)) {
 				all_zones_ok = 0;
 				/*
 				 * We are still under min water mark.  This
@@ -2587,7 +2607,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		pgdat->kswapd_max_order = order;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+	if (!zone_needs_scan(zone, order, low_wmark_pages(zone), 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-- 
1.7.3.2


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 15:16 [patch] mm: skip rebalance of hopeless zones Johannes Weiner
@ 2010-12-08 18:05 ` Rik van Riel
  2010-12-08 22:19 ` Andrew Morton
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 35+ messages in thread
From: Rik van Riel @ 2010-12-08 18:05 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Andrew Morton, linux-mm

On 12/08/2010 10:16 AM, Johannes Weiner wrote:
> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
>
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
>
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.
>
> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Reviewed-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 15:16 [patch] mm: skip rebalance of hopeless zones Johannes Weiner
  2010-12-08 18:05 ` Rik van Riel
@ 2010-12-08 22:19 ` Andrew Morton
  2010-12-09  0:04   ` Johannes Weiner
                     ` (2 more replies)
  2010-12-09  0:36 ` Simon Kirby
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 35+ messages in thread
From: Andrew Morton @ 2010-12-08 22:19 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Rik van Riel, linux-mm

On Wed,  8 Dec 2010 16:16:59 +0100
Johannes Weiner <hannes@cmpxchg.org> wrote:

> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
> 
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
> 
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.

Doesn't this mean that vmscan is incorrectly handling its
zone->all_unreclaimable logic?

> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.
> 
> ...
>
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>  }
>  #endif
>  
> +static bool zone_needs_scan(struct zone *zone, int order,
> +			    unsigned long goal, int classzone_idx)
> +{
> +	unsigned long free, prospect;
> +
> +	free = zone_page_state(zone, NR_FREE_PAGES);
> +	if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
> +		free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
> +
> +	if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
> +		return false;
> +	/*
> +	 * Ensure that the watermark is at all restorable through
> +	 * reclaim.  Otherwise, leave the zone to direct reclaim.
> +	 */
> +	prospect = free + zone_reclaimable_pages(zone);
> +	return prospect >= goal;
> +}

presumably in certain cases that's a bit more efficient than doing the
scan and using ->all_unreclaimable.  But the scanner shouldn't have got
stuck!  That's a regression which got added, and I don't think that new
code of this nature was needed to fix that regression.

Did this zone end up with ->all_unreclaimable set?  If so, why was
kswapd stuck in a loop scanning an all-unreclaimable zone?


Also, if I'm understanding the new logic then if the "goal" is 100
pages and zone_reclaimable_pages() says "50 pages potentially
reclaimable" then kswapd won't reclaim *any* pages.  If so, is that
good behaviour?  Should we instead attempt to reclaim some of those 50
pages and then give up?  That sounds like a better strategy if we want
to keep (say) network Rx happening in a tight memory situation.
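
To make that concrete, a worked trace of the zone_needs_scan() logic
quoted above, with those numbers (the values are hypothetical):

	unsigned long goal = 100;	/* high watermark to restore */
	unsigned long free = 20;	/* say, current NR_FREE_PAGES */
	unsigned long reclaimable = 50;	/* zone_reclaimable_pages() */
	unsigned long prospect;

	/* __zone_watermark_ok() fails: 20 < 100, so fall through */
	prospect = free + reclaimable;		/* 70 */
	/* return prospect >= goal: 70 >= 100 is false */

zone_needs_scan() returns false, so kswapd skips the zone entirely
instead of reclaiming any of the 50 reclaimable pages.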



* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 22:19 ` Andrew Morton
@ 2010-12-09  0:04   ` Johannes Weiner
  2010-12-09 21:17     ` Andrew Morton
  2011-01-04 23:56     ` Andrew Morton
  2010-12-09  0:47   ` Rik van Riel
  2010-12-09 14:34   ` Mel Gorman
  2 siblings, 2 replies; 35+ messages in thread
From: Johannes Weiner @ 2010-12-09  0:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, linux-mm

On Wed, Dec 08, 2010 at 02:19:09PM -0800, Andrew Morton wrote:
> On Wed,  8 Dec 2010 16:16:59 +0100
> Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> > Kswapd tries to rebalance zones persistently until their high
> > watermarks are restored.
> > 
> > If the amount of unreclaimable pages in a zone makes this impossible
> > for reclaim, though, kswapd will end up in a busy loop without a
> > chance of reaching its goal.
> > 
> > This behaviour was observed on a virtual machine with a tiny
> > Normal-zone that filled up with unreclaimable slab objects.
> 
> Doesn't this mean that vmscan is incorrectly handling its
> zone->all_unreclaimable logic?

I don't think so.  What leads to the problem is that we only declare a
zone unreclaimable after a lot of work, but reset it with a single
page that gets released back to the allocator (past the pcp queue,
that is).

That's probably a good idea per se: we don't want to leave a zone
behind for good, but rather retry it eagerly when pages are freed up.
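
For reference, the asymmetry looks roughly like this (paraphrased from
mm/vmscan.c and mm/page_alloc.c of this era; exact details may differ):

	/* balance_pgdat(): set only after scanning six times the
	 * reclaimable pages without restoring the watermark */
	if (zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
		zone->all_unreclaimable = 1;

	/* free_one_page() / free_pcppages_bulk(): cleared again as
	 * soon as a single page goes back to the buddy allocator */
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;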

> presumably in certain cases that's a bit more efficient than doing the
> scan and using ->all_unreclaimable.  But the scanner shouldn't have got
> stuck!  That's a regression which got added, and I don't think that new
> code of this nature was needed to fix that regression.

I'll dig through the history.  But we observed this on a very odd
configuration (24MB ZONE_NORMAL), maybe this was never hit before?

> Did this zone end up with ->all_unreclaimable set?  If so, why was
> kswapd stuck in a loop scanning an all-unreclaimable zone?

It wasn't.  This state is just not very sticky.  After all, the zone
is not all_unreclaimable, just not reclaimable enough to restore the
high watermark.  But the remaining reclaimable pages of that zone may
very well be in constant flux.

> Also, if I'm understanding the new logic then if the "goal" is 100
> pages and zone_reclaimable_pages() says "50 pages potentially
> reclaimable" then kswapd won't reclaim *any* pages.  If so, is that
> good behaviour?  Should we instead attempt to reclaim some of those 50
> pages and then give up?  That sounds like a better strategy if we want
> to keep (say) network Rx happening in a tight memory situation.

Yes, that is probably a good idea.  I'll see that this is improved for
atomic allocators.


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 15:16 [patch] mm: skip rebalance of hopeless zones Johannes Weiner
  2010-12-08 18:05 ` Rik van Riel
  2010-12-08 22:19 ` Andrew Morton
@ 2010-12-09  0:36 ` Simon Kirby
  2010-12-09  0:49   ` Rik van Riel
  2010-12-09  1:23   ` Andrew Morton
  2010-12-09  1:29 ` Minchan Kim
  2010-12-09 18:51 ` Ying Han
  4 siblings, 2 replies; 35+ messages in thread
From: Simon Kirby @ 2010-12-09  0:36 UTC (permalink / raw)
  To: Johannes Weiner, Mel Gorman; +Cc: Andrew Morton, Rik van Riel, linux-mm

On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:

> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
> 
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
> 
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.
> 
> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.

Hi!

We are experiencing a similar issue, though with a 757 MB Normal zone,
where kswapd tries to rebalance Normal after an order-3 allocation while
page cache allocations (order-0) keep splitting it back up again.  It can
run the whole day like this (SSD storage) without sleeping.

Mel Gorman posted a similar patch to yours, but the logic is instead to
consider order>0 balancing sufficient when there are other balanced zones
totalling at least 25% of pages on this node.  This would probably fix
your case as well.

See "Free memory never fully used, swapping" thread, and "[PATCH 0/5]
Prevent kswapd dumping excessive amounts of memory in response to
high-order allocations V2", and finally "Stop high-order balancing when
any suitable zone is balanced".

It probably makes sense to merge one of these patches, or sort out the
good parts of each.  I'm not sure if your patch alone would solve our
case with a significantly bigger Normal zone but where most pages are
still reclaimable...

On the other hand, I still think it's weird that kswapd can fight with
allocations.  It seems like something should hold the free pages while
balancing is happening to avoid them being split right back up again by
lower-order allocations.

Simon-

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/mmzone.h |    2 ++
>  mm/page_alloc.c        |    4 ++--
>  mm/vmscan.c            |   36 ++++++++++++++++++++++++++++--------
>  3 files changed, 32 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 4890662..0cc1d63 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -655,6 +655,8 @@ typedef struct pglist_data {
>  extern struct mutex zonelists_mutex;
>  void build_all_zonelists(void *data);
>  void wakeup_kswapd(struct zone *zone, int order);
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +			 int classzone_idx, int alloc_flags, long free_pages);
>  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  		int classzone_idx, int alloc_flags);
>  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1845a97..c7d2b28 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1458,8 +1458,8 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
>   * Return true if free pages are above 'mark'. This takes into account the order
>   * of the allocation.
>   */
> -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> -		      int classzone_idx, int alloc_flags, long free_pages)
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +			 int classzone_idx, int alloc_flags, long free_pages)
>  {
>  	/* free_pages my go negative - that's OK */
>  	long min = mark;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 42a4859..5623f36 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>  }
>  #endif
>  
> +static bool zone_needs_scan(struct zone *zone, int order,
> +			    unsigned long goal, int classzone_idx)
> +{
> +	unsigned long free, prospect;
> +
> +	free = zone_page_state(zone, NR_FREE_PAGES);
> +	if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
> +		free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
> +
> +	if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
> +		return false;
> +	/*
> +	 * Ensure that the watermark is at all restorable through
> +	 * reclaim.  Otherwise, leave the zone to direct reclaim.
> +	 */
> +	prospect = free + zone_reclaimable_pages(zone);
> +	return prospect >= goal;
> +}
> +
>  /* is kswapd sleeping prematurely? */
>  static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>  {
> @@ -2210,8 +2229,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>  		if (zone->all_unreclaimable)
>  			continue;
>  
> -		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
> -								0, 0))
> +		if (zone_needs_scan(zone, order, high_wmark_pages(zone), 0))
>  			return 1;
>  	}
>  
> @@ -2282,6 +2300,7 @@ loop_again:
>  		 */
>  		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
>  			struct zone *zone = pgdat->node_zones + i;
> +			unsigned long goal;
>  
>  			if (!populated_zone(zone))
>  				continue;
> @@ -2297,8 +2316,8 @@ loop_again:
>  				shrink_active_list(SWAP_CLUSTER_MAX, zone,
>  							&sc, priority, 0);
>  
> -			if (!zone_watermark_ok_safe(zone, order,
> -					high_wmark_pages(zone), 0, 0)) {
> +			goal = high_wmark_pages(zone);
> +			if (zone_needs_scan(zone, order, goal, 0)) {
>  				end_zone = i;
>  				break;
>  			}
> @@ -2323,6 +2342,7 @@ loop_again:
>  		 */
>  		for (i = 0; i <= end_zone; i++) {
>  			struct zone *zone = pgdat->node_zones + i;
> +			unsigned long goal;
>  			int nr_slab;
>  
>  			if (!populated_zone(zone))
> @@ -2339,12 +2359,13 @@ loop_again:
>  			 */
>  			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
>  
> +			goal = high_wmark_pages(zone);
>  			/*
>  			 * We put equal pressure on every zone, unless one
>  			 * zone has way too many pages free already.
>  			 */
>  			if (!zone_watermark_ok_safe(zone, order,
> -					8*high_wmark_pages(zone), end_zone, 0))
> +						    8 * goal, end_zone, 0))
>  				shrink_zone(priority, zone, &sc);
>  			reclaim_state->reclaimed_slab = 0;
>  			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> @@ -2373,8 +2394,7 @@ loop_again:
>  				compact_zone_order(zone, sc.order, sc.gfp_mask,
>  							false);
>  
> -			if (!zone_watermark_ok_safe(zone, order,
> -					high_wmark_pages(zone), end_zone, 0)) {
> +			if (zone_needs_scan(zone, order, goal, end_zone)) {
>  				all_zones_ok = 0;
>  				/*
>  				 * We are still under min water mark.  This
> @@ -2587,7 +2607,7 @@ void wakeup_kswapd(struct zone *zone, int order)
>  		pgdat->kswapd_max_order = order;
>  	if (!waitqueue_active(&pgdat->kswapd_wait))
>  		return;
> -	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
> +	if (!zone_needs_scan(zone, order, low_wmark_pages(zone), 0))
>  		return;
>  
>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
> -- 
> 1.7.3.2
> 


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 22:19 ` Andrew Morton
  2010-12-09  0:04   ` Johannes Weiner
@ 2010-12-09  0:47   ` Rik van Riel
  2010-12-09 14:34   ` Mel Gorman
  2 siblings, 0 replies; 35+ messages in thread
From: Rik van Riel @ 2010-12-09  0:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Johannes Weiner, linux-mm

On 12/08/2010 05:19 PM, Andrew Morton wrote:

> presumably in certain cases that's a bit more efficient than doing the
> scan and using ->all_unreclaimable.  But the scanner shouldn't have got
> stuck!  That's a regression which got added, and I don't think that new
> code of this nature was needed to fix that regression.
>
> Did this zone end up with ->all_unreclaimable set?  If so, why was
> kswapd stuck in a loop scanning an all-unreclaimable zone?

IIRC kswapd does not get stuck, but the page allocator
keeps waking it up. That also results in near 100% CPU use.

> Also, if I'm understanding the new logic then if the "goal" is 100
> pages and zone_reclaimable_pages() says "50 pages potentially
> reclaimable" then kswapd won't reclaim *any* pages.  If so, is that
> good behaviour?  Should we instead attempt to reclaim some of those 50
> pages and then give up?  That sounds like a better strategy if we want
> to keep (say) network Rx happening in a tight memory situation.

Actually, given the number of reports on how the VM keeps
trying too hard and the system stalls for minutes before an
OOM kill happens, giving up earlier is probably the right
thing to do.

-- 
All rights reversed


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  0:36 ` Simon Kirby
@ 2010-12-09  0:49   ` Rik van Riel
  2010-12-09  1:08     ` Simon Kirby
  2010-12-09  1:23   ` Andrew Morton
  1 sibling, 1 reply; 35+ messages in thread
From: Rik van Riel @ 2010-12-09  0:49 UTC (permalink / raw)
  To: Simon Kirby; +Cc: Johannes Weiner, Mel Gorman, Andrew Morton, linux-mm

On 12/08/2010 07:36 PM, Simon Kirby wrote:

> Mel Gorman posted a similar patch to yours, but the logic is instead to
> consider order>0 balancing sufficient when there are other balanced zones
> totalling at least 25% of pages on this node.  This would probably fix
> your case as well.

Mel's patch addresses something very different and is unlikely
to fix the problem this patch addresses.

-- 
All rights reversed


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  0:49   ` Rik van Riel
@ 2010-12-09  1:08     ` Simon Kirby
  2010-12-09 14:42       ` Mel Gorman
  0 siblings, 1 reply; 35+ messages in thread
From: Simon Kirby @ 2010-12-09  1:08 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Johannes Weiner, Mel Gorman, Andrew Morton, linux-mm

On Wed, Dec 08, 2010 at 07:49:03PM -0500, Rik van Riel wrote:

> On 12/08/2010 07:36 PM, Simon Kirby wrote:
>
>> Mel Gorman posted a similar patch to yours, but the logic is instead to
>> consider order>0 balancing sufficient when there are other balanced zones
>> totalling at least 25% of pages on this node.  This would probably fix
>> your case as well.
>
> Mel's patch addresses something very different and is unlikely
> to fix the problem this patch addresses.

Ok, I see they're quite separate.

Johannes' patch solves the problem of trying to balance a tiny Normal
zone which happens to be full of unclaimable slab pages by giving up in
this hopeless case, regardless of order.

Mel's patch solves the problem of fighting allocations causing an
order>0 imbalance in the small Normal zone which happens to be full of
reclaimable pages by giving up in this not-worth-bothering case.

The key difference is that Johannes' patch has no condition on order, so
Mel's patch probably would help (though not for the intended reasons) in
the order != 0 case, and probably not in the order=0 case.

Simon-


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  0:36 ` Simon Kirby
  2010-12-09  0:49   ` Rik van Riel
@ 2010-12-09  1:23   ` Andrew Morton
  2010-12-09  1:55     ` Minchan Kim
                       ` (4 more replies)
  1 sibling, 5 replies; 35+ messages in thread
From: Andrew Morton @ 2010-12-09  1:23 UTC (permalink / raw)
  To: Simon Kirby; +Cc: Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:

> On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
> 
> > Kswapd tries to rebalance zones persistently until their high
> > watermarks are restored.
> > 
> > If the amount of unreclaimable pages in a zone makes this impossible
> > for reclaim, though, kswapd will end up in a busy loop without a
> > chance of reaching its goal.
> > 
> > This behaviour was observed on a virtual machine with a tiny
> > Normal-zone that filled up with unreclaimable slab objects.
> > 
> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> > leaves them to direct reclaim.
> 
> Hi!
> 
> We are experiencing a similar issue, though with a 757 MB Normal zone,
> where kswapd tries to rebalance Normal after an order-3 allocation while
> page cache allocations (order-0) keep splitting it back up again.  It can
> run the whole day like this (SSD storage) without sleeping.

People at google have told me they've seen the same thing.  A fork is
taking 15 minutes when someone else is doing a dd, because the fork
enters direct-reclaim trying for an order-one page.  It successfully
frees some order-one pages but before it gets back to allocate one, dd
has gone and stolen them, or split them apart.

This problem would have got worse when slub came along doing its stupid
unnecessary high-order allocations.

Billions of years ago a direct-reclaimer had a one-deep cache in the
task_struct into which it freed the page to prevent it from getting
stolen.

Later, we took that out because pages were being freed into the
per-cpu-pages magazine, which is effectively task-local anyway.  But
per-cpu-pages are only for order-0 pages.  See slub stupidity, above.

I expect that this is happening so repeatably because the
direct-reclaimer is doing a sleep somewhere after freeing the pages it
needs - if it wasn't doing that then surely the window wouldn't be wide
enough for it to happen so often.  But I didn't look.

Suitable fixes might be

a) don't go to sleep after the successful direct-reclaim.

b) reinstate the one-deep task-local free page cache.
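
For (b), a minimal sketch of how it could look (the field and helper
below are hypothetical, not the historical code):

	/* hypothetical: task_struct grows a one-deep private stash,
	 *	struct page *local_free_page;
	 * which direct reclaim fills instead of freeing to the buddy
	 * lists, where dd could steal or split the page.  A real
	 * version would have to record the order as well. */
	static void stash_reclaimed_page(struct page *page, int order)
	{
		if (order > 0 && !current->local_free_page)
			current->local_free_page = page;
		else
			__free_pages(page, order);
	}

	/* ...and the allocation path would check the stash before
	 * going to the freelists. */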


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 15:16 [patch] mm: skip rebalance of hopeless zones Johannes Weiner
                   ` (2 preceding siblings ...)
  2010-12-09  0:36 ` Simon Kirby
@ 2010-12-09  1:29 ` Minchan Kim
  2010-12-09 18:51 ` Ying Han
  4 siblings, 0 replies; 35+ messages in thread
From: Minchan Kim @ 2010-12-09  1:29 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Andrew Morton, Rik van Riel, linux-mm

On Thu, Dec 9, 2010 at 12:16 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
>
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
>
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.
>
> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>

I like this. It makes code more readable as well as solving the problem.

Just a nitpick, somewhat off-topic.

Shouldn't we consider NR_SLAB_RECLAIMABLE in zone_reclaimable_pages()?
We already consider it when we calculate the number of free pages in some
places (__vm_enough_memory, minimum_image_size), but it is hard to make
sure we can really reclaim it.  That should be mitigated by Nick's
per-zone slab shrinker, though.

Maybe material for another patch.
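
For reference, zone_reclaimable_pages() counts only LRU pages today.
Counting reclaimable slab might look like this (only the
NR_SLAB_RECLAIMABLE line is the hypothetical addition; the rest mirrors
the current function):

	unsigned long zone_reclaimable_pages(struct zone *zone)
	{
		long nr;

		nr = zone_page_state(zone, NR_ACTIVE_FILE) +
		     zone_page_state(zone, NR_INACTIVE_FILE);

		if (nr_swap_pages > 0)
			nr += zone_page_state(zone, NR_ACTIVE_ANON) +
			      zone_page_state(zone, NR_INACTIVE_ANON);

		/* hypothetical: count reclaimable slab, too */
		nr += zone_page_state(zone, NR_SLAB_RECLAIMABLE);

		return nr;
	}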
-- 
Kind regards,
Minchan Kim


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:23   ` Andrew Morton
@ 2010-12-09  1:55     ` Minchan Kim
  2010-12-09  1:57       ` Minchan Kim
  2010-12-09  2:01       ` Andrew Morton
  2010-12-09  2:05     ` Simon Kirby
                       ` (3 subsequent siblings)
  4 siblings, 2 replies; 35+ messages in thread
From: Minchan Kim @ 2010-12-09  1:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

On Thu, Dec 9, 2010 at 10:23 AM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
>
>> On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
>>
>> > Kswapd tries to rebalance zones persistently until their high
>> > watermarks are restored.
>> >
>> > If the amount of unreclaimable pages in a zone makes this impossible
>> > for reclaim, though, kswapd will end up in a busy loop without a
>> > chance of reaching its goal.
>> >
>> > This behaviour was observed on a virtual machine with a tiny
>> > Normal-zone that filled up with unreclaimable slab objects.
>> >
>> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
>> > leaves them to direct reclaim.
>>
>> Hi!
>>
>> We are experiencing a similar issue, though with a 757 MB Normal zone,
>> where kswapd tries to rebalance Normal after an order-3 allocation while
>> page cache allocations (order-0) keep splitting it back up again.  It can
>> run the whole day like this (SSD storage) without sleeping.
>
> People at google have told me they've seen the same thing.  A fork is
> taking 15 minutes when someone else is doing a dd, because the fork
> enters direct-reclaim trying for an order-one page.  It successfully
> frees some order-one pages but before it gets back to allocate one, dd
> has gone and stolen them, or split them apart.
>
> This problem would have got worse when slub came along doing its stupid
> unnecessary high-order allocations.
>
> Billions of years ago a direct-reclaimer had a one-deep cache in the
> task_struct into which it freed the page to prevent it from getting
> stolen.
>
> Later, we took that out because pages were being freed into the
> per-cpu-pages magazine, which is effectively task-local anyway.  But
> per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
>
> I expect that this is happening so repeatably because the
> direct-reclaimer is dong a sleep somewhere after freeing the pages it
> needs - if it wasn't doing that then surely the window wouldn't be wide
> enough for it to happen so often.  But I didn't look.
>
> Suitable fixes might be
>
> a) don't go to sleep after the successful direct-reclaim.

It can't guarantee success, since direct reclaim needs to sleep with !GFP_ATOMIC.


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:55     ` Minchan Kim
@ 2010-12-09  1:57       ` Minchan Kim
  2010-12-09  2:01       ` Andrew Morton
  1 sibling, 0 replies; 35+ messages in thread
From: Minchan Kim @ 2010-12-09  1:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

Where is my code?
Resending.

On Thu, Dec 9, 2010 at 10:55 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> On Thu, Dec 9, 2010 at 10:23 AM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
>> On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
>>
>>> On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
>>>
>>> > Kswapd tries to rebalance zones persistently until their high
>>> > watermarks are restored.
>>> >
>>> > If the amount of unreclaimable pages in a zone makes this impossible
>>> > for reclaim, though, kswapd will end up in a busy loop without a
>>> > chance of reaching its goal.
>>> >
>>> > This behaviour was observed on a virtual machine with a tiny
>>> > Normal-zone that filled up with unreclaimable slab objects.
>>> >
>>> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
>>> > leaves them to direct reclaim.
>>>
>>> Hi!
>>>
>>> We are experiencing a similar issue, though with a 757 MB Normal zone,
>>> where kswapd tries to rebalance Normal after an order-3 allocation while
>>> page cache allocations (order-0) keep splitting it back up again.  It can
>>> run the whole day like this (SSD storage) without sleeping.
>>
>> People at google have told me they've seen the same thing.  A fork is
>> taking 15 minutes when someone else is doing a dd, because the fork
>> enters direct-reclaim trying for an order-one page.  It successfully
>> frees some order-one pages but before it gets back to allocate one, dd
>> has gone and stolen them, or split them apart.
>>
>> This problem would have got worse when slub came along doing its stupid
>> unnecessary high-order allocations.
>>
>> Billions of years ago a direct-reclaimer had a one-deep cache in the
>> task_struct into which it freed the page to prevent it from getting
>> stolen.
>>
>> Later, we took that out because pages were being freed into the
>> per-cpu-pages magazine, which is effectively task-local anyway.  But
>> per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
>>
>> I expect that this is happening so repeatably because the
>> direct-reclaimer is dong a sleep somewhere after freeing the pages it
>> needs - if it wasn't doing that then surely the window wouldn't be wide
>> enough for it to happen so often.  But I didn't look.
>>
>> Suitable fixes might be
>>
>> a) don't go to sleep after the successful direct-reclaim.
>
> It can't guarantee success, since direct reclaim needs to sleep with !GFP_ATOMIC.
>
>>
>> b) reinstate the one-deep task-local free page cache.
>
> I like b) so how about this?
> Just for the concept.
>
> @@ -1880,7 +1881,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
> unsigned int order,
>        reclaim_state.reclaimed_slab = 0;
>        p->reclaim_state = &reclaim_state;
>
> -       *did_some_progress = try_to_free_pages(zonelist, order,
> gfp_mask, nodemask);
> +       *did_some_progress = try_to_free_pages(zonelist, order,
> gfp_mask, nodemask, &ret_pages);
>
>        p->reclaim_state = NULL;
>        lockdep_clear_current_reclaim_state();
> @@ -1892,10 +1893,11 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
> unsigned int order,
>                return NULL;
>
>  retry:
> -       page = get_page_from_freelist(gfp_mask, nodemask, order,
> -                                       zonelist, high_zoneidx,
> -                                       alloc_flags, preferred_zone,
> -                                       migratetype);
> +       if (!list_empty(&ret_pages)) {
> +               page = lru_to_page(&ret_pages);
> +               list_del(&page->lru);
> +               free_page_list(&ret_pages);
> +       }
>
>        /*
>         * If an allocation failed after direct reclaim, it could be because
>
> --
> Kind regards,
> Minchan Kim
>



-- 
Kind regards,
Minchan Kim


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:55     ` Minchan Kim
  2010-12-09  1:57       ` Minchan Kim
@ 2010-12-09  2:01       ` Andrew Morton
  2010-12-09  2:19         ` Minchan Kim
  2010-12-09  5:18         ` Minchan Kim
  1 sibling, 2 replies; 35+ messages in thread
From: Andrew Morton @ 2010-12-09  2:01 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

On Thu, 9 Dec 2010 10:55:24 +0900 Minchan Kim <minchan.kim@gmail.com> wrote:

> >> > leaves them to direct reclaim.
> >>
> >> Hi!
> >>
> >> We are experiencing a similar issue, though with a 757 MB Normal zone,
> >> where kswapd tries to rebalance Normal after an order-3 allocation while
> >> page cache allocations (order-0) keep splitting it back up again.  It can
> >> run the whole day like this (SSD storage) without sleeping.
> >
> > People at google have told me they've seen the same thing.  A fork is
> > taking 15 minutes when someone else is doing a dd, because the fork
> > enters direct-reclaim trying for an order-one page.  It successfully
> > frees some order-one pages but before it gets back to allocate one, dd
> > has gone and stolen them, or split them apart.
> >
> > This problem would have got worse when slub came along doing its stupid
> > unnecessary high-order allocations.
> >
> > Billions of years ago a direct-reclaimer had a one-deep cache in the
> > task_struct into which it freed the page to prevent it from getting
> > stolen.
> >
> > Later, we took that out because pages were being freed into the
> > per-cpu-pages magazine, which is effectively task-local anyway.  But
> > per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
> >
> > I expect that this is happening so repeatably because the
> > direct-reclaimer is doing a sleep somewhere after freeing the pages it
> > needs - if it wasn't doing that then surely the window wouldn't be wide
> > enough for it to happen so often.  But I didn't look.
> >
> > Suitable fixes might be
> >
> > a) don't go to sleep after the successful direct-reclaim.
> 
> It can't guarantee success, since direct reclaim needs to sleep with !GFP_ATOMIC.

It doesn't necessarily need to sleep *after* successfully freeing
pages.  If it needs to sleep then do it before or during the freeing.

> >
> > b) reinstate the one-deep task-local free page cache.
> 
> I like b) so how about this?
> Just for the concept.
> 
> @@ -1880,7 +1881,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
> unsigned int order,
>         reclaim_state.reclaimed_slab = 0;
>         p->reclaim_state = &reclaim_state;
> 
> -       *did_some_progress = try_to_free_pages(zonelist, order,
> gfp_mask, nodemask);
> +       *did_some_progress = try_to_free_pages(zonelist, order,
> gfp_mask, nodemask, &ret_pages);
> 
>         p->reclaim_state = NULL;
>         lockdep_clear_current_reclaim_state();
> @@ -1892,10 +1893,11 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
> unsigned int order,
>                 return NULL;
> 
>  retry:
> -       page = get_page_from_freelist(gfp_mask, nodemask, order,
> -                                       zonelist, high_zoneidx,
> -                                       alloc_flags, preferred_zone,
> -                                       migratetype);
> +       if (!list_empty(&ret_pages)) {
> +               page = lru_to_page(&ret_pages);
> +               list_del(&page->lru);
> +               free_page_list(&ret_pages);
> +       }

Maybe.  Or just pass a page*.


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:23   ` Andrew Morton
  2010-12-09  1:55     ` Minchan Kim
@ 2010-12-09  2:05     ` Simon Kirby
  2010-12-09  8:55     ` Pekka Enberg
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 35+ messages in thread
From: Simon Kirby @ 2010-12-09  2:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

On Wed, Dec 08, 2010 at 05:23:24PM -0800, Andrew Morton wrote:

> People at google have told me they've seen the same thing.  A fork is
> taking 15 minutes when someone else is doing a dd, because the fork
> enters direct-reclaim trying for an order-one page.  It successfully
> frees some order-one pages but before it gets back to allocate one, dd
> has gone and stolen them, or split them apart.
> 
> This problem would have got worse when slub came along doing its stupid
> unnecessary high-order allocations.

Yeah, we can all blame slub, but even when I force everything to be
order-0 except task_struct and kmalloc(>4096), I still see problems, even
if they aren't as obvious.

Until reclaim can hold on to a page it is about to turn into an order-1
page, or hold all of the freed pages until the watermark is reached
(including the allocation it may be directly reclaiming for), this
operation is always going to be unfair so long as kswapd can run while
other allocations are happening.

Let me guess, Linus will say RCU fixes this.. ;)

Simon-


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  2:01       ` Andrew Morton
@ 2010-12-09  2:19         ` Minchan Kim
  2010-12-09  5:18         ` Minchan Kim
  1 sibling, 0 replies; 35+ messages in thread
From: Minchan Kim @ 2010-12-09  2:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

On Thu, Dec 9, 2010 at 11:01 AM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Thu, 9 Dec 2010 10:55:24 +0900 Minchan Kim <minchan.kim@gmail.com> wrote:
>
>> >> > leaves them to direct reclaim.
>> >>
>> >> Hi!
>> >>
>> >> We are experiencing a similar issue, though with a 757 MB Normal zone,
>> >> where kswapd tries to rebalance Normal after an order-3 allocation while
> >> page cache allocations (order-0) keep splitting it back up again.  It can
>> >> run the whole day like this (SSD storage) without sleeping.
>> >
> > People at google have told me they've seen the same thing.  A fork is
> > taking 15 minutes when someone else is doing a dd, because the fork
> > enters direct-reclaim trying for an order-one page.  It successfully
>> > frees some order-one pages but before it gets back to allocate one, dd
>> > has gone and stolen them, or split them apart.
>> >
>> > This problem would have got worse when slub came along doing its stupid
>> > unnecessary high-order allocations.
>> >
>> > Billions of years ago a direct-reclaimer had a one-deep cache in the
>> > task_struct into which it freed the page to prevent it from getting
>> > stolen.
>> >
>> > Later, we took that out because pages were being freed into the
> > per-cpu-pages magazine, which is effectively task-local anyway.  But
> > per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
>> >
> > I expect that this is happening so repeatably because the
> > direct-reclaimer is doing a sleep somewhere after freeing the pages it
> > needs - if it wasn't doing that then surely the window wouldn't be wide
> > enough for it to happen so often.  But I didn't look.
>> >
>> > Suitable fixes might be
>> >
>> > a) don't go to sleep after the successful direct-reclaim.
>>
>> It can't guarantee success, since direct reclaim needs to sleep with !GFP_ATOMIC.
>
> It doesn't necessarily need to sleep *after* successfully freeing
> pages.  If it needs to sleep then do it before or during the freeing.

Okay.  Another point is the following:

do_try_to_free_pages
  shrink_zones
  shrink_slab
  wait_iff_congested

If shrink_zones can't reclaim 32 pages at once, it can go to sleep
and then can't guarantee a successful allocation.
I think it would be better to choose b) rather than a), which may
cause complications.

>
>> >
>> > b) reinstate the one-deep task-local free page cache.
>>
>> I like b) so how about this?
>> Just for the concept.
>>
>> @@ -1880,7 +1881,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
>> unsigned int order,
>>         reclaim_state.reclaimed_slab = 0;
>>         p->reclaim_state = &reclaim_state;
>>
>> -       *did_some_progress = try_to_free_pages(zonelist, order,
>> gfp_mask, nodemask);
>> +       *did_some_progress = try_to_free_pages(zonelist, order,
>> gfp_mask, nodemask, &ret_pages);
>>
>>         p->reclaim_state = NULL;
>>         lockdep_clear_current_reclaim_state();
>> @@ -1892,10 +1893,11 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
>> unsigned int order,
>>                 return NULL;
>>
>>  retry:
>> -       page = get_page_from_freelist(gfp_mask, nodemask, order,
>> -                                       zonelist, high_zoneidx,
>> -                                       alloc_flags, preferred_zone,
>> -                                       migratetype);
>> +       if (!list_empty(&ret_pages)) {
>> +               page = lru_to_page(&ret_pages);
>> +               list_del(&page->lru);
>> +               free_page_list(&ret_pages);
>> +       }
>
> Maybe.  Or just pass a page*.
>

Absolutely.


-- 
Kind regards,
Minchan Kim


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  2:01       ` Andrew Morton
  2010-12-09  2:19         ` Minchan Kim
@ 2010-12-09  5:18         ` Minchan Kim
  1 sibling, 0 replies; 35+ messages in thread
From: Minchan Kim @ 2010-12-09  5:18 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 3424 bytes --]

On Thu, Dec 9, 2010 at 11:01 AM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Thu, 9 Dec 2010 10:55:24 +0900 Minchan Kim <minchan.kim@gmail.com> wrote:
>
>> >> > leaves them to direct reclaim.
>> >>
>> >> Hi!
>> >>
>> >> We are experiencing a similar issue, though with a 757 MB Normal zone,
>> >> where kswapd tries to rebalance Normal after an order-3 allocation while
> >> page cache allocations (order-0) keep splitting it back up again.  It can
>> >> run the whole day like this (SSD storage) without sleeping.
>> >
> > People at google have told me they've seen the same thing.  A fork is
> > taking 15 minutes when someone else is doing a dd, because the fork
> > enters direct-reclaim trying for an order-one page.  It successfully
>> > frees some order-one pages but before it gets back to allocate one, dd
>> > has gone and stolen them, or split them apart.
>> >
>> > This problem would have got worse when slub came along doing its stupid
>> > unnecessary high-order allocations.
>> >
>> > Billions of years ago a direct-reclaimer had a one-deep cache in the
>> > task_struct into which it freed the page to prevent it from getting
>> > stolen.
>> >
>> > Later, we took that out because pages were being freed into the
> > per-cpu-pages magazine, which is effectively task-local anyway.  But
> > per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
>> >
> > I expect that this is happening so repeatably because the
> > direct-reclaimer is doing a sleep somewhere after freeing the pages it
> > needs - if it wasn't doing that then surely the window wouldn't be wide
> > enough for it to happen so often.  But I didn't look.
>> >
>> > Suitable fixes might be
>> >
>> > a) don't go to sleep after the successful direct-reclaim.
>>
>> It can't guarantee success, since direct reclaim needs to sleep with !GFP_ATOMIC.
>
> It doesn't necessarily need to sleep *after* successfully freeing
> pages.  If it needs to sleep then do it before or during the freeing.
>
>> >
>> > b) reinstate the one-deep task-local free page cache.
>>
>> I like b) so how about this?
>> Just for the concept.
>>
>> @@ -1880,7 +1881,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
>> unsigned int order,
>>         reclaim_state.reclaimed_slab = 0;
>>         p->reclaim_state = &reclaim_state;
>>
>> -       *did_some_progress = try_to_free_pages(zonelist, order,
>> gfp_mask, nodemask);
>> +       *did_some_progress = try_to_free_pages(zonelist, order,
>> gfp_mask, nodemask, &ret_pages);
>>
>>         p->reclaim_state = NULL;
>>         lockdep_clear_current_reclaim_state();
>> @@ -1892,10 +1893,11 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask,
>> unsigned int order,
>>                 return NULL;
>>
>>  retry:
>> -       page = get_page_from_freelist(gfp_mask, nodemask, order,
>> -                                       zonelist, high_zoneidx,
>> -                                       alloc_flags, preferred_zone,
>> -                                       migratetype);
>> +       if (!list_empty(&ret_pages)) {
>> +               page = lru_to_page(&ret_pages);
>> +               list_del(&page->lru);
>> +               free_page_list(&ret_pages);
>> +       }
>
> Maybe.  Or just pass a page*.
>

I did it in more detail, but it's not complete or tested at all.
Please consider it just an RFC.

-- CUT_HERE --


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:23   ` Andrew Morton
  2010-12-09  1:55     ` Minchan Kim
  2010-12-09  2:05     ` Simon Kirby
@ 2010-12-09  8:55     ` Pekka Enberg
  2010-12-09 14:46       ` Mel Gorman
  2010-12-09 14:44     ` Mel Gorman
  2010-12-09 18:39     ` Ying Han
  4 siblings, 1 reply; 35+ messages in thread
From: Pekka Enberg @ 2010-12-09  8:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm,
	Christoph Lameter, David Rientjes, Nick Piggin

On Thu, Dec 9, 2010 at 3:23 AM, Andrew Morton <akpm@linux-foundation.org> wrote:
> This problem would have got worse when slub came along doing its stupid
> unnecessary high-order allocations.

Stupid, maybe, but not unnecessary: they're a performance improvement
on large-CPU systems (needed because of the current SLUB design).  We're
scaling the allocation order based on the number of CPUs, but maybe we
could shrink it even more.
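
The scaling in question, roughly as it stands in mm/slub.c's
calculate_order() (quoted from memory; details may differ):

	min_objects = slub_min_objects;
	if (!min_objects)
		/* more CPUs -> more objects per slab -> higher order */
		min_objects = 4 * (fls(nr_cpu_ids) + 1);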


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 22:19 ` Andrew Morton
  2010-12-09  0:04   ` Johannes Weiner
  2010-12-09  0:47   ` Rik van Riel
@ 2010-12-09 14:34   ` Mel Gorman
  2 siblings, 0 replies; 35+ messages in thread
From: Mel Gorman @ 2010-12-09 14:34 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Johannes Weiner, Rik van Riel, linux-mm

On Wed, Dec 08, 2010 at 02:19:09PM -0800, Andrew Morton wrote:
> On Wed,  8 Dec 2010 16:16:59 +0100
> Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> > Kswapd tries to rebalance zones persistently until their high
> > watermarks are restored.
> > 
> > If the amount of unreclaimable pages in a zone makes this impossible
> > for reclaim, though, kswapd will end up in a busy loop without a
> > chance of reaching its goal.
> > 
> > This behaviour was observed on a virtual machine with a tiny
> > Normal-zone that filled up with unreclaimable slab objects.
> 
> Doesn't this mean that vmscan is incorrectly handling its
> zone->all_unreclaimable logic?
> 

I believe there is, at the very least, a bug in sleeping_prematurely()
that is not handling the zone->all_unreclaimable logic correctly.  I posted a
patch called "mm: kswapd: Treat zone->all_unreclaimable in
sleeping_prematurely similar to balance_pgdat()" as part of a larger
series. Johannes, it'd be nice if you could read that patch and see if
it's related to this bug.

> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> > leaves them to direct reclaim.
> > 
> > ...
> >
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
> >  }
> >  #endif
> >  
> > +static bool zone_needs_scan(struct zone *zone, int order,
> > +			    unsigned long goal, int classzone_idx)
> > +{
> > +	unsigned long free, prospect;
> > +
> > +	free = zone_page_state(zone, NR_FREE_PAGES);
> > +	if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
> > +		free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
> > +
> > +	if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
> > +		return false;
> > +	/*
> > +	 * Ensure that the watermark is at all restorable through
> > +	 * reclaim.  Otherwise, leave the zone to direct reclaim.
> > +	 */
> > +	prospect = free + zone_reclaimable_pages(zone);
> > +	return prospect >= goal;
> > +}
> 
> presumably in certain cases that's a bit more efficient than doing the
> scan and using ->all_unreclaimable.  But the scanner shouldn't have got
> > stuck!  That's a regression which got added, and I don't think that new
> code of this nature was needed to fix that regression.
> 
> Did this zone end up with ->all_unreclaimable set?  If so, why was
> kswapd stuck in a loop scanning an all-unreclaimable zone?
> 

There is a bug where kswapd stays awake when it shouldn't. I've cc'd
you on V3 of a series, "Prevent kswapd dumping excessive amounts of
memory in response to high-order allocations". It has been reported
that V2 of that series fixed a problem where kswapd stayed awake when
it shouldn't have.

> Also, if I'm understanding the new logic then if the "goal" is 100
> pages and zone_reclaimable_pages() says "50 pages potentially
> reclaimable" then kswapd won't reclaim *any* pages.  If so, is that
> good behaviour?  Should we instead attempt to reclaim some of those 50
> pages and then give up?  That sounds like a better strategy if we want
> to keep (say) network Rx happening in a tight memory situation.
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:08     ` Simon Kirby
@ 2010-12-09 14:42       ` Mel Gorman
  0 siblings, 0 replies; 35+ messages in thread
From: Mel Gorman @ 2010-12-09 14:42 UTC (permalink / raw)
  To: Simon Kirby; +Cc: Rik van Riel, Johannes Weiner, Andrew Morton, linux-mm

On Wed, Dec 08, 2010 at 05:08:38PM -0800, Simon Kirby wrote:
> On Wed, Dec 08, 2010 at 07:49:03PM -0500, Rik van Riel wrote:
> 
> > On 12/08/2010 07:36 PM, Simon Kirby wrote:
> >
> >> Mel Gorman posted a similar patch to yours, but the logic is instead to
> >> consider order>0 balancing sufficient when there are other balanced zones
> >> totalling at least 25% of pages on this node.  This would probably fix
> >> your case as well.
> >
> > Mel's patch addresses something very different and is unlikely
> > to fix the problem this patch addresses.
> 
> Ok, I see they're quite separate.
> 
> Johannes' patch solves the problem of trying to balance a tiny Normal
> zone which happens to be full of unreclaimable slab pages by giving up in
> this hopeless case, regardless of order.
> 
> Mel's patch solves the problem of fighting allocations causing an
> order>0 imbalance in the small Normal zone which happens to be full of
> reclaimable pages by giving up in this not-worth-bothering case.
> 
> The key difference is that Johannes' patch has no condition on order, so
> Mel's patch probably would help (though not for the intended reasons) in the
> order != 0 case, and probably not in the order=0 case.
> 

I would be interested in hearing if the patch in that series that alters
how sleeping_prematurely() treats zone->all_unreclaimable makes a
difference though.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:23   ` Andrew Morton
                       ` (2 preceding siblings ...)
  2010-12-09  8:55     ` Pekka Enberg
@ 2010-12-09 14:44     ` Mel Gorman
  2010-12-09 18:03       ` Andrew Morton
  2010-12-09 18:48       ` Ying Han
  2010-12-09 18:39     ` Ying Han
  4 siblings, 2 replies; 35+ messages in thread
From: Mel Gorman @ 2010-12-09 14:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Simon Kirby, Johannes Weiner, Rik van Riel, linux-mm

On Wed, Dec 08, 2010 at 05:23:24PM -0800, Andrew Morton wrote:
> On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
> 
> > On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
> > 
> > > Kswapd tries to rebalance zones persistently until their high
> > > watermarks are restored.
> > > 
> > > If the amount of unreclaimable pages in a zone makes this impossible
> > > for reclaim, though, kswapd will end up in a busy loop without a
> > > chance of reaching its goal.
> > > 
> > > This behaviour was observed on a virtual machine with a tiny
> > > Normal-zone that filled up with unreclaimable slab objects.
> > > 
> > > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> > > leaves them to direct reclaim.
> > 
> > Hi!
> > 
> > We are experiencing a similar issue, though with a 757 MB Normal zone,
> > where kswapd tries to rebalance Normal after an order-3 allocation while
> > page cache allocations (order-0) keep splitting it back up again.  It can
> > run the whole day like this (SSD storage) without sleeping.
> 
> People at google have told me they've seen the same thing.  A fork is
> taking 15 minutes when someone else is doing a dd, because the fork
> enters direct-reclaim trying for an order-one page.  It successfully
> frees some order-one pages but before it gets back to allocate one, dd
> has gone and stolen them, or split them apart.
> 

Is there a known test case for this or should I look at doing a
streaming-IO test with a basic workload constantly forking in the
background to measure the fork latency?

> This problem would have got worse when slub came along doing its stupid
> unnecessary high-order allocations.
> 
> Billions of years ago a direct-reclaimer had a one-deep cache in the
> task_struct into which it freed the page to prevent it from getting
> stolen.
> 
> Later, we took that out because pages were being freed into the
> per-cpu-pages magazine, which is effectively task-local anyway.  But
> per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
> 
> I expect that this is happening so repeatably because the
> direct-reclaimer is doing a sleep somewhere after freeing the pages it
> needs - if it wasn't doing that then surely the window wouldn't be wide
> enough for it to happen so often.  But I didn't look.
> 
> Suitable fixes might be
> 
> a) don't go to sleep after the successful direct-reclaim.
> 

I submitted a patch for this a long time ago, but at the time we
didn't have a test case where it made a difference. It might be worth
revisiting. I can't find the patch any more, but it was fairly
trivial; a rough sketch of the idea follows.
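
A very rough sketch of idea (a), assuming the sleep in question is the
wait in the allocator slow path (names per the 2.6.36-era
__alloc_pages_slowpath(); illustrative only, not the original patch):

        /*
         * After direct reclaim made progress, retry the freelist
         * immediately and only sleep once the retry has failed, so
         * the freed pages are not stolen while we yield the CPU.
         */
        if (did_some_progress) {
                page = get_page_from_freelist(gfp_mask, nodemask, order,
                                zonelist, high_zoneidx, alloc_flags,
                                preferred_zone, migratetype);
                if (page)
                        return page;
        }
        /* no luck - only now consider waiting and retrying the loop */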

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  8:55     ` Pekka Enberg
@ 2010-12-09 14:46       ` Mel Gorman
  0 siblings, 0 replies; 35+ messages in thread
From: Mel Gorman @ 2010-12-09 14:46 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Andrew Morton, Simon Kirby, Johannes Weiner, Rik van Riel,
	linux-mm, Christoph Lameter, David Rientjes, Nick Piggin

On Thu, Dec 09, 2010 at 10:55:10AM +0200, Pekka Enberg wrote:
> On Thu, Dec 9, 2010 at 3:23 AM, Andrew Morton <akpm@linux-foundation.org> wrote:
> > This problem would have got worse when slub came along doing its stupid
> > unnecessary high-order allocations.
> 
> Stupid, maybe, but not unnecessary: they're a performance
> improvement on systems with many CPUs (needed because of the current
> SLUB design). We're scaling the allocation order based on the number
> of CPUs, but maybe we could shrink it even more.
> 

It's conceivable that the GFP_NOKSWAPD patch needs to be taken from
the THP series and applied to slub, but only once slub is ruled out as
the only source of the problem. Right now, it looks like forking
workloads are suffering, which is unrelated to slub.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 14:44     ` Mel Gorman
@ 2010-12-09 18:03       ` Andrew Morton
  2010-12-09 18:48       ` Ying Han
  1 sibling, 0 replies; 35+ messages in thread
From: Andrew Morton @ 2010-12-09 18:03 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Simon Kirby, Johannes Weiner, Rik van Riel, linux-mm, Ying Han

On Thu, 9 Dec 2010 14:44:12 +0000 Mel Gorman <mel@csn.ul.ie> wrote:

> On Wed, Dec 08, 2010 at 05:23:24PM -0800, Andrew Morton wrote:
> > On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
> > 
> > > On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
> > > 
> > > > Kswapd tries to rebalance zones persistently until their high
> > > > watermarks are restored.
> > > > 
> > > > If the amount of unreclaimable pages in a zone makes this impossible
> > > > for reclaim, though, kswapd will end up in a busy loop without a
> > > > chance of reaching its goal.
> > > > 
> > > > This behaviour was observed on a virtual machine with a tiny
> > > > Normal-zone that filled up with unreclaimable slab objects.
> > > > 
> > > > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> > > > leaves them to direct reclaim.
> > > 
> > > Hi!
> > > 
> > > We are experiencing a similar issue, though with a 757 MB Normal zone,
> > > where kswapd tries to rebalance Normal after an order-3 allocation while
> > > page cache allocations (order-0) keep splitting it back up again.  It can
> > > run the whole day like this (SSD storage) without sleeping.
> > 
> > People at google have told me they've seen the same thing.  A fork is
> > taking 15 minutes when someone else is doing a dd, because the fork
> > enters direct-reclaim trying for an order-one page.  It successfully
> > frees some order-one pages but before it gets back to allocate one, dd
> > has gone and stolen them, or split them apart.
> > 
> 
> Is there a known test case for this or should I look at doing a
> streaming-IO test with a basic workload constantly forking in the
> background to measure the fork latency?

(cc yinghan)

> > This problem would have got worse when slub came along doing its stupid
> > unnecessary high-order allocations.
> > 
> > Billions of years ago a direct-reclaimer had a one-deep cache in the
> > task_struct into which it freed the page to prevent it from getting
> > stolen.
> > 
> > Later, we took that out because pages were being freed into the
> > per-cpu-pages magazine, which is effectively task-local anyway.  But
> > per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
> > 
> > I expect that this is happening so repeatably because the
> > direct-reclaimer is doing a sleep somewhere after freeing the pages it
> > needs - if it wasn't doing that then surely the window wouldn't be wide
> > enough for it to happen so often.  But I didn't look.
> > 
> > Suitable fixes might be
> > 
> > a) don't go to sleep after the successful direct-reclaim.
> > 
> 
> I submitted a patch for this a long time ago but at the time we didn't
> have a test case that made a difference to it. Might be worth
> revisiting. I can't find the related patch any more but it was fairly
> trivial.


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  1:23   ` Andrew Morton
                       ` (3 preceding siblings ...)
  2010-12-09 14:44     ` Mel Gorman
@ 2010-12-09 18:39     ` Ying Han
  2010-12-10 11:37       ` Mel Gorman
  4 siblings, 1 reply; 35+ messages in thread
From: Ying Han @ 2010-12-09 18:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Simon Kirby, Johannes Weiner, Mel Gorman, Rik van Riel, linux-mm

On Wed, Dec 8, 2010 at 5:23 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
>
>> On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
>>
>> > Kswapd tries to rebalance zones persistently until their high
>> > watermarks are restored.
>> >
>> > If the amount of unreclaimable pages in a zone makes this impossible
>> > for reclaim, though, kswapd will end up in a busy loop without a
>> > chance of reaching its goal.
>> >
>> > This behaviour was observed on a virtual machine with a tiny
>> > Normal-zone that filled up with unreclaimable slab objects.
>> >
>> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
>> > leaves them to direct reclaim.
>>
>> Hi!
>>
>> We are experiencing a similar issue, though with a 757 MB Normal zone,
>> where kswapd tries to rebalance Normal after an order-3 allocation while
>> page cache allocations (order-0) keep splitting it back up again.  It can
>> run the whole day like this (SSD storage) without sleeping.
>
> People at google have told me they've seen the same thing.  A fork is
> taking 15 minutes when someone else is doing a dd, because the fork
> enters direct-reclaim trying for an order-one page.  It successfully
> frees some order-one pages but before it gets back to allocate one, dd
> has gone and stolen them, or split them apart.

So we are running into this problem in a container environment. While
running dd in a container with a bunch of system daemons like sshd,
we've seen sshd being OOM killed.

One theory, which we haven't fully proven, is that dd keeps allocating
and stealing pages that were just reclaimed by sshd's ttfp
(try_to_free_pages). We've talked with Andrew and wondered if there is
a way to prevent that from happening. We learned that we might have
something for order-0 pages, since they get freed to the per-cpu list
and the process that triggered ttfp is likely to get them back unless
it is rescheduled. But there is nothing for order-1, which is what
fork() needs in this case.
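
For reference, the order-0/order-1 asymmetry is visible right in the
free path (roughly the 2.6.36-era mm/page_alloc.c, quoted from
memory):

        void __free_pages(struct page *page, unsigned int order)
        {
                if (put_page_testzero(page)) {
                        if (order == 0)
                                free_hot_cold_page(page, 0); /* per-cpu list */
                        else
                                __free_pages_ok(page, order); /* straight to buddy */
                }
        }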

--Ying

>
> This problem would have got worse when slub came along doing its stupid
> unnecessary high-order allocations.
>
> Billions of years ago a direct-reclaimer had a one-deep cache in the
> task_struct into which it freed the page to prevent it from getting
> stolen.
>
> Later, we took that out because pages were being freed into the
> per-cpu-pages magazine, which is effectively task-local anyway.  But
> per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
>
> I expect that this is happening so repeatably because the
> direct-reclaimer is doing a sleep somewhere after freeing the pages it
> needs - if it wasn't doing that then surely the window wouldn't be wide
> enough for it to happen so often.  But I didn't look.
>
> Suitable fixes might be
>
> a) don't go to sleep after the successful direct-reclaim.
>
> b) reinstate the one-deep task-local free page cache.
>


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 14:44     ` Mel Gorman
  2010-12-09 18:03       ` Andrew Morton
@ 2010-12-09 18:48       ` Ying Han
  2010-12-10 11:34         ` Mel Gorman
  1 sibling, 1 reply; 35+ messages in thread
From: Ying Han @ 2010-12-09 18:48 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Simon Kirby, Johannes Weiner, Rik van Riel, linux-mm

On Thu, Dec 9, 2010 at 6:44 AM, Mel Gorman <mel@csn.ul.ie> wrote:
> On Wed, Dec 08, 2010 at 05:23:24PM -0800, Andrew Morton wrote:
>> On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
>>
>> > On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
>> >
>> > > Kswapd tries to rebalance zones persistently until their high
>> > > watermarks are restored.
>> > >
>> > > If the amount of unreclaimable pages in a zone makes this impossible
>> > > for reclaim, though, kswapd will end up in a busy loop without a
>> > > chance of reaching its goal.
>> > >
>> > > This behaviour was observed on a virtual machine with a tiny
>> > > Normal-zone that filled up with unreclaimable slab objects.
>> > >
>> > > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
>> > > leaves them to direct reclaim.
>> >
>> > Hi!
>> >
>> > We are experiencing a similar issue, though with a 757 MB Normal zone,
>> > where kswapd tries to rebalance Normal after an order-3 allocation while
>> > page cache allocations (order-0) keep splitting it back up again.  It can
>> > run the whole day like this (SSD storage) without sleeping.
>>
>> People at google have told me they've seen the same thing.  A fork is
>> taking 15 minutes when someone else is doing a dd, because the fork
>> enters direct-reclaim trying for an order-one page.  It successfully
>> frees some order-one pages but before it gets back to allocate one, dd
>> has gone and stolen them, or split them apart.
>>
>
> Is there a known test case for this or should I look at doing a
> streaming-IO test with a basic workload constantly forking in the
> background to measure the fork latency?

We were seeing some system daemons (sshd) being OOM killed while
running in the same memory container as the dd test. I assume we can
generate the test case by running dd on a 10G file in a 1G container
while also running unixbench (fork/exec loop)?

--Ying

>
>> This problem would have got worse when slub came along doing its stupid
>> unnecessary high-order allocations.
>>
>> Billions of years ago a direct-reclaimer had a one-deep cache in the
>> task_struct into which it freed the page to prevent it from getting
>> stolen.
>>
>> Later, we took that out because pages were being freed into the
>> per-cpu-pages magazine, which is effectively task-local anyway.  But
>> per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
>>
>> I expect that this is happening so repeatably because the
>> direct-reclaimer is doing a sleep somewhere after freeing the pages it
>> needs - if it wasn't doing that then surely the window wouldn't be wide
>> enough for it to happen so often.  But I didn't look.
>>
>> Suitable fixes might be
>>
>> a) don't go to sleep after the successful direct-reclaim.
>>
>
> I submitted a patch for this a long time ago but at the time we didn't
> have a test case that made a difference to it. Might be worth
> revisiting. I can't find the related patch any more but it was fairly
> trivial.

If you find the patch, maybe we can give it a try on our case.

--Ying


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-08 15:16 [patch] mm: skip rebalance of hopeless zones Johannes Weiner
                   ` (3 preceding siblings ...)
  2010-12-09  1:29 ` Minchan Kim
@ 2010-12-09 18:51 ` Ying Han
  2010-12-10  7:25   ` KOSAKI Motohiro
  2010-12-10 10:54   ` Johannes Weiner
  4 siblings, 2 replies; 35+ messages in thread
From: Ying Han @ 2010-12-09 18:51 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Andrew Morton, Rik van Riel, linux-mm

On Wed, Dec 8, 2010 at 7:16 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
>
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
>
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.
>
> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/mmzone.h |    2 ++
>  mm/page_alloc.c        |    4 ++--
>  mm/vmscan.c            |   36 ++++++++++++++++++++++++++++--------
>  3 files changed, 32 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 4890662..0cc1d63 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -655,6 +655,8 @@ typedef struct pglist_data {
>  extern struct mutex zonelists_mutex;
>  void build_all_zonelists(void *data);
>  void wakeup_kswapd(struct zone *zone, int order);
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +                        int classzone_idx, int alloc_flags, long free_pages);
>  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>                int classzone_idx, int alloc_flags);
>  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1845a97..c7d2b28 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1458,8 +1458,8 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
>  * Return true if free pages are above 'mark'. This takes into account the order
>  * of the allocation.
>  */
> -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> -                     int classzone_idx, int alloc_flags, long free_pages)
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +                        int classzone_idx, int alloc_flags, long free_pages)
>  {
> >        /* free_pages may go negative - that's OK */
>        long min = mark;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 42a4859..5623f36 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>  }
>  #endif
>
> +static bool zone_needs_scan(struct zone *zone, int order,
> +                           unsigned long goal, int classzone_idx)
> +{
> +       unsigned long free, prospect;
> +
> +       free = zone_page_state(zone, NR_FREE_PAGES);
> +       if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
> +               free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
> +
> +       if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
> +               return false;
> +       /*
> +        * Ensure that the watermark is at all restorable through
> +        * reclaim.  Otherwise, leave the zone to direct reclaim.
> +        */
> +       prospect = free + zone_reclaimable_pages(zone);
> +       return prospect >= goal;
> +}
> +
>  /* is kswapd sleeping prematurely? */
>  static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>  {
> @@ -2210,8 +2229,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>                if (zone->all_unreclaimable)
>                        continue;
>
> -               if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
> -                                                               0, 0))
> +               if (zone_needs_scan(zone, order, high_wmark_pages(zone), 0))
>                        return 1;
>        }
>
> @@ -2282,6 +2300,7 @@ loop_again:
>                 */
>                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
>                        struct zone *zone = pgdat->node_zones + i;
> +                       unsigned long goal;
>
>                        if (!populated_zone(zone))
>                                continue;
> @@ -2297,8 +2316,8 @@ loop_again:
>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>                                                        &sc, priority, 0);
>
> -                       if (!zone_watermark_ok_safe(zone, order,
> -                                       high_wmark_pages(zone), 0, 0)) {
> +                       goal = high_wmark_pages(zone);
> +                       if (zone_needs_scan(zone, order, goal, 0)) {
>                                end_zone = i;
>                                break;
>                        }
> @@ -2323,6 +2342,7 @@ loop_again:
>                 */
>                for (i = 0; i <= end_zone; i++) {
>                        struct zone *zone = pgdat->node_zones + i;
> +                       unsigned long goal;
>                        int nr_slab;
>
>                        if (!populated_zone(zone))
> @@ -2339,12 +2359,13 @@ loop_again:
>                         */
>                        mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
>
> +                       goal = high_wmark_pages(zone);
>                        /*
>                         * We put equal pressure on every zone, unless one
>                         * zone has way too many pages free already.
>                         */
>                        if (!zone_watermark_ok_safe(zone, order,
> -                                       8*high_wmark_pages(zone), end_zone, 0))
> +                                                   8 * goal, end_zone, 0))
>                                shrink_zone(priority, zone, &sc);
>                        reclaim_state->reclaimed_slab = 0;
>                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> @@ -2373,8 +2394,7 @@ loop_again:
>                                compact_zone_order(zone, sc.order, sc.gfp_mask,
>                                                        false);
>
> -                       if (!zone_watermark_ok_safe(zone, order,
> -                                       high_wmark_pages(zone), end_zone, 0)) {
> +                       if (zone_needs_scan(zone, order, goal, end_zone)) {
>                                all_zones_ok = 0;
>                                /*
>                                 * We are still under min water mark.  This
> @@ -2587,7 +2607,7 @@ void wakeup_kswapd(struct zone *zone, int order)
>                pgdat->kswapd_max_order = order;
>        if (!waitqueue_active(&pgdat->kswapd_wait))
>                return;
> -       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
> +       if (!zone_needs_scan(zone, order, low_wmark_pages(zone), 0))
>                return;
>
>        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);


So we look only at zone_reclaimable_pages() to decide whether to
proceed with reclaiming or not. What if I have tons of unused dentry
and inode caches and we are skipping the shrinker here?

--Ying




* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  0:04   ` Johannes Weiner
@ 2010-12-09 21:17     ` Andrew Morton
  2010-12-10 16:27       ` Johannes Weiner
  2011-01-04 23:56     ` Andrew Morton
  1 sibling, 1 reply; 35+ messages in thread
From: Andrew Morton @ 2010-12-09 21:17 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Rik van Riel, linux-mm

On Thu, 9 Dec 2010 01:04:40 +0100
Johannes Weiner <hannes@cmpxchg.org> wrote:

> On Wed, Dec 08, 2010 at 02:19:09PM -0800, Andrew Morton wrote:
> > On Wed,  8 Dec 2010 16:16:59 +0100
> > Johannes Weiner <hannes@cmpxchg.org> wrote:
> > 
> > > Kswapd tries to rebalance zones persistently until their high
> > > watermarks are restored.
> > > 
> > > If the amount of unreclaimable pages in a zone makes this impossible
> > > for reclaim, though, kswapd will end up in a busy loop without a
> > > chance of reaching its goal.
> > > 
> > > This behaviour was observed on a virtual machine with a tiny
> > > Normal-zone that filled up with unreclaimable slab objects.
> > 
> > Doesn't this mean that vmscan is incorrectly handling its
> > zone->all_unreclaimable logic?
> 
> I don't think so.  What leads to the problem is that we only declare a
> zone unreclaimable after a lot of work, but reset it with a single
> page that gets released back to the allocator (past the pcp queue,
> that is).
> 
> That's probably a good idea per se - we don't want to leave a zone
> behind and retry it eagerly when pages are freed up.
> 
> > presumably in certain cases that's a bit more efficient than doing the
> > scan and using ->all_unreclaimable.  But the scanner shouldn't have got
> > stuck!  That's a regression which got added, and I don't think that new
> > code of this nature was needed to fix that regression.
> 
> I'll dig through the history.  But we observed this on a very odd
> configuration (24MB ZONE_NORMAL); maybe this was never hit before?
> 
> > Did this zone end up with ->all_unreclaimable set?  If so, why was
> > kswapd stuck in a loop scanning an all-unreclaimable zone?
> 
> It wasn't.  This state is just not very sticky.  After all, the zone
> is not all_unreclaimable, just not reclaimable enough to restore the
> high watermark.  But the remaining reclaimable pages of that zone may
> very well be in constant flux.

It's bothersome that we have two mechanisms for doing pretty much the
same thing.
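
For reference, the non-sticky reset Johannes describes is the
unconditional clearing on the free path (roughly the 2.6.36-era
mm/page_alloc.c, quoted from memory; free_one_page() does the same):

        static void free_pcppages_bulk(struct zone *zone, int count,
                                        struct per_cpu_pages *pcp)
        {
                spin_lock(&zone->lock);
                /* a single freed page makes the zone 'reclaimable' again */
                zone->all_unreclaimable = 0;
                zone->pages_scanned = 0;
                ...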

> > Also, if I'm understanding the new logic then if the "goal" is 100
> > pages and zone_reclaimable_pages() says "50 pages potentially
> > reclaimable" then kswapd won't reclaim *any* pages.  If so, is that
> > good behaviour?  Should we instead attempt to reclaim some of those 50
> > pages and then give up?  That sounds like a better strategy if we want
> > to keep (say) network Rx happening in a tight memory situation.
> 
> Yes, that is probably a good idea.  I'll see that this is improved for
> atomic allocators.

Does that mean we can expect a v2?


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 18:51 ` Ying Han
@ 2010-12-10  7:25   ` KOSAKI Motohiro
  2010-12-10  7:37     ` KOSAKI Motohiro
  2010-12-10 10:54   ` Johannes Weiner
  1 sibling, 1 reply; 35+ messages in thread
From: KOSAKI Motohiro @ 2010-12-10  7:25 UTC (permalink / raw)
  To: Ying Han
  Cc: kosaki.motohiro, Johannes Weiner, Andrew Morton, Rik van Riel, linux-mm

> So we look only at zone_reclaimable_pages() to decide whether to
> proceed with reclaiming or not. What if I have tons of unused dentry
> and inode caches and we are skipping the shrinker here?
> 
> --Ying

Good catch!
I perfectly agree with you.




* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-10  7:25   ` KOSAKI Motohiro
@ 2010-12-10  7:37     ` KOSAKI Motohiro
  0 siblings, 0 replies; 35+ messages in thread
From: KOSAKI Motohiro @ 2010-12-10  7:37 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Ying Han, Johannes Weiner, Andrew Morton, Rik van Riel, linux-mm

> > So we look only at zone_reclaimable_pages() to decide whether to
> > proceed with reclaiming or not. What if I have tons of unused dentry
> > and inode caches and we are skipping the shrinker here?
> > 
> > --Ying
> 
> Good catch!
> I perfectly agree with you.

The problem is that the number of reclaimable slab objects doesn't
give us any information. They are frequently pinned and unreclaimable.
That's one of the reasons we now try reclaim only when
priority == DEF_PRIORITY, even if all_unreclaimable == 1.

Should the slab shrinker implement an all_unreclaimable heuristic, too?
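
For context, a shrinker of that era can only report a raw object count
when polled, with no reclaimability information attached (a sketch of
the ~2.6.36 contract from memory; the cache and its helpers are
hypothetical):

        static int my_cache_shrink(struct shrinker *s, int nr_to_scan,
                                   gfp_t gfp_mask)
        {
                if (nr_to_scan)
                        prune_my_cache(nr_to_scan);     /* hypothetical */
                /*
                 * The returned count includes pinned objects, so the
                 * caller cannot tell how much is really freeable.
                 */
                return my_cache_count();                /* hypothetical */
        }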




* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 18:51 ` Ying Han
  2010-12-10  7:25   ` KOSAKI Motohiro
@ 2010-12-10 10:54   ` Johannes Weiner
  1 sibling, 0 replies; 35+ messages in thread
From: Johannes Weiner @ 2010-12-10 10:54 UTC (permalink / raw)
  To: Ying Han; +Cc: Andrew Morton, Rik van Riel, linux-mm

On Thu, Dec 09, 2010 at 10:51:40AM -0800, Ying Han wrote:
> On Wed, Dec 8, 2010 at 7:16 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> > @@ -2587,7 +2607,7 @@ void wakeup_kswapd(struct zone *zone, int order)
> >                pgdat->kswapd_max_order = order;
> >        if (!waitqueue_active(&pgdat->kswapd_wait))
> >                return;
> > -       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
> > +       if (!zone_needs_scan(zone, order, low_wmark_pages(zone), 0))
> >                return;
> >
> >        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
> 
> So we look only at zone_reclaimable_pages() to decide whether to
> proceed with reclaiming or not. What if I have tons of unused dentry
> and inode caches and we are skipping the shrinker here?

We have no straightforward way of asking that (yet - per-zone
shrinkers may change this?), so the zone is left for direct reclaim to
figure out.

Forcing allocators into direct reclaim more often is still better than
having kswapd run wild.


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 18:48       ` Ying Han
@ 2010-12-10 11:34         ` Mel Gorman
  0 siblings, 0 replies; 35+ messages in thread
From: Mel Gorman @ 2010-12-10 11:34 UTC (permalink / raw)
  To: Ying Han
  Cc: Andrew Morton, Simon Kirby, Johannes Weiner, Rik van Riel, linux-mm

On Thu, Dec 09, 2010 at 10:48:37AM -0800, Ying Han wrote:
> On Thu, Dec 9, 2010 at 6:44 AM, Mel Gorman <mel@csn.ul.ie> wrote:
> > On Wed, Dec 08, 2010 at 05:23:24PM -0800, Andrew Morton wrote:
> >> On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
> >>
> >> > On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
> >> >
> >> > > Kswapd tries to rebalance zones persistently until their high
> >> > > watermarks are restored.
> >> > >
> >> > > If the amount of unreclaimable pages in a zone makes this impossible
> >> > > for reclaim, though, kswapd will end up in a busy loop without a
> >> > > chance of reaching its goal.
> >> > >
> >> > > This behaviour was observed on a virtual machine with a tiny
> >> > > Normal-zone that filled up with unreclaimable slab objects.
> >> > >
> >> > > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> >> > > leaves them to direct reclaim.
> >> >
> >> > Hi!
> >> >
> >> > We are experiencing a similar issue, though with a 757 MB Normal zone,
> >> > where kswapd tries to rebalance Normal after an order-3 allocation while
> >> > page cache allocations (order-0) keep splitting it back up again.  It can
> >> > run the whole day like this (SSD storage) without sleeping.
> >>
> >> People at google have told me they've seen the same thing.  A fork is
> >> taking 15 minutes when someone else is doing a dd, because the fork
> >> enters direct-reclaim trying for an order-one page.  It successfully
> >> frees some order-one pages but before it gets back to allocate one, dd
> >> has gone and stolen them, or split them apart.
> >>
> >
> > Is there a known test case for this or should I look at doing a
> > streaming-IO test with a basic workload constantly forking in the
> > background to measure the fork latency?
> 
> We were seeing some system daemons(sshd) being OOM killed while
> running in the same
> memory container as dd test. I assume we can generate the test case
> while running dd on
> 10G of file in 1G container, at the same time running
> unixbench(fork/exec loop)?
> 

A unixbench fork/exec loop won't tell us the latency of each
individual operation. If order-1 really is the problem, we should see
a large standard deviation between fork/exec attempts. A custom test
of some sort is probably required; a rough sketch follows.
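
Something along these lines might do as a starting point (a
hypothetical harness, untested; run it while the streaming dd is
going):

        /* fork-lat.c: time each fork() and report mean/stddev in usecs. */
        #include <math.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/wait.h>
        #include <time.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                int i, n = argc > 1 ? atoi(argv[1]) : 1000;
                double sum = 0.0, sumsq = 0.0, mean, var;

                for (i = 0; i < n; i++) {
                        struct timespec t0, t1;
                        double us;
                        pid_t pid;

                        clock_gettime(CLOCK_MONOTONIC, &t0);
                        pid = fork();
                        if (pid == 0)
                                _exit(0);       /* child exits immediately */
                        clock_gettime(CLOCK_MONOTONIC, &t1);
                        if (pid < 0) {
                                perror("fork");
                                return 1;
                        }
                        waitpid(pid, NULL, 0);

                        us = (t1.tv_sec - t0.tv_sec) * 1e6 +
                             (t1.tv_nsec - t0.tv_nsec) / 1e3;
                        sum += us;
                        sumsq += us * us;
                }
                mean = sum / n;
                var = sumsq / n - mean * mean;
                printf("forks=%d mean=%.1fus stddev=%.1fus\n",
                       n, mean, sqrt(var > 0 ? var : 0));
                return 0;
        }

Build with 'gcc -O2 fork-lat.c -o fork-lat -lm -lrt'.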

> >
> >> This problem would have got worse when slub came along doing its stupid
> >> unnecessary high-order allocations.
> >>
> >> Billions of years ago a direct-reclaimer had a one-deep cache in the
> >> task_struct into which it freed the page to prevent it from getting
> >> stolen.
> >>
> >> Later, we took that out because pages were being freed into the
> >> per-cpu-pages magazine, which is effectively task-local anyway.  But
> >> per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
> >>
> >> I expect that this is happening so repeatably because the
> >> direct-reclaimer is doing a sleep somewhere after freeing the pages it
> >> needs - if it wasn't doing that then surely the window wouldn't be wide
> >> enough for it to happen so often.  But I didn't look.
> >>
> >> Suitable fixes might be
> >>
> >> a) don't go to sleep after the successful direct-reclaim.
> >>
> >
> > I submitted a patch for this a long time ago but at the time we didn't
> > have a test case that made a difference to it. Might be worth
> > revisiting. I can't find the related patch any more but it was fairly
> > trivial.
> 
> If you find the patch, maybe we can give it a try on our case.
> 

I'll cobble one together early next week.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 18:39     ` Ying Han
@ 2010-12-10 11:37       ` Mel Gorman
  2010-12-10 19:46         ` Ying Han
  0 siblings, 1 reply; 35+ messages in thread
From: Mel Gorman @ 2010-12-10 11:37 UTC (permalink / raw)
  To: Ying Han
  Cc: Andrew Morton, Simon Kirby, Johannes Weiner, Rik van Riel, linux-mm

On Thu, Dec 09, 2010 at 10:39:46AM -0800, Ying Han wrote:
> On Wed, Dec 8, 2010 at 5:23 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
> > On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
> >
> >> On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
> >>
> >> > Kswapd tries to rebalance zones persistently until their high
> >> > watermarks are restored.
> >> >
> >> > If the amount of unreclaimable pages in a zone makes this impossible
> >> > for reclaim, though, kswapd will end up in a busy loop without a
> >> > chance of reaching its goal.
> >> >
> >> > This behaviour was observed on a virtual machine with a tiny
> >> > Normal-zone that filled up with unreclaimable slab objects.
> >> >
> >> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> >> > leaves them to direct reclaim.
> >>
> >> Hi!
> >>
> >> We are experiencing a similar issue, though with a 757 MB Normal zone,
> >> where kswapd tries to rebalance Normal after an order-3 allocation while
> >> page cache allocations (order-0) keep splitting it back up again.  It can
> >> run the whole day like this (SSD storage) without sleeping.
> >
> > People at google have told me they've seen the same thing.  A fork is
> > taking 15 minutes when someone else is doing a dd, because the fork
> > enters direct-reclaim trying for an order-one page.  It successfully
> > frees some order-one pages but before it gets back to allocate one, dd
> > has gone and stolen them, or split them apart.
> 
> So we are running into this problem in a container environment. While
> running dd in a container with a bunch of system daemons like sshd,
> we've seen sshd being OOM killed.
> 

It's possible that containers are *particularly* vulnerable to this
problem because they don't have kswapd. As direct reclaimers go to
sleep, the race between an order-1 page being freed and another request
breaking up the order-1 page might be far more severe.

> One theory, which we haven't fully proven, is that dd keeps allocating
> and stealing pages that were just reclaimed by sshd's ttfp
> (try_to_free_pages). We've talked with Andrew and wondered if there is
> a way to prevent that from happening. We learned that we might have
> something for order-0 pages, since they get freed to the per-cpu list
> and the process that triggered ttfp is likely to get them back unless
> it is rescheduled. But there is nothing for order-1, which is what
> fork() needs in this case.
> 
> --Ying
> 
> >
> > This problem would have got worse when slub came along doing its stupid
> > unnecessary high-order allocations.
> >
> > Billions of years ago a direct-reclaimer had a one-deep cache in the
> > task_struct into which it freed the page to prevent it from getting
> > stolen.
> >
> > Later, we took that out because pages were being freed into the
> > per-cpu-pages magazine, which is effectively task-local anyway.  But
> > per-cpu-pages are only for order-0 pages.  See slub stupidity, above.
> >
> > I expect that this is happening so repeatably because the
> > direct-reclaimer is doing a sleep somewhere after freeing the pages it
> > needs - if it wasn't doing that then surely the window wouldn't be wide
> > enough for it to happen so often.  But I didn't look.
> >
> > Suitable fixes might be
> >
> > a) don't go to sleep after the successful direct-reclaim.
> >
> > b) reinstate the one-deep task-local free page cache.
> >
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09 21:17     ` Andrew Morton
@ 2010-12-10 16:27       ` Johannes Weiner
  2011-01-05 11:15         ` Johannes Weiner
  0 siblings, 1 reply; 35+ messages in thread
From: Johannes Weiner @ 2010-12-10 16:27 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, linux-mm

On Thu, Dec 09, 2010 at 01:17:23PM -0800, Andrew Morton wrote:
> Does that mean we can expect a v2?

Ok, while comparing Mel's patches with this change on IRC, I realized
that the enterprise kernel the issue was reported against is lacking
'de3fab3 vmscan: kswapd: don't retry balance_pgdat() if all zones are
unreclaimable'.

My patch fixed the observed malfunction, of course, but Occam's Razor
suggests that de3fab3 will do so, too.  I'll verify that, but I don't
expect to send another version of this patch.

Sorry for the noise.

	Hannes


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-10 11:37       ` Mel Gorman
@ 2010-12-10 19:46         ` Ying Han
  0 siblings, 0 replies; 35+ messages in thread
From: Ying Han @ 2010-12-10 19:46 UTC (permalink / raw)
  To: Mel Gorman, Rik van Riel
  Cc: Andrew Morton, Simon Kirby, Johannes Weiner, linux-mm

On Fri, Dec 10, 2010 at 3:37 AM, Mel Gorman <mel@csn.ul.ie> wrote:
> On Thu, Dec 09, 2010 at 10:39:46AM -0800, Ying Han wrote:
>> On Wed, Dec 8, 2010 at 5:23 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
>> > On Wed, 8 Dec 2010 16:36:21 -0800 Simon Kirby <sim@hostway.ca> wrote:
>> >
>> >> On Wed, Dec 08, 2010 at 04:16:59PM +0100, Johannes Weiner wrote:
>> >>
>> >> > Kswapd tries to rebalance zones persistently until their high
>> >> > watermarks are restored.
>> >> >
>> >> > If the amount of unreclaimable pages in a zone makes this impossible
>> >> > for reclaim, though, kswapd will end up in a busy loop without a
>> >> > chance of reaching its goal.
>> >> >
>> >> > This behaviour was observed on a virtual machine with a tiny
>> >> > Normal-zone that filled up with unreclaimable slab objects.
>> >> >
>> >> > This patch makes kswapd skip rebalancing on such 'hopeless' zones and
>> >> > leaves them to direct reclaim.
>> >>
>> >> Hi!
>> >>
>> >> We are experiencing a similar issue, though with a 757 MB Normal zone,
>> >> where kswapd tries to rebalance Normal after an order-3 allocation while
>> >> page cache allocations (order-0) keep splitting it back up again.  It can
>> >> run the whole day like this (SSD storage) without sleeping.
>> >
>> > People at google have told me they've seen the same thing.  A fork is
>> > taking 15 minutes when someone else is doing a dd, because the fork
>> > enters direct-reclaim trying for an order-one page.  It successfully
>> > frees some order-one pages but before it gets back to allocate one, dd
>> > has gone and stolen them, or split them apart.
>>
>> So we are running into this problem in a container environment. While
>> running dd in a container with a bunch of system daemons like sshd,
>> we've seen sshd being OOM killed.
>>
>
> It's possible that containers are *particularly* vulnerable to this
> problem because they don't have kswapd.
In our fake NUMA environment, we do have per-container kswapds: the
ones for the nodes in the container's nodemask. We also have an
extension that consolidates all kswapds per container, due to bad lock
contention.

> As direct reclaimers go to sleep, the race between an order-1 page
> being freed and another request breaking up the order-1 page might be
> far more severe.

One thing we found that affects the OOM is the logic in
inactive_file_is_low_global(), which tries to balance Active/Inactive
at 50%. Pages promoted to the Active list (dirty data) are safe from
being reclaimed until the LRU becomes unbalanced. So for streaming IO,
we end up with pages on the Active list that won't be used again and
won't be scanned by page reclaim either.
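
For reference, the global check is roughly this (2.6.36-era
mm/vmscan.c, quoted from memory):

        static int inactive_file_is_low_global(struct zone *zone)
        {
                unsigned long active, inactive;

                active = zone_page_state(zone, NR_ACTIVE_FILE);
                inactive = zone_page_state(zone, NR_INACTIVE_FILE);

                /* only deactivate once Active outgrows Inactive */
                return active > inactive;
        }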

--Ying




>
>> One theory, which we haven't fully proven, is that dd keeps allocating
>> and stealing pages that were just reclaimed by sshd's ttfp
>> (try_to_free_pages). We've talked with Andrew and wondered if there is
>> a way to prevent that from happening. We learned that we might have
>> something for order-0 pages, since they get freed to the per-cpu list
>> and the process that triggered ttfp is likely to get them back unless
>> it is rescheduled. But there is nothing for order-1, which is what
>> fork() needs in this case.
>>
>> --Ying
>>
>> >
>> > This problem would have got worse when slub came along doing its stupid
>> > unnecessary high-order allocations.
>> >
>> > Billions of years ago a direct-reclaimer had a one-deep cache in the
>> > task_struct into which it freed the page to prevent it from getting
>> > stolen.
>> >
>> > Later, we took that out because pages were being freed into the
>> > per-cpu-pages magazine, which is effectively task-local anyway.  But
>> > per-cpu-pages are only for order-0 pages.  See slub stupidity, above


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-09  0:04   ` Johannes Weiner
  2010-12-09 21:17     ` Andrew Morton
@ 2011-01-04 23:56     ` Andrew Morton
  1 sibling, 0 replies; 35+ messages in thread
From: Andrew Morton @ 2011-01-04 23:56 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Rik van Riel, linux-mm

On Thu, 9 Dec 2010 01:04:40 +0100
Johannes Weiner <hannes@cmpxchg.org> wrote:

> On Wed, Dec 08, 2010 at 02:19:09PM -0800, Andrew Morton wrote:
> > On Wed,  8 Dec 2010 16:16:59 +0100
> > Johannes Weiner <hannes@cmpxchg.org> wrote:
> > 
> > > Kswapd tries to rebalance zones persistently until their high
> > > watermarks are restored.

So we still haven't fixed this.

> > > If the amount of unreclaimable pages in a zone makes this impossible
> > > for reclaim, though, kswapd will end up in a busy loop without a
> > > chance of reaching its goal.
> > > 
> > > This behaviour was observed on a virtual machine with a tiny
> > > Normal-zone that filled up with unreclaimable slab objects.
> > 
> > Doesn't this mean that vmscan is incorrectly handling its
> > zone->all_unreclaimable logic?
> 
> I don't think so.  What leads to the problem is that we only declare a
> zone unreclaimable after a lot of work, but reset it with a single
> page that gets released back to the allocator (past the pcp queue,
> that is).
> 
> That's probably a good idea per se - we don't want to leave a zone
> behind and retry it eagerly when pages are freed up.
> 
> > presumably in certain cases that's a bit more efficient than doing the
> > scan and using ->all_unreclaimable.  But the scanner shouldn't have got
> > stuck!  That's a regression which got added, and I don't think that new
> > code of this nature was needed to fix that regression.
> 
> I'll dig through the history.  But we observed this on a very odd
> configuration (24MB ZONE_NORMAL); maybe this was never hit before?

I expect scenarios like this _were_ tested, back in the day.  More
usually with a highmem zone which is much smaller than the normal zone.

> > Did this zone end up with ->all_unreclaimable set?  If so, why was
> > kswapd stuck in a loop scanning an all-unreclaimable zone?
> 
> It wasn't.  This state is just not very sticky.  After all, the zone
> is not all_unreclaimable, just not reclaimable enough to restore the
> high watermark.  But the remaining reclaimable pages of that zone may
> very well be in constant flux.

Perhaps this was caused by the breakage of the prev_priority logic. 
With prev_priority we'd only do a small amount of scanning against that
zone before declaring that it is still all_unreclaimable.

> > Also, if I'm understanding the new logic then if the "goal" is 100
> > pages and zone_reclaimable_pages() says "50 pages potentially
> > reclaimable" then kswapd won't reclaim *any* pages.  If so, is that
> > good behaviour?  Should we instead attempt to reclaim some of those 50
> > pages and then give up?  That sounds like a better strategy if we want
> > to keep (say) network Rx happening in a tight memory situation.
> 
> Yes, that is probably a good idea.  I'll see that this is improved for
> atomic allocators.

Having rethought this, it still feels to me that we'd be implementing two
ways of doing basically the same thing.


* Re: [patch] mm: skip rebalance of hopeless zones
  2010-12-10 16:27       ` Johannes Weiner
@ 2011-01-05 11:15         ` Johannes Weiner
  0 siblings, 0 replies; 35+ messages in thread
From: Johannes Weiner @ 2011-01-05 11:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, linux-mm

Andrew,

On Fri, Dec 10, 2010 at 05:27:06PM +0100, Johannes Weiner wrote:
> On Thu, Dec 09, 2010 at 01:17:23PM -0800, Andrew Morton wrote:
> > Does that mean we can expect a v2?
> 
> Ok, while comparing Mel's patches with this change on IRC, I realized
> that the enterprise kernel the issue was reported against is lacking
> 'de3fab3 vmscan: kswapd: don't retry balance_pgdat() if all zones are
> unreclaimable'.
> 
> The above change fixed the observed malfunction of course, but Occam's
> Razor suggests that de3fab3 will do so, too.  I'll verify that, but I
> don't expect to send another version of this patch.

The problem is not reproducible on a kernel with de3fab3 applied.  You
were right from the start: it was a bug in the all_unreclaimable code.

The hopeless zone patch fixed the bug as well.  So I had a problem, a
working fix for it, and a broken mental image of the code that had me
convinced the all_unreclaimable logic was just not enough.

Maybe there is still a corner case where the all_unreclaimable logic
falls apart, but unless that happens in practice, I don't think there
is any reason to pursue this further.

> Sorry for the noise.
> 
> 	Hannes



Thread overview: 35+ messages
2010-12-08 15:16 [patch] mm: skip rebalance of hopeless zones Johannes Weiner
2010-12-08 18:05 ` Rik van Riel
2010-12-08 22:19 ` Andrew Morton
2010-12-09  0:04   ` Johannes Weiner
2010-12-09 21:17     ` Andrew Morton
2010-12-10 16:27       ` Johannes Weiner
2011-01-05 11:15         ` Johannes Weiner
2011-01-04 23:56     ` Andrew Morton
2010-12-09  0:47   ` Rik van Riel
2010-12-09 14:34   ` Mel Gorman
2010-12-09  0:36 ` Simon Kirby
2010-12-09  0:49   ` Rik van Riel
2010-12-09  1:08     ` Simon Kirby
2010-12-09 14:42       ` Mel Gorman
2010-12-09  1:23   ` Andrew Morton
2010-12-09  1:55     ` Minchan Kim
2010-12-09  1:57       ` Minchan Kim
2010-12-09  2:01       ` Andrew Morton
2010-12-09  2:19         ` Minchan Kim
2010-12-09  5:18         ` Minchan Kim
2010-12-09  2:05     ` Simon Kirby
2010-12-09  8:55     ` Pekka Enberg
2010-12-09 14:46       ` Mel Gorman
2010-12-09 14:44     ` Mel Gorman
2010-12-09 18:03       ` Andrew Morton
2010-12-09 18:48       ` Ying Han
2010-12-10 11:34         ` Mel Gorman
2010-12-09 18:39     ` Ying Han
2010-12-10 11:37       ` Mel Gorman
2010-12-10 19:46         ` Ying Han
2010-12-09  1:29 ` Minchan Kim
2010-12-09 18:51 ` Ying Han
2010-12-10  7:25   ` KOSAKI Motohiro
2010-12-10  7:37     ` KOSAKI Motohiro
2010-12-10 10:54   ` Johannes Weiner
