* [patch] mm, numa: reclaim from all nodes within reclaim distance
@ 2012-09-18  7:03 David Rientjes
  2012-09-18 21:03 ` Andrew Morton
  2012-09-19 23:46 ` Andrew Morton
  0 siblings, 2 replies; 5+ messages in thread
From: David Rientjes @ 2012-09-18  7:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Minchan Kim, KAMEZAWA Hiroyuki, linux-kernel, linux-mm

RECLAIM_DISTANCE represents the distance between nodes at which it is
deemed too costly to allocate from; it's preferred to try to reclaim from
a local zone before falling back to allocating on a remote node with such
a distance.

To do this, zone_reclaim_mode is set if the distance between any two
nodes on the system is greater than this distance.  This, however, ends
up causing the page allocator to reclaim from every zone regardless of
its affinity.

What we really want is to reclaim only from zones that are closer than
RECLAIM_DISTANCE.  This patch adds a nodemask to each node that
represents the set of nodes that are within this distance.  During the
zone iteration, if the local node's bit is set in the reclaim_nodes mask
of the zone's node, then reclaim is attempted; otherwise, the zone is
skipped.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mmzone.h |    1 +
 mm/page_alloc.c        |   31 ++++++++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -704,6 +704,7 @@ typedef struct pglist_data {
 	unsigned long node_spanned_pages; /* total size of physical page
 					     range, including holes */
 	int node_id;
+	nodemask_t reclaim_nodes;	/* Nodes allowed to reclaim from */
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1782,6 +1782,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 }
 
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
+}
+
 #else	/* CONFIG_NUMA */
 
 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1802,6 +1807,11 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 static void zlc_clear_zones_full(struct zonelist *zonelist)
 {
 }
+
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+	return true;
+}
 #endif	/* CONFIG_NUMA */
 
 /*
@@ -1886,7 +1896,8 @@ zonelist_scan:
 				did_zlc_setup = 1;
 			}
 
-			if (zone_reclaim_mode == 0)
+			if (zone_reclaim_mode == 0 ||
+			    !zone_allows_reclaim(preferred_zone, zone))
 				goto this_zone_full;
 
 			/*
@@ -3328,21 +3339,13 @@ static void build_zonelists(pg_data_t *pgdat)
 	j = 0;
 
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
-
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (distance != node_distance(local_node, prev_node))
+		if (node_distance(local_node, node) !=
+		    node_distance(local_node, prev_node))
 			node_load[node] = load;
 
 		prev_node = node;
@@ -4515,12 +4518,18 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	int i;
 
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
 
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
+	for_each_online_node(i)
+		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
+			node_set(i, pgdat->reclaim_nodes);
+			zone_reclaim_mode = 1;
+		}
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);


* Re: [patch] mm, numa: reclaim from all nodes within reclaim distance
  2012-09-18  7:03 [patch] mm, numa: reclaim from all nodes within reclaim distance David Rientjes
@ 2012-09-18 21:03 ` Andrew Morton
  2012-09-18 21:44   ` David Rientjes
  2012-09-19 23:46 ` Andrew Morton
  1 sibling, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2012-09-18 21:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Minchan Kim, KAMEZAWA Hiroyuki, linux-kernel, linux-mm

On Tue, 18 Sep 2012 00:03:57 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> RECLAIM_DISTANCE represents the distance between nodes at which it is
> deemed too costly to allocate from; it's preferred to try to reclaim from
> a local zone before falling back to allocating on a remote node with such
> a distance.
> 
> To do this, zone_reclaim_mode is set if the distance between any two
> nodes on the system is greater than this distance.  This, however, ends
> up causing the page allocator to reclaim from every zone regardless of
> its affinity.
> 
> What we really want is to reclaim only from zones that are closer than
> RECLAIM_DISTANCE.  This patch adds a nodemask to each node that
> represents the set of nodes that are within this distance.  During the
> zone iteration, if the local node's bit is set in the reclaim_nodes mask
> of the zone's node, then reclaim is attempted; otherwise, the zone is
> skipped.

Is this a theoretical thing, or does the patch have real observable
effects?

This change makes it more important that the arch code implements
node_distance() accurately (wrt RECLAIM_DISTANCE), yes?  I wonder how
much code screwed that up, and what the effects of such a screwup would
be, and how arch maintainers would go about detecting then fixing such
an error?



* Re: [patch] mm, numa: reclaim from all nodes within reclaim distance
  2012-09-18 21:03 ` Andrew Morton
@ 2012-09-18 21:44   ` David Rientjes
  0 siblings, 0 replies; 5+ messages in thread
From: David Rientjes @ 2012-09-18 21:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Minchan Kim, KAMEZAWA Hiroyuki, linux-kernel, linux-mm

On Tue, 18 Sep 2012, Andrew Morton wrote:

> > RECLAIM_DISTANCE represents the distance between nodes at which it is
> > deemed too costly to allocate from; it's preferred to try to reclaim from
> > a local zone before falling back to allocating on a remote node with such
> > a distance.
> > 
> > To do this, zone_reclaim_mode is set if the distance between any two
> > nodes on the system is greater than this distance.  This, however, ends
> > up causing the page allocator to reclaim from every zone regardless of
> > its affinity.
> > 
> > What we really want is to reclaim only from zones that are closer than
> > RECLAIM_DISTANCE.  This patch adds a nodemask to each node that
> > represents the set of nodes that are within this distance.  During the
> > zone iteration, if the local node's bit is set in the reclaim_nodes mask
> > of the zone's node, then reclaim is attempted; otherwise, the zone is
> > skipped.
> 
> Is this a theoretical thing, or does the patch have real observable
> effects?
> 

In its current state, this is for correctness, and it could have an 
observable effect on a system where node_distance(a, b) doesn't accurately 
represent the access latency between the two nodes relative to a local 
access.  On x86, this would mean a SLIT that isn't representative of the 
physical topology, which tends to be fairly common; it matters because 
RECLAIM_DISTANCE > REMOTE_DISTANCE, so only distances inflated beyond the 
default remote value change the page allocator's behavior.

> This change makes it more important that the arch code implements
> node_distance() accurately (wrt RECLAIM_DISTANCE), yes?  I wonder how
> much code screwed that up, and what the effects of such a screwup would
> be, and how arch maintainers would go about detecting then fixing such
> an error?
> 

My solution is to get rid of RECLAIM_DISTANCE entirely based on two 
assertions:

 - we don't want to encode any arch-dependent zone reclaiming behavior 
   into the VM, i.e. we don't want mips to hack a node_distance() 
   implementation or RECLAIM_DISTANCE value to change the behavior of the 
   page allocator as a workaround for a bigger problem, and

 - there's no unifying unit and scale to measure when we should reclaim 
   locally or allocate remotely across all architectures and, even if 
   there was, we probably shouldn't trust it to be correct.

So we declare generically that RECLAIM_DISTANCE is 30, and that is what is 
used on x86, where this information is determined by the SLIT; the ACPI 
specification states that SLIT values are relative to a LOCAL_DISTANCE of 
10.  Thus, on x86, the VM policy (after this patch) is to prefer 
reclaiming locally over allocating from a remote zone whose memory latency 
is three times that of a local access or more.
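
For reference, the kernel exports its view of these distances through 
sysfs, so it's easy to check from userspace which remote nodes would fall 
outside RECLAIM_DISTANCE on a given system.  A minimal sketch (assuming 
node IDs are contiguous from 0):

/*
 * Read /sys/devices/system/node/nodeN/distance and report any node
 * pairs whose distance exceeds RECLAIM_DISTANCE (30).
 */
#include <stdio.h>

#define RECLAIM_DISTANCE	30

int main(void)
{
	int nid;

	for (nid = 0; ; nid++) {
		char path[64];
		int target, distance;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%d/distance", nid);
		f = fopen(path, "r");
		if (!f)
			break;	/* assume no more nodes */
		for (target = 0; fscanf(f, "%d", &distance) == 1; target++)
			if (distance > RECLAIM_DISTANCE)
				printf("node %d -> node %d: distance %d\n",
				       nid, target, distance);
		fclose(f);
	}
	return 0;
}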

Given that, I eventually want to remove RECLAIM_DISTANCE entirely and 
measure the actual latency of a memory access to remote zones after the 
zonelists are initially built, using that as the criterion to set bits in 
the new reclaim_nodes nodemask.

That's the long-term goal.
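
As a crude userspace approximation of that idea, the relative cost of 
remote access can be sampled with libnuma (link with -lnuma).  This is 
only a sketch: the pages are faulted in before timing, and the sequential 
stride means hardware prefetch makes the result closer to a bandwidth 
ratio than a pure latency ratio.

#include <numa.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define BUF_SIZE	(64UL << 20)	/* larger than the last-level cache */
#define STRIDE		64		/* one access per cache line */

static uint64_t time_node_access(int node)
{
	volatile char *buf = numa_alloc_onnode(BUF_SIZE, node);
	struct timespec start, end;
	unsigned long i;
	uint64_t ns;

	if (!buf)
		return 0;
	/* fault the pages in so the timed pass measures access cost only */
	for (i = 0; i < BUF_SIZE; i += STRIDE)
		buf[i] = 0;
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < BUF_SIZE; i += STRIDE)
		buf[i]++;
	clock_gettime(CLOCK_MONOTONIC, &end);
	ns = (end.tv_sec - start.tv_sec) * 1000000000ULL +
	     (end.tv_nsec - start.tv_nsec);
	numa_free((void *)buf, BUF_SIZE);
	return ns;
}

int main(void)
{
	uint64_t local_ns;
	int node;

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support\n");
		return 1;
	}
	numa_run_on_node(0);	/* measure from node 0's point of view */
	local_ns = time_node_access(0);
	for (node = 0; node <= numa_max_node(); node++) {
		uint64_t ns = time_node_access(node);

		printf("node 0 -> node %d: %llu ns (%.2fx local)\n",
		       node, (unsigned long long)ns,
		       local_ns ? (double)ns / local_ns : 0.0);
	}
	return 0;
}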

We do currently have memory hotplug issues with zone_reclaim_mode, 
though, independent of this patch.  If it was set at boot and we unplug 
the last node on the system with a distance > RECLAIM_DISTANCE, then it 
remains set, so we're still always reclaiming when we could just allocate 
remotely.  We can't simply clear it before rebuilding the zonelists after 
a hotplug event, because it may never have been set at boot and was 
instead set by the user via the tunable.

Once I've removed RECLAIM_DISTANCE, I think we can leave 
zone_reclaim_mode entirely to the user and just use the new reclaim_nodes 
mask to determine when to reclaim locally vs. allocate remotely.


* Re: [patch] mm, numa: reclaim from all nodes within reclaim distance
  2012-09-18  7:03 [patch] mm, numa: reclaim from all nodes within reclaim distance David Rientjes
  2012-09-18 21:03 ` Andrew Morton
@ 2012-09-19 23:46 ` Andrew Morton
  2012-09-26  3:50   ` [patch -mm] mm, numa: reclaim from all nodes within reclaim distance fix fix David Rientjes
  1 sibling, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2012-09-19 23:46 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Minchan Kim, KAMEZAWA Hiroyuki, linux-kernel, linux-mm

On Tue, 18 Sep 2012 00:03:57 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> RECLAIM_DISTANCE represents the distance between nodes at which it is
> deemed too costly to allocate from; it's preferred to try to reclaim from
> a local zone before falling back to allocating on a remote node with such
> a distance.
> 
> To do this, zone_reclaim_mode is set if the distance between any two
> nodes on the system is greater than this distance.  This, however, ends
> up causing the page allocator to reclaim from every zone regardless of
> its affinity.
> 
> What we really want is to reclaim only from zones that are closer than
> RECLAIM_DISTANCE.  This patch adds a nodemask to each node that
> represents the set of nodes that are within this distance.  During the
> zone iteration, if the local node's bit is set in the reclaim_nodes mask
> of the zone's node, then reclaim is attempted; otherwise, the zone is
> skipped.

zone_reclaim_mode isn't an lval if CONFIG_NUMA=n:

--- a/mm/page_alloc.c~mm-numa-reclaim-from-all-nodes-within-reclaim-distance-fix
+++ a/mm/page_alloc.c
@@ -4561,7 +4561,9 @@ void __paginginit free_area_init_node(in
 	for_each_online_node(i)
 		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
 			node_set(i, pgdat->reclaim_nodes);
+#ifdef CONFIG_NUMA
 			zone_reclaim_mode = 1;
+#endif
 		}
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 

That may not be a very good fix though - can we get all this NUMAy code
out of a non-NUMA-specific code site?



* [patch -mm] mm, numa: reclaim from all nodes within reclaim distance fix fix
  2012-09-19 23:46 ` Andrew Morton
@ 2012-09-26  3:50   ` David Rientjes
  0 siblings, 0 replies; 5+ messages in thread
From: David Rientjes @ 2012-09-26  3:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Minchan Kim, KAMEZAWA Hiroyuki, linux-kernel, linux-mm

It's cleaner if the iteration is explicitly done only for NUMA kernels.  
No functional change.

Intended to be folded into 
mm-numa-reclaim-from-all-nodes-within-reclaim-distance.patch already in 
-mm.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c |   24 ++++++++++++++++--------
 1 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1802,6 +1802,17 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
 }
 
+static void __paginginit init_zone_allows_reclaim(int nid)
+{
+	int i;
+
+	for_each_online_node(i)
+		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
+			node_set(i, NODE_DATA(nid)->reclaim_nodes);
+			zone_reclaim_mode = 1;
+		}
+}
+
 #else	/* CONFIG_NUMA */
 
 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1827,6 +1838,10 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return true;
 }
+
+static inline void init_zone_allows_reclaim(int nid)
+{
+}
 #endif	/* CONFIG_NUMA */
 
 /*
@@ -4551,20 +4566,13 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
-	int i;
 
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
 
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
-	for_each_online_node(i)
-		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
-			node_set(i, pgdat->reclaim_nodes);
-#ifdef CONFIG_NUMA
-			zone_reclaim_mode = 1;
-#endif
-		}
+	init_zone_allows_reclaim(nid);
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
