linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
@ 2006-01-13 15:50 Mel Gorman
  2006-01-13 18:13 ` Mika Penttilä
  2006-01-13 20:16 ` Andrew Morton
  0 siblings, 2 replies; 9+ messages in thread
From: Mel Gorman @ 2006-01-13 15:50 UTC (permalink / raw)
  To: akpm; +Cc: lhms-devel, linux-mm, linux-kernel

Hi Andrew,

This patch is divided into two parts and addresses a bug in how zone
fallback lists are calculated and how __GFP_* zone modifiers are mapped to
their equivilant ZONE_* type. It applies to 2.6.15-mm3 and has been tested
on x86 and ppc64. It has been reported by Yasunori Goto that it boots on
ia64. Details as follows;

build_zonelists() attempts to be smart, and uses highest_zone() so that it
doesn't attempt to call build_zonelists_node() for empty zones.  However,
build_zonelists_node() is smart enough to do the right thing by itself and
build_zonelists() already has the zone index that highest_zone() is meant
to provide. So, remove the unnecessary function highest_zone().

The helper function gfp_zone() assumes that the bits used in the zone modifier
of a GFP flag maps directory on to their ZONE_* equivalent and just applies a
mask. However, the bits do not map directly and the wrong fallback lists can
be used. If unluckly, the system can go OOM when plenty of suitable memory
is available. This patch redefines the __GFP_ zone modifier flags to allow
a simple mapping to their equivilant ZONE_ type.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.15-mm3-clean/include/linux/gfp.h linux-2.6.15-mm3-001_fallbacks/include/linux/gfp.h
--- linux-2.6.15-mm3-clean/include/linux/gfp.h	2006-01-11 14:24:18.000000000 +0000
+++ linux-2.6.15-mm3-001_fallbacks/include/linux/gfp.h	2006-01-13 09:18:09.000000000 +0000
@@ -11,15 +11,19 @@ struct vm_area_struct;
 /*
  * GFP bitmasks..
  */
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
-#define __GFP_DMA	((__force gfp_t)0x01u)
-#define __GFP_HIGHMEM	((__force gfp_t)0x02u)
+/*
+ * Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) 
+ * The values here are mapped to their equivilant ZONE_* type by gfp_zone
+ * which makes assumptions on the value of these bits.
+ */
+#define __GFP_DMA	((__force gfp_t)0x02u)
+#define __GFP_HIGHMEM	((__force gfp_t)0x01u)
 #ifdef CONFIG_DMA_IS_DMA32
-#define __GFP_DMA32	((__force gfp_t)0x01)	/* ZONE_DMA is ZONE_DMA32 */
+#define __GFP_DMA32	((__force gfp_t)0x02u)	/* ZONE_DMA is ZONE_DMA32 */
 #elif BITS_PER_LONG < 64
-#define __GFP_DMA32	((__force gfp_t)0x00)	/* ZONE_NORMAL is ZONE_DMA32 */
+#define __GFP_DMA32	((__force gfp_t)0x00u)	/* ZONE_NORMAL is ZONE_DMA32 */
 #else
-#define __GFP_DMA32	((__force gfp_t)0x04)	/* Has own ZONE_DMA32 */
+#define __GFP_DMA32	((__force gfp_t)0x03u)	/* Has own ZONE_DMA32 */
 #endif
 
 /*
@@ -75,10 +79,15 @@ struct vm_area_struct;
 #define GFP_DMA32	__GFP_DMA32
 
 
+/**
+ * GFP zone modifiers need to be mapped to their equivilant ZONE_ value defined
+ * in linux/mmzone.h to determine what fallback list should be used for the
+ * allocation.
+ */
 static inline int gfp_zone(gfp_t gfp)
 {
-	int zone = GFP_ZONEMASK & (__force int) gfp;
-	BUG_ON(zone >= GFP_ZONETYPES);
+	int zone = ((__force int) gfp & GFP_ZONEMASK) ^ 0x2;
+	BUG_ON(zone >= MAX_NR_ZONES);
 	return zone;
 }
 
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.15-mm3-clean/mm/page_alloc.c linux-2.6.15-mm3-001_fallbacks/mm/page_alloc.c
--- linux-2.6.15-mm3-clean/mm/page_alloc.c	2006-01-11 14:24:18.000000000 +0000
+++ linux-2.6.15-mm3-001_fallbacks/mm/page_alloc.c	2006-01-13 09:10:17.000000000 +0000
@@ -1577,18 +1577,6 @@ static int __init build_zonelists_node(p
 	return nr_zones;
 }
 
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
 static int __initdata node_load[MAX_NUMNODES];
@@ -1690,13 +1678,11 @@ static void __init build_zonelists(pg_da
 			node_load[node] += load;
 		prev_node = node;
 		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
+		for (i = 0; i < MAX_NR_ZONES; i++) {
 			zonelist = pgdat->node_zonelists + i;
 			for (j = 0; zonelist->zones[j] != NULL; j++);
 
-			k = highest_zone(i);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 			zonelist->zones[j] = NULL;
 		}
 	}
@@ -1706,17 +1692,16 @@ static void __init build_zonelists(pg_da
 
 static void __init build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, k, node, local_node;
+	int i, j, node, local_node;
 
 	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
+	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zonelist *zonelist;
 
 		zonelist = pgdat->node_zonelists + i;
 
 		j = 0;
-		k = highest_zone(i);
- 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		j = build_zonelists_node(pgdat, zonelist, j, i);
  		/*
  		 * Now we build the zonelist so that it contains the zones
  		 * of all the other nodes.
@@ -1728,12 +1713,12 @@ static void __init build_zonelists(pg_da
 		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 			if (!node_online(node))
 				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 		}
 		for (node = 0; node < local_node; node++) {
 			if (!node_online(node))
 				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 		}
 
 		zonelist->zones[j] = NULL;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
  2006-01-13 15:50 [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists Mel Gorman
@ 2006-01-13 18:13 ` Mika Penttilä
  2006-01-15 14:00   ` Mel Gorman
  2006-01-13 20:16 ` Andrew Morton
  1 sibling, 1 reply; 9+ messages in thread
From: Mika Penttilä @ 2006-01-13 18:13 UTC (permalink / raw)
  To: Mel Gorman; +Cc: akpm, lhms-devel, linux-mm, linux-kernel

Mel Gorman wrote:

>Hi Andrew,
>
>This patch is divided into two parts and addresses a bug in how zone
>fallback lists are calculated and how __GFP_* zone modifiers are mapped to
>their equivilant ZONE_* type. It applies to 2.6.15-mm3 and has been tested
>on x86 and ppc64. It has been reported by Yasunori Goto that it boots on
>ia64. Details as follows;
>
>build_zonelists() attempts to be smart, and uses highest_zone() so that it
>doesn't attempt to call build_zonelists_node() for empty zones.  However,
>build_zonelists_node() is smart enough to do the right thing by itself and
>build_zonelists() already has the zone index that highest_zone() is meant
>to provide. So, remove the unnecessary function highest_zone().
>
>The helper function gfp_zone() assumes that the bits used in the zone modifier
>of a GFP flag maps directory on to their ZONE_* equivalent and just applies a
>mask. However, the bits do not map directly and the wrong fallback lists can
>be used. If unluckly, the system can go OOM when plenty of suitable memory
>is available. This patch redefines the __GFP_ zone modifier flags to allow
>a simple mapping to their equivilant ZONE_ type.
>
>  
>
What's the exact failure case? Afaik, we loop though all the 
GFP_ZONETYPES, building the appropriate zone lists at 0 - 
GFP_ZONETYPES-1 indexes. So the direct GFP -> ZONE mapping should do the 
right thing.

--Mika


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
  2006-01-13 15:50 [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists Mel Gorman
  2006-01-13 18:13 ` Mika Penttilä
@ 2006-01-13 20:16 ` Andrew Morton
  2006-01-14  2:24   ` Dave Hansen
  2006-01-16 15:57   ` [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists Andy Whitcroft
  1 sibling, 2 replies; 9+ messages in thread
From: Andrew Morton @ 2006-01-13 20:16 UTC (permalink / raw)
  To: Mel Gorman
  Cc: lhms-devel, linux-mm, linux-kernel, Dave Hansen, Andy Whitcroft,
	Linus Torvalds

mel@csn.ul.ie (Mel Gorman) wrote:
>
> Hi Andrew,
> 
> This patch is divided into two parts and addresses a bug in how zone
> fallback lists are calculated and how __GFP_* zone modifiers are mapped to
> their equivilant ZONE_* type. It applies to 2.6.15-mm3 and has been tested
> on x86 and ppc64. It has been reported by Yasunori Goto that it boots on
> ia64. Details as follows;

This stuff is nasty.  It'd be nice to split the patch into two (always)
(please).

> build_zonelists() attempts to be smart, and uses highest_zone() so that it
> doesn't attempt to call build_zonelists_node() for empty zones.  However,
> build_zonelists_node() is smart enough to do the right thing by itself and
> build_zonelists() already has the zone index that highest_zone() is meant
> to provide. So, remove the unnecessary function highest_zone().

Dave, Andy: could you please have a think about the fallback list thing?

> The helper function gfp_zone() assumes that the bits used in the zone modifier
> of a GFP flag maps directory on to their ZONE_* equivalent and just applies a
> mask. However, the bits do not map directly and the wrong fallback lists can
> be used. If unluckly, the system can go OOM when plenty of suitable memory
> is available. This patch redefines the __GFP_ zone modifier flags to allow
> a simple mapping to their equivilant ZONE_ type.

Linus, you had a look at this a few weeks ago and I think you had
objections to the gfp-modifiers-are-bitmasks approach?

> Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.15-mm3-clean/include/linux/gfp.h linux-2.6.15-mm3-001_fallbacks/include/linux/gfp.h
> --- linux-2.6.15-mm3-clean/include/linux/gfp.h	2006-01-11 14:24:18.000000000 +0000
> +++ linux-2.6.15-mm3-001_fallbacks/include/linux/gfp.h	2006-01-13 09:18:09.000000000 +0000
> @@ -11,15 +11,19 @@ struct vm_area_struct;
>  /*
>   * GFP bitmasks..
>   */
> -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
> -#define __GFP_DMA	((__force gfp_t)0x01u)
> -#define __GFP_HIGHMEM	((__force gfp_t)0x02u)
> +/*
> + * Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) 
> + * The values here are mapped to their equivilant ZONE_* type by gfp_zone
> + * which makes assumptions on the value of these bits.
> + */
> +#define __GFP_DMA	((__force gfp_t)0x02u)
> +#define __GFP_HIGHMEM	((__force gfp_t)0x01u)
>  #ifdef CONFIG_DMA_IS_DMA32
> -#define __GFP_DMA32	((__force gfp_t)0x01)	/* ZONE_DMA is ZONE_DMA32 */
> +#define __GFP_DMA32	((__force gfp_t)0x02u)	/* ZONE_DMA is ZONE_DMA32 */
>  #elif BITS_PER_LONG < 64
> -#define __GFP_DMA32	((__force gfp_t)0x00)	/* ZONE_NORMAL is ZONE_DMA32 */
> +#define __GFP_DMA32	((__force gfp_t)0x00u)	/* ZONE_NORMAL is ZONE_DMA32 */
>  #else
> -#define __GFP_DMA32	((__force gfp_t)0x04)	/* Has own ZONE_DMA32 */
> +#define __GFP_DMA32	((__force gfp_t)0x03u)	/* Has own ZONE_DMA32 */
>  #endif
>  
>  /*
> @@ -75,10 +79,15 @@ struct vm_area_struct;
>  #define GFP_DMA32	__GFP_DMA32
>  
>  
> +/**
> + * GFP zone modifiers need to be mapped to their equivilant ZONE_ value defined
> + * in linux/mmzone.h to determine what fallback list should be used for the
> + * allocation.
> + */
>  static inline int gfp_zone(gfp_t gfp)
>  {
> -	int zone = GFP_ZONEMASK & (__force int) gfp;
> -	BUG_ON(zone >= GFP_ZONETYPES);
> +	int zone = ((__force int) gfp & GFP_ZONEMASK) ^ 0x2;
> +	BUG_ON(zone >= MAX_NR_ZONES);
>  	return zone;
>  }
>  
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.15-mm3-clean/mm/page_alloc.c linux-2.6.15-mm3-001_fallbacks/mm/page_alloc.c
> --- linux-2.6.15-mm3-clean/mm/page_alloc.c	2006-01-11 14:24:18.000000000 +0000
> +++ linux-2.6.15-mm3-001_fallbacks/mm/page_alloc.c	2006-01-13 09:10:17.000000000 +0000
> @@ -1577,18 +1577,6 @@ static int __init build_zonelists_node(p
>  	return nr_zones;
>  }
>  
> -static inline int highest_zone(int zone_bits)
> -{
> -	int res = ZONE_NORMAL;
> -	if (zone_bits & (__force int)__GFP_HIGHMEM)
> -		res = ZONE_HIGHMEM;
> -	if (zone_bits & (__force int)__GFP_DMA32)
> -		res = ZONE_DMA32;
> -	if (zone_bits & (__force int)__GFP_DMA)
> -		res = ZONE_DMA;
> -	return res;
> -}
> -
>  #ifdef CONFIG_NUMA
>  #define MAX_NODE_LOAD (num_online_nodes())
>  static int __initdata node_load[MAX_NUMNODES];
> @@ -1690,13 +1678,11 @@ static void __init build_zonelists(pg_da
>  			node_load[node] += load;
>  		prev_node = node;
>  		load--;
> -		for (i = 0; i < GFP_ZONETYPES; i++) {
> +		for (i = 0; i < MAX_NR_ZONES; i++) {
>  			zonelist = pgdat->node_zonelists + i;
>  			for (j = 0; zonelist->zones[j] != NULL; j++);
>  
> -			k = highest_zone(i);
> -
> -	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> +	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
>  			zonelist->zones[j] = NULL;
>  		}
>  	}
> @@ -1706,17 +1692,16 @@ static void __init build_zonelists(pg_da
>  
>  static void __init build_zonelists(pg_data_t *pgdat)
>  {
> -	int i, j, k, node, local_node;
> +	int i, j, node, local_node;
>  
>  	local_node = pgdat->node_id;
> -	for (i = 0; i < GFP_ZONETYPES; i++) {
> +	for (i = 0; i < MAX_NR_ZONES; i++) {
>  		struct zonelist *zonelist;
>  
>  		zonelist = pgdat->node_zonelists + i;
>  
>  		j = 0;
> -		k = highest_zone(i);
> - 		j = build_zonelists_node(pgdat, zonelist, j, k);
> + 		j = build_zonelists_node(pgdat, zonelist, j, i);
>   		/*
>   		 * Now we build the zonelist so that it contains the zones
>   		 * of all the other nodes.
> @@ -1728,12 +1713,12 @@ static void __init build_zonelists(pg_da
>  		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
>  			if (!node_online(node))
>  				continue;
> -			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> +			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
>  		}
>  		for (node = 0; node < local_node; node++) {
>  			if (!node_online(node))
>  				continue;
> -			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> +			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
>  		}
>  
>  		zonelist->zones[j] = NULL;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
  2006-01-13 20:16 ` Andrew Morton
@ 2006-01-14  2:24   ` Dave Hansen
  2006-01-14 18:29     ` Mel Gorman
                       ` (2 more replies)
  2006-01-16 15:57   ` [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists Andy Whitcroft
  1 sibling, 3 replies; 9+ messages in thread
From: Dave Hansen @ 2006-01-14  2:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, lhms-devel, linux-mm, linux-kernel, Andy Whitcroft,
	Linus Torvalds

On Fri, 2006-01-13 at 12:16 -0800, Andrew Morton wrote:
> mel@csn.ul.ie (Mel Gorman) wrote:
> > build_zonelists() attempts to be smart, and uses highest_zone() so that it
> > doesn't attempt to call build_zonelists_node() for empty zones.  However,
> > build_zonelists_node() is smart enough to do the right thing by itself and
> > build_zonelists() already has the zone index that highest_zone() is meant
> > to provide. So, remove the unnecessary function highest_zone().
> 
> Dave, Andy: could you please have a think about the fallback list thing?

It's bogus.  Mel, I didn't take a close enough look when we were talking
about it earlier, and I fear I led you astray.  I misunderstood what it
was trying to do, and though that the zone_populated() check replaced
the highest_zone() check, when they actually do completely different
things.

highest_zone(zone_nr) actually means, given these "zone_bits" (which is
actually a set of __GFP_XXXX flags), what is the highest zone number
that we could possibly use to satisfy an allocation with those __GFP
flags.

We can't just get rid of it.  If we do, we might put a highmem zone in
the fallback list for a normal zone.  Badness.

So, Mel, I have couple of patches that I put together that the two
copies of build_zonelists(), and move some of build_zonelists() itself
down into build_zonelists_node(), including the highest_zone() call.
They're no good to you by themselves.  But, I think we can make a little
function to go into the loop in build_zonelists_node().  The new
function would ask, "can this zone be used to satisfy this GFP mask?"
We'd start the loop at the absolutely highest-numbered zone.  I think
that's a decently clean way to do what you want with the reclaim zone.  

In the process of investigating this, I noticed that Andy's nice
calculation and comment for GFP_ZONETYPES went away.  Might be nice to
put it back, just so we know how '5' got there:

http://www.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=ac3461ad632e86e7debd871776683c05ef3ba4c6

Mel, you might also want to take a look at what Linus is suggesting
there.

-- Dave


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
  2006-01-14  2:24   ` Dave Hansen
@ 2006-01-14 18:29     ` Mel Gorman
  2006-01-19 21:52     ` [PATCH 1/2] GFP_ZONETYPES add commentry on how to calculate Andy Whitcroft
  2006-01-19 21:52     ` [PATCH 2/2] GFP_ZONETYPES calculate from GFP_ZONEMASK Andy Whitcroft
  2 siblings, 0 replies; 9+ messages in thread
From: Mel Gorman @ 2006-01-14 18:29 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andrew Morton, lhms-devel, linux-mm, linux-kernel,
	Andy Whitcroft, Linus Torvalds

On Fri, 13 Jan 2006, Dave Hansen wrote:

> On Fri, 2006-01-13 at 12:16 -0800, Andrew Morton wrote:
> > mel@csn.ul.ie (Mel Gorman) wrote:
> > > build_zonelists() attempts to be smart, and uses highest_zone() so that it
> > > doesn't attempt to call build_zonelists_node() for empty zones.  However,
> > > build_zonelists_node() is smart enough to do the right thing by itself and
> > > build_zonelists() already has the zone index that highest_zone() is meant
> > > to provide. So, remove the unnecessary function highest_zone().
> >
> > Dave, Andy: could you please have a think about the fallback list thing?
>
> It's bogus.  Mel, I didn't take a close enough look when we were talking
> about it earlier, and I fear I led you astray.  I misunderstood what it
> was trying to do, and though that the zone_populated() check replaced
> the highest_zone() check, when they actually do completely different
> things.
>
> highest_zone(zone_nr) actually means, given these "zone_bits" (which is
> actually a set of __GFP_XXXX flags), what is the highest zone number
> that we could possibly use to satisfy an allocation with those __GFP
> flags.
>

True, but what is passed in is not "zone_bits", but a ZONE_something type.
Given the bits __GFP_HIGHMEM, it would return ZONE_HIGHMEM. What actually
happens is highest_zone(ZONE_HIGHMEM) gets called and returns ZONE_DMA.
The fallback lists then look like;

[17179569.184000] Dumping pgdat fallback lists
[17179569.184000] 0   Normal      DMA
[17179569.184000] 1      DMA
[17179569.184000] 2  HighMem   Normal      DMA
[17179569.184000] 3      DMA

This currently happens to work. It goes wrong when an additional zone is
created unless more bits are consumed for the zone modifiers than is
really required.

The additional zone is for a zone-based approach to anti-fragmentation.

> We can't just get rid of it.  If we do, we might put a highmem zone in
> the fallback list for a normal zone.  Badness.
>

That would be obviously broken, but I am having trouble seeing how it
could happen as with this patch, we start with the highest possible zone
that can be used and count back.

> So, Mel, I have couple of patches that I put together that the two
> copies of build_zonelists(), and move some of build_zonelists() itself
> down into build_zonelists_node(), including the highest_zone() call.
> They're no good to you by themselves.  But, I think we can make a little
> function to go into the loop in build_zonelists_node().  The new
> function would ask, "can this zone be used to satisfy this GFP mask?"
> We'd start the loop at the absolutely highest-numbered zone.  I think
> that's a decently clean way to do what you want with the reclaim zone.
>

Ok.

> In the process of investigating this, I noticed that Andy's nice
> calculation and comment for GFP_ZONETYPES went away.  Might be nice to
> put it back, just so we know how '5' got there:
>
> http://www.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=ac3461ad632e86e7debd871776683c05ef3ba4c6
>
> Mel, you might also want to take a look at what Linus is suggesting
> there.
>
> -- Dave
>

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
  2006-01-13 18:13 ` Mika Penttilä
@ 2006-01-15 14:00   ` Mel Gorman
  0 siblings, 0 replies; 9+ messages in thread
From: Mel Gorman @ 2006-01-15 14:00 UTC (permalink / raw)
  To: Mika Penttilä; +Cc: akpm, lhms-devel, linux-mm, linux-kernel

On Fri, 13 Jan 2006, Mika Penttilä wrote:

> Mel Gorman wrote:
>
> > Hi Andrew,
> >
> > This patch is divided into two parts and addresses a bug in how zone
> > fallback lists are calculated and how __GFP_* zone modifiers are mapped to
> > their equivilant ZONE_* type. It applies to 2.6.15-mm3 and has been tested
> > on x86 and ppc64. It has been reported by Yasunori Goto that it boots on
> > ia64. Details as follows;
> >
> > build_zonelists() attempts to be smart, and uses highest_zone() so that it
> > doesn't attempt to call build_zonelists_node() for empty zones.  However,
> > build_zonelists_node() is smart enough to do the right thing by itself and
> > build_zonelists() already has the zone index that highest_zone() is meant
> > to provide. So, remove the unnecessary function highest_zone().
> >
> > The helper function gfp_zone() assumes that the bits used in the zone
> > modifier
> > of a GFP flag maps directory on to their ZONE_* equivalent and just applies
> > a
> > mask. However, the bits do not map directly and the wrong fallback lists can
> > be used. If unluckly, the system can go OOM when plenty of suitable memory
> > is available. This patch redefines the __GFP_ zone modifier flags to allow
> > a simple mapping to their equivilant ZONE_ type.
> >
> >
> What's the exact failure case? Afaik, we loop though all the GFP_ZONETYPES,
> building the appropriate zone lists at 0 - GFP_ZONETYPES-1 indexes. So the
> direct GFP -> ZONE mapping should do the right thing.
>

It goes wrong if one tries to add a different zone. What is currently
there happens to work, but there seemed to be confusion of when __GFP_
bits were being used or when ZONE_ indices were used.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists
  2006-01-13 20:16 ` Andrew Morton
  2006-01-14  2:24   ` Dave Hansen
@ 2006-01-16 15:57   ` Andy Whitcroft
  1 sibling, 0 replies; 9+ messages in thread
From: Andy Whitcroft @ 2006-01-16 15:57 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, lhms-devel, linux-mm, linux-kernel, Dave Hansen,
	Linus Torvalds

I think we need to be careful here. Although the __GFP_* modifiers
appear to be directly convertable to ZONE_* types they don't have
to be.  We could potentially have a new modifier which would want
to specify a different list combination whilst not representing
a zone in and of itself; for example __GFP_NODEONLY which might
request use of zones which are NUMA node local.  The bits covered
by GFP_ZONEMASK represent 'zone modifier space', those GFP bits
which affect where we should try and get memory. The zonelists
correspond to the lists of zones to try for that combination in
'zone modifier space' not for a specific zone.

Right now there is a near one-to-one correspondance between
the __GFP_x and ZONE_x identifiers. As more zones are added we
exponentially waste more and more 'zone modifier space' to allow
for the possible combinations. If we are willing and able to assert
that only one memory zone related modifier is valid at once we
could deliberatly squash the zone number into the bottom corner of
'zone modifier space' whilst still maintaining that space and the
ability to allow new bits to be combined with it.

My feeling is that as long as we don't lose the ability to have
modifiers combine and select separate lists and there is currently
no use of combined zone modifiers then we can make this optimisation.

Comments?

-apw


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/2] GFP_ZONETYPES add commentry on how to calculate
  2006-01-14  2:24   ` Dave Hansen
  2006-01-14 18:29     ` Mel Gorman
@ 2006-01-19 21:52     ` Andy Whitcroft
  2006-01-19 21:52     ` [PATCH 2/2] GFP_ZONETYPES calculate from GFP_ZONEMASK Andy Whitcroft
  2 siblings, 0 replies; 9+ messages in thread
From: Andy Whitcroft @ 2006-01-19 21:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Dave Hansen, Mel Gorman, lhms-devel, linux-mm, linux-kernel,
	Andy Whitcroft, Linus Torvalds

GFP_ZONETYPES define using GFP_ZONEMASK and add commentry

Add commentry explaining the optimisation that we can apply to
GFP_ZONETYPES when the lest most bit is a 'loaner', it can only be
set in isolation.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
---
 mmzone.h |    8 ++++++++
 1 file changed, 8 insertions(+)
diff -upN reference/include/linux/mmzone.h current/include/linux/mmzone.h
--- reference/include/linux/mmzone.h
+++ current/include/linux/mmzone.h
@@ -91,6 +91,14 @@ struct per_cpu_pageset {
  * be 8 (2 ** 3) zonelists.  GFP_ZONETYPES defines the number of possible
  * combinations of zone modifiers in "zone modifier space".
  *
+ * As an optimisation any zone modifier bits which are only valid when
+ * no other zone modifier bits are set (loners) should be placed in
+ * the highest order bits of this field.  This allows us to reduce the
+ * extent of the zonelists thus saving space.  For example in the case
+ * of three zone modifier bits, we could require up to eight zonelists.
+ * If the left most zone modifier is a "loner" then the highest valid
+ * zonelist would be four allowing us to allocate only five zonelists.
+ *
  * NOTE! Make sure this matches the zones in <linux/gfp.h>
  */
 #define GFP_ZONEMASK	0x07

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/2] GFP_ZONETYPES calculate from GFP_ZONEMASK
  2006-01-14  2:24   ` Dave Hansen
  2006-01-14 18:29     ` Mel Gorman
  2006-01-19 21:52     ` [PATCH 1/2] GFP_ZONETYPES add commentry on how to calculate Andy Whitcroft
@ 2006-01-19 21:52     ` Andy Whitcroft
  2 siblings, 0 replies; 9+ messages in thread
From: Andy Whitcroft @ 2006-01-19 21:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Dave Hansen, Mel Gorman, lhms-devel, linux-mm, linux-kernel,
	Andy Whitcroft, Linus Torvalds

GFP_ZONETYPES calculate from GFP_ZONEMASK

GFP_ZONETYPES's value is directly related to the value of GFP_ZONEMASK. 
It takes one of two forms depending whether the top bit of GFP_ZONEMASK
is a 'loner'.  Supply both forms, enabling the loner.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
---
 mmzone.h |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff -upN reference/include/linux/mmzone.h current/include/linux/mmzone.h
--- reference/include/linux/mmzone.h
+++ current/include/linux/mmzone.h
@@ -98,11 +98,14 @@ struct per_cpu_pageset {
  * of three zone modifier bits, we could require up to eight zonelists.
  * If the left most zone modifier is a "loner" then the highest valid
  * zonelist would be four allowing us to allocate only five zonelists.
+ * Use the first form for GFP_ZONETYPES when the left most bit is not
+ * a "loner", otherwise use the second.
  *
  * NOTE! Make sure this matches the zones in <linux/gfp.h>
  */
 #define GFP_ZONEMASK	0x07
-#define GFP_ZONETYPES	5
+/* #define GFP_ZONETYPES       (GFP_ZONEMASK + 1) */           /* Non-loner */
+#define GFP_ZONETYPES  ((GFP_ZONEMASK + 1) / 2 + 1)            /* Loner */
 
 /*
  * On machines where it is needed (eg PCs) we divide physical memory

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2006-01-19 21:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-01-13 15:50 [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists Mel Gorman
2006-01-13 18:13 ` Mika Penttilä
2006-01-15 14:00   ` Mel Gorman
2006-01-13 20:16 ` Andrew Morton
2006-01-14  2:24   ` Dave Hansen
2006-01-14 18:29     ` Mel Gorman
2006-01-19 21:52     ` [PATCH 1/2] GFP_ZONETYPES add commentry on how to calculate Andy Whitcroft
2006-01-19 21:52     ` [PATCH 2/2] GFP_ZONETYPES calculate from GFP_ZONEMASK Andy Whitcroft
2006-01-16 15:57   ` [PATCH] BUG: gfp_zone() not mapping zone modifiers correctly and bad ordering of fallback lists Andy Whitcroft

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).