linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* VM fixes [2/4]
@ 2004-12-24 17:35 Andrea Arcangeli
  2004-12-30 21:12 ` Nick Piggin
  0 siblings, 1 reply; 5+ messages in thread
From: Andrea Arcangeli @ 2004-12-24 17:35 UTC (permalink / raw)
  To: linux-kernel; +Cc: Thomas Gleixner, Andrew Morton

This is the forward port to 2.6 of the lowmem_reserved algorithm I
invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
like google (especially without swap) on x86 with >1G of ram, but it's
needed in all sort of workloads with lots of ram on x86, it's also
needed on x86-64 for dma allocations. This brings 2.6 in sync with
latest 2.4.2x.

From: Andrea Arcangeli <andrea@suse.de>
Subject: keep balance between different classzones

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- x/include/linux/mmzone.h.orig	2004-12-04 08:56:32.000000000 +0100
+++ x/include/linux/mmzone.h	2004-12-24 17:59:13.864424040 +0100
@@ -112,18 +112,14 @@ struct zone {
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
 	/*
-	 * protection[] is a pre-calculated number of extra pages that must be
-	 * available in a zone in order for __alloc_pages() to allocate memory
-	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
-	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
-	 * for us to choose to allocate the page from that zone.
-	 *
-	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
-	 * The protection values are recalculated if either of these values
-	 * change.  The array elements are in zonelist order:
-	 *	[0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 * We don't know if the memory that we're going to allocate will be freeable
+	 * and/or it will be released eventually, so to avoid totally wasting several
+	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+	 * running OOM on the lower zones even though there's tons of freeable ram
+	 * on the higher zones). This array is recalculated at runtime if the
+	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long		protection[MAX_NR_ZONES];
+	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
 	struct per_cpu_pageset	pageset[NR_CPUS];
 
@@ -366,7 +362,8 @@ struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
 					void __user *, size_t *, loff_t *);
-int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
--- x/include/linux/sysctl.h.orig	2004-12-04 08:56:32.000000000 +0100
+++ x/include/linux/sysctl.h	2004-12-24 17:59:13.865423888 +0100
@@ -159,7 +159,7 @@ enum
 	VM_PAGEBUF=17,		/* struct: Control pagebuf parameters */
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
-	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
 	VM_LAPTOP_MODE=23,	/* vm laptop mode */
--- x/kernel/sysctl.c.orig	2004-12-04 08:56:33.000000000 +0100
+++ x/kernel/sysctl.c	2004-12-24 17:59:13.868423432 +0100
@@ -62,7 +62,6 @@ extern int core_uses_pid;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
-extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
@@ -736,14 +735,13 @@ static ctl_table vm_table[] = {
 	 },
 #endif
 	{
-		.ctl_name	= VM_LOWER_ZONE_PROTECTION,
-		.procname	= "lower_zone_protection",
-		.data		= &sysctl_lower_zone_protection,
-		.maxlen		= sizeof(sysctl_lower_zone_protection),
+		.ctl_name	= VM_LOWMEM_RESERVE_RATIO,
+		.procname	= "lowmem_reserve_ratio",
+		.data		= &sysctl_lowmem_reserve_ratio,
+		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
 		.mode		= 0644,
-		.proc_handler	= &lower_zone_protection_sysctl_handler,
+		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
 	},
 	{
 		.ctl_name	= VM_MIN_FREE_KBYTES,
--- x/mm/page_alloc.c.orig	2004-12-04 08:56:33.000000000 +0100
+++ x/mm/page_alloc.c	2004-12-24 17:59:36.182031248 +0100
@@ -42,7 +42,15 @@ unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 long nr_swap_pages;
 int numnodes = 1;
-int sysctl_lower_zone_protection = 0;
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ *	1G machine -> (16M dma, 784M normal, 224M high)
+ *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
@@ -583,19 +591,6 @@ buffered_rmqueue(struct zone *zone, int 
 
 /*
  * This is the 'heart' of the zoned buddy allocator.
- *
- * Herein lies the mysterious "incremental min".  That's the
- *
- *	local_low = z->pages_low;
- *	min += local_low;
- *
- * thing.  The intent here is to provide additional protection to low zones for
- * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
- * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
- * request.  This preserves additional space in those lower zones for requests
- * which really do need memory from those zones.  It means that on a decent
- * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
- * zone untouched.
  */
 struct page * fastcall
 __alloc_pages(unsigned int gfp_mask, unsigned int order,
@@ -608,7 +603,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int alloc_type;
+	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
 
@@ -628,11 +623,11 @@ __alloc_pages(unsigned int gfp_mask, uns
 		return NULL;
 	}
 
-	alloc_type = zone_idx(zones[0]);
+	classzone_idx = zone_idx(zones[0]);
 
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-		min = z->pages_low + (1<<order) + z->protection[alloc_type];
+		min = z->pages_low + (1<<order) + z->lowmem_reserve[classzone_idx];
 
 		if (z->free_pages < min)
 			continue;
@@ -655,7 +650,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 			min /= 2;
 		if (can_try_harder)
 			min -= min / 4;
-		min += (1<<order) + z->protection[alloc_type];
+		min += (1<<order) + z->lowmem_reserve[classzone_idx];
 
 		if (z->free_pages < min)
 			continue;
@@ -698,7 +693,7 @@ rebalance:
 			min /= 2;
 		if (can_try_harder)
 			min -= min / 4;
-		min += (1<<order) + z->protection[alloc_type];
+		min += (1<<order) + z->lowmem_reserve[classzone_idx];
 
 		if (z->free_pages < min)
 			continue;
@@ -1117,9 +1112,9 @@ void show_free_areas(void)
 			zone->pages_scanned,
 			(zone->all_unreclaimable ? "yes" : "no")
 			);
-		printk("protections[]:");
+		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->protection[i]);
+			printk(" %lu", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
 
@@ -1816,87 +1811,29 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
-static unsigned long higherzone_val(struct zone *z, int max_zone,
-					int alloc_type)
-{
-	int z_idx = zone_idx(z);
-	struct zone *higherzone;
-	unsigned long pages;
-
-	/* there is no higher zone to get a contribution from */
-	if (z_idx == MAX_NR_ZONES-1)
-		return 0;
-
-	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
-
-	/* We always start with the higher zone's protection value */
-	pages = higherzone->protection[alloc_type];
-
-	/*
-	 * We get a lower-zone-protection contribution only if there are
-	 * pages in the higher zone and if we're not the highest zone
-	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
-	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
-	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
-	 */
-	if (higherzone->present_pages && z_idx < alloc_type)
-		pages += higherzone->pages_low * sysctl_lower_zone_protection;
-
-	return pages;
-}
-
 /*
- * setup_per_zone_protection - called whenver min_free_kbytes or
- *	sysctl_lower_zone_protection changes.  Ensures that each zone
- *	has a correct pages_protected value, so an adequate number of
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
  *	pages are left in the zone after a successful __alloc_pages().
- *
- *	This algorithm is way confusing.  I tries to keep the same behavior
- *	as we had with the incremental min iterative algorithm.
  */
-static void setup_per_zone_protection(void)
+static void setup_per_zone_lowmem_reserve(void)
 {
 	struct pglist_data *pgdat;
-	struct zone *zones, *zone;
-	int max_zone;
-	int i, j;
+	int j, idx;
 
 	for_each_pgdat(pgdat) {
-		zones = pgdat->node_zones;
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone * zone = pgdat->node_zones + j;
+			unsigned long present_pages = zone->present_pages;
 
-		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
-			if (zones[i].present_pages)
-				max_zone = i;
+			zone->lowmem_reserve[j] = 0;
 
-		/*
-		 * For each of the different allocation types:
-		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
-		 */
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			/*
-			 * For each of the zones:
-			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
-			 */
-			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
-				zone = &zones[j];
+			for (idx = j-1; idx >= 0; idx--) {
+				struct zone * lower_zone = pgdat->node_zones + idx;
 
-				/*
-				 * We never protect zones that don't have memory
-				 * in them (j>max_zone) or zones that aren't in
-				 * the zonelists for a certain type of
-				 * allocation (j>=i).  We have to assign these
-				 * to zero because the lower zones take
-				 * contributions from the higher zones.
-				 */
-				if (j > max_zone || j >= i) {
-					zone->protection[i] = 0;
-					continue;
-				}
-				/*
-				 * The contribution of the next higher zone
-				 */
-				zone->protection[i] = higherzone_val(zone,
-								max_zone, i);
+				lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx];
+				present_pages += lower_zone->present_pages;
 			}
 		}
 	}
@@ -1991,7 +1928,7 @@ static int __init init_per_zone_pages_mi
 	if (min_free_kbytes > 65536)
 		min_free_kbytes = 65536;
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
@@ -2006,20 +1943,23 @@ int min_free_kbytes_sysctl_handler(ctl_t
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
 	return 0;
 }
 
 /*
- * lower_zone_protection_sysctl_handler - just a wrapper around
- *	proc_dointvec() so that we can call setup_per_zone_protection()
- *	whenever sysctl_lower_zone_protection changes.
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * pages_min watermarks. The lowmem reserve ratio can only be meaningful
+ * in relation to the boot-time zone sizes.
  */
-int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 		 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: VM fixes [2/4]
  2004-12-24 17:35 VM fixes [2/4] Andrea Arcangeli
@ 2004-12-30 21:12 ` Nick Piggin
  2005-01-02 16:32   ` Andrea Arcangeli
  0 siblings, 1 reply; 5+ messages in thread
From: Nick Piggin @ 2004-12-30 21:12 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, Thomas Gleixner, Andrew Morton

Andrea Arcangeli wrote:
> This is the forward port to 2.6 of the lowmem_reserved algorithm I
> invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> like google (especially without swap) on x86 with >1G of ram, but it's
> needed in all sort of workloads with lots of ram on x86, it's also
> needed on x86-64 for dma allocations. This brings 2.6 in sync with
> latest 2.4.2x.
> 

This looks OK to me. It really simplifies the code there a lot too.

The only questions I have are: should it be on by default? I don't think
we ever reached an agreement. I'd say yes, after a run in -mm because it
does potentially fix corner cases where lower zones get filled with un-
freeable memory which could have been satisfied with higher zones.

And second, any chance you could port it to the mm patches already in
-mm? Won't be a big job, just some clashes in __alloc_pages...

mm-keep-count-of-free-areas.patch
mm-higher-order-watermarks.patch
mm-higher-order-watermarks-fix.patch
mm-teach-kswapd-about-higher-order-areas.patch

Thanks,
Nick

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: VM fixes [2/4]
  2004-12-30 21:12 ` Nick Piggin
@ 2005-01-02 16:32   ` Andrea Arcangeli
  2005-01-03 12:25     ` Marcelo Tosatti
  0 siblings, 1 reply; 5+ messages in thread
From: Andrea Arcangeli @ 2005-01-02 16:32 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, Thomas Gleixner, Andrew Morton

On Fri, Dec 31, 2004 at 08:12:42AM +1100, Nick Piggin wrote:
> Andrea Arcangeli wrote:
> >This is the forward port to 2.6 of the lowmem_reserved algorithm I
> >invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> >like google (especially without swap) on x86 with >1G of ram, but it's
> >needed in all sort of workloads with lots of ram on x86, it's also
> >needed on x86-64 for dma allocations. This brings 2.6 in sync with
> >latest 2.4.2x.
> >
> 
> This looks OK to me. It really simplifies the code there a lot too.
> 
> The only questions I have are: should it be on by default? I don't think
> we ever reached an agreement. I'd say yes, after a run in -mm because it
> does potentially fix corner cases where lower zones get filled with un-
> freeable memory which could have been satisfied with higher zones.

Great, thanks for the review! I definitely agree it should be on by
default, I already had a hang report that was solved by more recent
kernels and that probably can only be explained by lowmem_reserve since
there aren't other mm changes in 2.6.5 based trees. 

> And second, any chance you could you port it to the mm patches already in
> -mm? Won't be a big job, just some clashes in __alloc_pages...

I already had to port to 2.6.5 too, and that's enough for now unless I
first get a positive ack that it will be merged (if I hadn't more
interesting things to develop, I would be happily porting it).

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: VM fixes [2/4]
  2005-01-02 16:32   ` Andrea Arcangeli
@ 2005-01-03 12:25     ` Marcelo Tosatti
  2005-01-03 16:30       ` Andrea Arcangeli
  0 siblings, 1 reply; 5+ messages in thread
From: Marcelo Tosatti @ 2005-01-03 12:25 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Nick Piggin, linux-kernel, Thomas Gleixner, Andrew Morton

On Sun, Jan 02, 2005 at 05:32:36PM +0100, Andrea Arcangeli wrote:
> On Fri, Dec 31, 2004 at 08:12:42AM +1100, Nick Piggin wrote:
> > Andrea Arcangeli wrote:
> > >This is the forward port to 2.6 of the lowmem_reserved algorithm I
> > >invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> > >like google (especially without swap) on x86 with >1G of ram, but it's
> > >needed in all sort of workloads with lots of ram on x86, it's also
> > >needed on x86-64 for dma allocations. This brings 2.6 in sync with
> > >latest 2.4.2x.
> > >
> > 
> > This looks OK to me. It really simplifies the code there a lot too.
> > 
> > The only questions I have are: should it be on by default? I don't think
> > we ever reached an agreement. I'd say yes, after a run in -mm because it
> > does potentially fix corner cases where lower zones get filled with un-
> > freeable memory which could have been satisfied with higher zones.
> 
> Great, thanks for the review! I definitely agree it should be on by
> default, I already had an hang report that was solved by more recent
> kernels and that probably can only be explained by lowmem_reserve since
> there aren't other mm changes in 2.6.5 based trees. 
> 
> > And second, any chance you could you port it to the mm patches already in
> > -mm? Won't be a big job, just some clashes in __alloc_pages...
> 
> I already had to port to 2.6.5 too, and that's enough for now unless I
> first get a positive ack that it will be merged (if I hadn't more
> interesting things to develop, I would be happily porting it).

I believe it can be accepted easily if you change the variable names
from protection to lowmem_reserve.

Is there a need for that or is it just your taste? :)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: VM fixes [2/4]
  2005-01-03 12:25     ` Marcelo Tosatti
@ 2005-01-03 16:30       ` Andrea Arcangeli
  0 siblings, 0 replies; 5+ messages in thread
From: Andrea Arcangeli @ 2005-01-03 16:30 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Nick Piggin, linux-kernel, Thomas Gleixner, Andrew Morton

On Mon, Jan 03, 2005 at 10:25:18AM -0200, Marcelo Tosatti wrote:
> On Sun, Jan 02, 2005 at 05:32:36PM +0100, Andrea Arcangeli wrote:
> > On Fri, Dec 31, 2004 at 08:12:42AM +1100, Nick Piggin wrote:
> > > Andrea Arcangeli wrote:
> > > >This is the forward port to 2.6 of the lowmem_reserved algorithm I
> > > >invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> > > >like google (especially without swap) on x86 with >1G of ram, but it's
> > > >needed in all sort of workloads with lots of ram on x86, it's also
> > > >needed on x86-64 for dma allocations. This brings 2.6 in sync with
> > > >latest 2.4.2x.
> > > >
> > > 
> > > This looks OK to me. It really simplifies the code there a lot too.
> > > 
> > > The only questions I have are: should it be on by default? I don't think
> > > we ever reached an agreement. I'd say yes, after a run in -mm because it
> > > does potentially fix corner cases where lower zones get filled with un-
> > > freeable memory which could have been satisfied with higher zones.
> > 
> > Great, thanks for the review! I definitely agree it should be on by
> > default, I already had an hang report that was solved by more recent
> > kernels and that probably can only be explained by lowmem_reserve since
> > there aren't other mm changes in 2.6.5 based trees. 
> > 
> > > And second, any chance you could you port it to the mm patches already in
> > > -mm? Won't be a big job, just some clashes in __alloc_pages...
> > 
> > I already had to port to 2.6.5 too, and that's enough for now unless I
> > first get a positive ack that it will be merged (if I hadn't more
> > interesting things to develop, I would be happily porting it).
> 
> I believe it can be accepted easily if you change the variable names
> from protection to lowmem_reserve.
> 
> Is there a need for that or its just your taste? :)

The naming is in sync with 2.4, I called that feature lowmem_reserve
when I wrote it.  Protection doesn't actually mean anything. Memory
protection, mprotect, what?

The object of the feature is to reserve lower memory in function of the
classzone allocation, and in function of the zone we're allocating from.
So lowmem_reserve sounds a much better name. And it wasn't me to change
it, it was the 2.6 kernel calling it differently in the first place.
Note that at first 2.6 was doing stuff very differently from 2.4 too
(and it wasn't working right in fact). Now it's in perfect sync with the 2.4
algorithm I wrote originally and so I thought it would be much cleaner
to call it the same way as 2.4, which is more self explanatory too.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2005-01-03 16:30 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-12-24 17:35 VM fixes [2/4] Andrea Arcangeli
2004-12-30 21:12 ` Nick Piggin
2005-01-02 16:32   ` Andrea Arcangeli
2005-01-03 12:25     ` Marcelo Tosatti
2005-01-03 16:30       ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).