* [PATCH 1/9] Add __GFP_EASYRCLM flag and update callers
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
@ 2006-01-26 18:43 ` Mel Gorman
2006-01-26 18:43 ` [PATCH 2/9] Create the ZONE_EASYRCLM zone Mel Gorman
` (8 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:43 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
This creates a zone modifier __GFP_EASYRCLM and a set of GFP flags called
GFP_RCLMUSER. The only difference between GFP_HIGHUSER and GFP_RCLMUSER is the
zone that is used. Callers appropriate to use the ZONE_EASYRCLM are changed.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/fs/compat.c linux-2.6.16-rc1-mm3-101_antifrag_flags/fs/compat.c
--- linux-2.6.16-rc1-mm3-clean/fs/compat.c 2006-01-25 13:42:45.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/fs/compat.c 2006-01-26 18:08:22.000000000 +0000
@@ -1397,7 +1397,7 @@ static int compat_copy_strings(int argc,
page = bprm->page[i];
new = 0;
if (!page) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_RCLMUSER);
bprm->page[i] = page;
if (!page) {
ret = -ENOMEM;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/fs/exec.c linux-2.6.16-rc1-mm3-101_antifrag_flags/fs/exec.c
--- linux-2.6.16-rc1-mm3-clean/fs/exec.c 2006-01-25 13:42:45.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/fs/exec.c 2006-01-26 18:08:22.000000000 +0000
@@ -238,7 +238,7 @@ static int copy_strings(int argc, char _
page = bprm->page[i];
new = 0;
if (!page) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_RCLMUSER);
bprm->page[i] = page;
if (!page) {
ret = -ENOMEM;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/fs/inode.c linux-2.6.16-rc1-mm3-101_antifrag_flags/fs/inode.c
--- linux-2.6.16-rc1-mm3-clean/fs/inode.c 2006-01-25 13:42:45.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/fs/inode.c 2006-01-26 18:08:22.000000000 +0000
@@ -147,7 +147,7 @@ static struct inode *alloc_inode(struct
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+ mapping_set_gfp_mask(mapping, GFP_RCLMUSER);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/include/asm-i386/page.h linux-2.6.16-rc1-mm3-101_antifrag_flags/include/asm-i386/page.h
--- linux-2.6.16-rc1-mm3-clean/include/asm-i386/page.h 2006-01-25 13:42:45.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/include/asm-i386/page.h 2006-01-26 18:08:22.000000000 +0000
@@ -36,7 +36,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_page_vma(GFP_RCLMUSER | __GFP_ZERO, vma, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/*
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/include/linux/gfp.h linux-2.6.16-rc1-mm3-101_antifrag_flags/include/linux/gfp.h
--- linux-2.6.16-rc1-mm3-clean/include/linux/gfp.h 2006-01-17 07:44:47.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/include/linux/gfp.h 2006-01-26 18:08:22.000000000 +0000
@@ -21,6 +21,7 @@ struct vm_area_struct;
#else
#define __GFP_DMA32 ((__force gfp_t)0x04) /* Has own ZONE_DMA32 */
#endif
+#define __GFP_EASYRCLM ((__force gfp_t)0x08u)
/*
* Action modifiers - doesn't change the zoning
@@ -65,6 +66,8 @@ struct vm_area_struct;
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
__GFP_HIGHMEM)
+#define GFP_RCLMUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
+ __GFP_EASYRCLM)
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/include/linux/highmem.h linux-2.6.16-rc1-mm3-101_antifrag_flags/include/linux/highmem.h
--- linux-2.6.16-rc1-mm3-clean/include/linux/highmem.h 2006-01-17 07:44:47.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/include/linux/highmem.h 2006-01-26 18:08:22.000000000 +0000
@@ -47,7 +47,7 @@ static inline void clear_user_highpage(s
static inline struct page *
alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
{
- struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+ struct page *page = alloc_page_vma(GFP_RCLMUSER, vma, vaddr);
if (page)
clear_user_highpage(page, vaddr);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/mm/memory.c linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/memory.c
--- linux-2.6.16-rc1-mm3-clean/mm/memory.c 2006-01-25 13:42:46.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/memory.c 2006-01-26 18:08:22.000000000 +0000
@@ -1470,7 +1470,7 @@ gotten:
if (!new_page)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ new_page = alloc_page_vma(GFP_RCLMUSER, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address);
@@ -2069,7 +2069,7 @@ retry:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ page = alloc_page_vma(GFP_RCLMUSER, vma, address);
if (!page)
goto oom;
copy_user_highpage(page, new_page, address);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/mm/shmem.c linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/shmem.c
--- linux-2.6.16-rc1-mm3-clean/mm/shmem.c 2006-01-25 13:42:46.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/shmem.c 2006-01-26 18:08:22.000000000 +0000
@@ -921,6 +921,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
pvma.vm_pgoff = idx;
pvma.vm_end = PAGE_SIZE;
+ if (gfp & __GFP_HIGHMEM)
+ gfp = (gfp & ~__GFP_HIGHMEM) | __GFP_EASYRCLM;
page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
@@ -936,6 +938,8 @@ shmem_swapin(struct shmem_inode_info *in
static inline struct page *
shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
{
+ if (gfp & __GFP_HIGHMEM)
+ gfp = (gfp & ~__GFP_HIGHMEM) | __GFP_EASYRCLM;
return alloc_page(gfp | __GFP_ZERO);
}
#endif
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-clean/mm/swap_state.c linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/swap_state.c
--- linux-2.6.16-rc1-mm3-clean/mm/swap_state.c 2006-01-25 13:42:46.000000000 +0000
+++ linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/swap_state.c 2006-01-26 18:08:22.000000000 +0000
@@ -334,7 +334,7 @@ struct page *read_swap_cache_async(swp_e
* Get a new page to read into from swap.
*/
if (!new_page) {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ new_page = alloc_page_vma(GFP_RCLMUSER, vma, addr);
if (!new_page)
break; /* Out of memory */
}
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 2/9] Create the ZONE_EASYRCLM zone
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
2006-01-26 18:43 ` [PATCH 1/9] Add __GFP_EASYRCLM flag and update callers Mel Gorman
@ 2006-01-26 18:43 ` Mel Gorman
2006-01-26 18:44 ` [PATCH 3/9] x86 - Specify amount of kernel memory at boot time Mel Gorman
` (7 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:43 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
This patch adds the ZONE_EASYRCLM zone and updates relevant constants and
helper functions. After this patch is applied, memory that is hot-added on
the x86 will be placed in ZONE_EASYRCLM. Memory hot-added on the ppc64 still
goes to ZONE_DMA.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-101_antifrag_flags/include/linux/mmzone.h linux-2.6.16-rc1-mm3-102_addzone/include/linux/mmzone.h
--- linux-2.6.16-rc1-mm3-101_antifrag_flags/include/linux/mmzone.h 2006-01-25 13:42:46.000000000 +0000
+++ linux-2.6.16-rc1-mm3-102_addzone/include/linux/mmzone.h 2006-01-26 18:09:04.000000000 +0000
@@ -73,9 +73,10 @@ struct per_cpu_pageset {
#define ZONE_DMA32 1
#define ZONE_NORMAL 2
#define ZONE_HIGHMEM 3
+#define ZONE_EASYRCLM 4
-#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
+#define MAX_NR_ZONES 5 /* Sync this with ZONES_SHIFT */
+#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
/*
@@ -103,7 +104,7 @@ struct per_cpu_pageset {
*
* NOTE! Make sure this matches the zones in <linux/gfp.h>
*/
-#define GFP_ZONEMASK 0x07
+#define GFP_ZONEMASK 0x0f
/* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */
#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */
@@ -408,7 +409,7 @@ static inline int populated_zone(struct
static inline int is_highmem_idx(int idx)
{
- return (idx == ZONE_HIGHMEM);
+ return (idx == ZONE_HIGHMEM || idx == ZONE_EASYRCLM);
}
static inline int is_normal_idx(int idx)
@@ -424,7 +425,8 @@ static inline int is_normal_idx(int idx)
*/
static inline int is_highmem(struct zone *zone)
{
- return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
+ return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM ||
+ zone == zone->zone_pgdat->node_zones + ZONE_EASYRCLM;
}
static inline int is_normal(struct zone *zone)
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/page_alloc.c linux-2.6.16-rc1-mm3-102_addzone/mm/page_alloc.c
--- linux-2.6.16-rc1-mm3-101_antifrag_flags/mm/page_alloc.c 2006-01-25 13:42:46.000000000 +0000
+++ linux-2.6.16-rc1-mm3-102_addzone/mm/page_alloc.c 2006-01-26 18:09:04.000000000 +0000
@@ -66,7 +66,7 @@ int percpu_pagelist_fraction;
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32, 32 };
EXPORT_SYMBOL(totalram_pages);
@@ -77,7 +77,8 @@ EXPORT_SYMBOL(totalram_pages);
struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
EXPORT_SYMBOL(zone_table);
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal",
+ "HighMem", "EasyRclm" };
int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
@@ -753,6 +754,7 @@ static inline void prep_zero_page(struct
int i;
BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+ BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_EASYRCLM)) == __GFP_EASYRCLM);
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
@@ -1260,7 +1262,7 @@ unsigned int nr_free_buffer_pages(void)
*/
unsigned int nr_free_pagecache_pages(void)
{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+ return nr_free_zone_pages(gfp_zone(GFP_RCLMUSER));
}
#ifdef CONFIG_HIGHMEM
@@ -1270,7 +1272,7 @@ unsigned int nr_free_highpages (void)
unsigned int pages = 0;
for_each_pgdat(pgdat)
- pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ pages += pgdat->node_zones[ZONE_EASYRCLM].free_pages;
return pages;
}
@@ -1575,7 +1577,7 @@ static int __init build_zonelists_node(p
{
struct zone *zone;
- BUG_ON(zone_type > ZONE_HIGHMEM);
+ BUG_ON(zone_type > ZONE_EASYRCLM);
do {
zone = pgdat->node_zones + zone_type;
@@ -1595,6 +1597,8 @@ static int __init build_zonelists_node(p
static inline int highest_zone(int zone_bits)
{
int res = ZONE_NORMAL;
+ if (zone_bits & (__force int)__GFP_EASYRCLM)
+ res = ZONE_EASYRCLM;
if (zone_bits & (__force int)__GFP_HIGHMEM)
res = ZONE_HIGHMEM;
if (zone_bits & (__force int)__GFP_DMA32)
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 3/9] x86 - Specify amount of kernel memory at boot time
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
2006-01-26 18:43 ` [PATCH 1/9] Add __GFP_EASYRCLM flag and update callers Mel Gorman
2006-01-26 18:43 ` [PATCH 2/9] Create the ZONE_EASYRCLM zone Mel Gorman
@ 2006-01-26 18:44 ` Mel Gorman
2006-01-26 18:44 ` [PATCH 4/9] ppc64 " Mel Gorman
` (6 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:44 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
This patch was originally written by Kamezawa Hiroyuki.
It should be possible for the administrator to specify at boot-time how much
memory should be used for the kernel and how much should go to ZONE_EASYRCLM.
After this patch is applied, the boot option kernelcore= can be used to
specify how much memory should be used by the kernel.
(Note that Kamezawa called this parameter coremem= . It was renamed because
of the way ppc64 parses command line arguments: it would confuse coremem=
with mem=. A name that could be used across architectures was chosen.)
The value of kernelcore is important. If it is too small, there will be more
pressure on ZONE_NORMAL and a potential loss of performance. If it is about
896MB, it means that ZONE_HIGHMEM will have a size of zero. Any differences in
tests will depend on whether CONFIG_HIGHPTE is set in the standard kernel or
not. With lots of memory, the ideal is to specify a kernelcore that gives
ZONE_NORMAL its full size and a ZONE_HIGHMEM for PTEs. The right value
depends, like any tunable, on the workload.
It is also important to note that if kernelcore is less than the maximum
size of ZONE_NORMAL, GFP_HIGHMEM allocations will use ZONE_NORMAL, not the
reachable portion of ZONE_EASYRCLM.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-102_addzone/arch/i386/kernel/setup.c linux-2.6.16-rc1-mm3-103_x86coremem/arch/i386/kernel/setup.c
--- linux-2.6.16-rc1-mm3-102_addzone/arch/i386/kernel/setup.c 2006-01-25 13:42:41.000000000 +0000
+++ linux-2.6.16-rc1-mm3-103_x86coremem/arch/i386/kernel/setup.c 2006-01-26 18:09:48.000000000 +0000
@@ -121,6 +121,9 @@ int bootloader_type;
/* user-defined highmem size */
static unsigned int highmem_pages = -1;
+/* user-defined easy-reclaim-size */
+static unsigned int core_mem_pages = -1;
+static unsigned int easyrclm_pages = 0;
/*
* Setup options
*/
@@ -921,6 +924,15 @@ static void __init parse_cmdline_early (
*/
else if (!memcmp(from, "vmalloc=", 8))
__VMALLOC_RESERVE = memparse(from+8, &from);
+ /*
+ * kernelcore=size sets the amount of memory for use for
+ * kernel allocations that cannot be reclaimed easily.
+ * The remaining memory is set aside for easy reclaim
+ * for features like memory remove or huge page allocations
+ */
+ else if (!memcmp(from, "kernelcore=",11)) {
+ core_mem_pages = memparse(from+11, &from) >> PAGE_SHIFT;
+ }
next_char:
c = *(from++);
@@ -990,6 +1002,17 @@ void __init find_max_pfn(void)
}
}
+unsigned long __init calculate_core_memory(unsigned long max_low_pfn)
+{
+ if (max_low_pfn < core_mem_pages) {
+ highmem_pages -= (core_mem_pages - max_low_pfn);
+ } else {
+ max_low_pfn = core_mem_pages;
+ highmem_pages = 0;
+ }
+ easyrclm_pages = max_pfn - core_mem_pages;
+ return max_low_pfn;
+}
/*
* Determine low and high memory ranges:
*/
@@ -1046,6 +1069,8 @@ unsigned long __init find_max_low_pfn(vo
printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
}
+ if (core_mem_pages != -1)
+ max_low_pfn = calculate_core_memory(max_low_pfn);
return max_low_pfn;
}
@@ -1166,7 +1191,8 @@ void __init zone_sizes_init(void)
zones_size[ZONE_DMA] = max_dma;
zones_size[ZONE_NORMAL] = low - max_dma;
#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+ zones_size[ZONE_HIGHMEM] = highend_pfn - low - easyrclm_pages;
+ zones_size[ZONE_EASYRCLM] = easyrclm_pages;
#endif
}
free_area_init(zones_size);
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 4/9] ppc64 - Specify amount of kernel memory at boot time
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (2 preceding siblings ...)
2006-01-26 18:44 ` [PATCH 3/9] x86 - Specify amount of kernel memory at boot time Mel Gorman
@ 2006-01-26 18:44 ` Mel Gorman
2006-02-07 21:06 ` [Lhms-devel] " Joel Schopp
2006-01-26 18:44 ` [PATCH 5/9] At boot, determine what zone memory will hot-add to Mel Gorman
` (5 subsequent siblings)
9 siblings, 1 reply; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:44 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
This patch adds the kernelcore= parameter for ppc64.
The requested amount of memory will not be reserved in all nodes. The
first node found that can accommodate the requested amount of memory
while leaving memory over for ZONE_EASYRCLM is used. If a node has memory
holes, it also will not be used.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-103_x86coremem/arch/powerpc/mm/numa.c linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/powerpc/mm/numa.c
--- linux-2.6.16-rc1-mm3-103_x86coremem/arch/powerpc/mm/numa.c 2006-01-17 07:44:47.000000000 +0000
+++ linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/powerpc/mm/numa.c 2006-01-26 18:10:29.000000000 +0000
@@ -21,6 +21,7 @@
#include <asm/lmb.h>
#include <asm/system.h>
#include <asm/smp.h>
+#include <asm/machdep.h>
static int numa_enabled = 1;
@@ -722,20 +723,51 @@ void __init paging_init(void)
unsigned long zones_size[MAX_NR_ZONES];
unsigned long zholes_size[MAX_NR_ZONES];
int nid;
+ unsigned long core_mem_size = 0;
+ unsigned long core_mem_pfn = 0;
+ char *opt;
memset(zones_size, 0, sizeof(zones_size));
memset(zholes_size, 0, sizeof(zholes_size));
+ /* Check if ZONE_EASYRCLM should be populated */
+ opt = strstr(cmd_line, "kernelcore=");
+ if (opt) {
+ opt += 11;
+ core_mem_size = memparse(opt, &opt);
+ core_mem_pfn = core_mem_size >> PAGE_SHIFT;
+ }
+
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn, pages_present;
get_region(nid, &start_pfn, &end_pfn, &pages_present);
- zones_size[ZONE_DMA] = end_pfn - start_pfn;
- zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present;
+ /*
+ * Set up a zone for EASYRCLM as long as this node is large
+ * enough to accomodate the requested size and that there
+ * are no memory holes
+ */
+ if (end_pfn - start_pfn <= core_mem_pfn ||
+ end_pfn - start_pfn != pages_present) {
+ zones_size[ZONE_DMA] = end_pfn - start_pfn;
+ zholes_size[ZONE_DMA] =
+ zones_size[ZONE_DMA] - pages_present;
+ if (core_mem_pfn > end_pfn - start_pfn)
+ core_mem_pfn -= (end_pfn - start_pfn);
+ } else {
+ zones_size[ZONE_DMA] = core_mem_pfn;
+ zones_size[ZONE_EASYRCLM] = end_pfn - core_mem_pfn;
+ zholes_size[ZONE_DMA] = 0;
+ zholes_size[ZONE_EASYRCLM] = 0;
+ core_mem_pfn = 0;
+ }
- dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
+ dbg("free_area_init DMA node %d %lx %lx (hole: %lx)\n", nid,
zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
+ dbg("free_area_init EASYRCLM node %d %lx %lx (hole: %lx)\n",
+ nid, zones_size[ZONE_EASYRCLM], start_pfn,
+ zholes_size[ZONE_DMA]);
free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn,
zholes_size);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-103_x86coremem/mm/page_alloc.c linux-2.6.16-rc1-mm3-104_ppc64coremem/mm/page_alloc.c
--- linux-2.6.16-rc1-mm3-103_x86coremem/mm/page_alloc.c 2006-01-26 18:09:04.000000000 +0000
+++ linux-2.6.16-rc1-mm3-104_ppc64coremem/mm/page_alloc.c 2006-01-26 18:10:29.000000000 +0000
@@ -1583,7 +1583,11 @@ static int __init build_zonelists_node(p
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
#ifndef CONFIG_HIGHMEM
- BUG_ON(zone_type > ZONE_NORMAL);
+ /*
+ * On architectures with only ZONE_DMA, it is still
+ * valid to have a ZONE_EASYRCLM
+ */
+ BUG_ON(zone_type == ZONE_HIGHMEM);
#endif
zonelist->zones[nr_zones++] = zone;
check_highest_zone(zone_type);
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [Lhms-devel] [PATCH 4/9] ppc64 - Specify amount of kernel memory at boot time
2006-01-26 18:44 ` [PATCH 4/9] ppc64 " Mel Gorman
@ 2006-02-07 21:06 ` Joel Schopp
2006-02-08 10:23 ` Mel Gorman
0 siblings, 1 reply; 17+ messages in thread
From: Joel Schopp @ 2006-02-07 21:06 UTC (permalink / raw)
To: Mel Gorman; +Cc: linux-mm, linux-kernel, lhms-devel
> This patch adds the kernelcore= parameter for ppc64
...
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-103_x86coremem/mm/page_alloc.c linux-2.6.16-rc1-mm3-104_ppc64coremem/mm/page_alloc.c
> --- linux-2.6.16-rc1-mm3-103_x86coremem/mm/page_alloc.c 2006-01-26 18:09:04.000000000 +0000
> +++ linux-2.6.16-rc1-mm3-104_ppc64coremem/mm/page_alloc.c 2006-01-26 18:10:29.000000000 +0000
Not to nitpick, but this chunk should go in a different patch, it's not
ppc64 specific.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [Lhms-devel] [PATCH 4/9] ppc64 - Specify amount of kernel memory at boot time
2006-02-07 21:06 ` [Lhms-devel] " Joel Schopp
@ 2006-02-08 10:23 ` Mel Gorman
0 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-02-08 10:23 UTC (permalink / raw)
To: Joel Schopp; +Cc: linux-mm, linux-kernel, lhms-devel
On Tue, 7 Feb 2006, Joel Schopp wrote:
> > This patch adds the kernelcore= parameter for ppc64
>
> ...
>
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff
> > linux-2.6.16-rc1-mm3-103_x86coremem/mm/page_alloc.c
> > linux-2.6.16-rc1-mm3-104_ppc64coremem/mm/page_alloc.c
> > --- linux-2.6.16-rc1-mm3-103_x86coremem/mm/page_alloc.c 2006-01-26
> > 18:09:04.000000000 +0000
> > +++ linux-2.6.16-rc1-mm3-104_ppc64coremem/mm/page_alloc.c 2006-01-26
> > 18:10:29.000000000 +0000
>
> Not to nitpick, but this chunk should go in a different patch, it's not ppc64
> specific.
>
You're right. It was put in here because it was testing this patch on
ppc64 that the bug was revealed. It should be moved to the patch that adds
the actual zone.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 5/9] At boot, determine what zone memory will hot-add to
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (3 preceding siblings ...)
2006-01-26 18:44 ` [PATCH 4/9] ppc64 " Mel Gorman
@ 2006-01-26 18:44 ` Mel Gorman
2006-01-26 18:45 ` [PATCH 6/9] Allow HugeTLB allocations to use ZONE_EASYRCLM Mel Gorman
` (4 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:44 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
Once ZONE_EASYRCLM is added, the x86 by default adds to ZONE_EASYRCLM and
ppc64 by default uses ZONE_DMA. This patch changes the behavior slightly on
x86 and ppc64.
o By default, ZONE_DMA is used on ppc64 and ZONE_HIGHMEM is used on x86
o If kernelcore is specified at boot time, x86 and ppc64 hotadd to ZONE_EASYRCLM
o If kernelcore and noeasyrclm is used, ppc64 will use ZONE_DMA and x86 will
use ZONE_HIGHMEM
This is a list of scenarios and what happens with different options on an
x86 with 1.5GiB of physical RAM. ./activate is a script that tries to online
all inactive physical memory.
Boot with no special parameters
- ./activate does nothing
- All high memory in HIGHMEM
Boot with mem=512MB
- Machine boots with 512MB active RAM
- ./activate adds memory to ZONE_HIGHMEM
- No memory in ZONE_EASYRCLM
Boot with kernelcore=512MB
- Machine boots with 1.5GiB RAM
- ./activate does nothing
- No memory in HIGHMEM
- Some of what would be NORMAL and all of HIGHMEM is in EASYRCLM
Boot with kernelcore=512MB mem=512MB
- Machine boots with 512MB RAM
- ./activate adds memory to ZONE_EASYRCLM
- No memory in HIGHMEM
Boot with kernelcore=512MB mem=512MB noeasyrclm
- Machine boots with 512MB RAM
- ./activate adds memory to ZONE_EASYRCLM
- No memory in HIGHMEM
- With noeasyrclm, this is identical to booting with just mem=512MB
Boot with kernelcore=1024MB mem=1024MB
- Machine boots with 1024MB RAM
- Some memory already in ZONE_HIGHMEM
- ./activate adds memory to ZONE_EASYRCLM
Boot with kernelcore=1024MB mem=1024MB noeasyrclm
- Machine boots with 1024MB RAM
- Some memory already in ZONE_EASYRCLM
- ./activate adds memory to ZONE_HIGHMEM
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/i386/kernel/setup.c linux-2.6.16-rc1-mm3-106_zonechoose/arch/i386/kernel/setup.c
--- linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/i386/kernel/setup.c 2006-01-26 18:09:48.000000000 +0000
+++ linux-2.6.16-rc1-mm3-106_zonechoose/arch/i386/kernel/setup.c 2006-01-26 18:13:00.000000000 +0000
@@ -124,6 +124,8 @@ static unsigned int highmem_pages = -1;
/* user-defined easy-reclaim-size */
static unsigned int core_mem_pages = -1;
static unsigned int easyrclm_pages = 0;
+static int hotadd_zone_offset=-1;
+
/*
* Setup options
*/
@@ -932,6 +934,18 @@ static void __init parse_cmdline_early (
*/
else if (!memcmp(from, "kernelcore=",11)) {
core_mem_pages = memparse(from+11, &from) >> PAGE_SHIFT;
+
+ if (hotadd_zone_offset == -1)
+ hotadd_zone_offset = ZONE_EASYRCLM;
+ }
+
+ /*
+ * Once kernelcore= is specified, the default zone to add to
+ * is ZONE_EASYRCLM. This parameter allows an administrator
+ * to override that
+ */
+ else if (!memcmp(from, "noeasyrclm", 10)) {
+ hotadd_zone_offset = ZONE_HIGHMEM;
}
next_char:
@@ -1561,6 +1575,13 @@ void __init setup_arch(char **cmdline_p)
#endif
}
+struct zone *get_zone_for_hotadd(struct pglist_data *pgdata) {
+ if (unlikely(hotadd_zone_offset == -1))
+ hotadd_zone_offset = ZONE_HIGHMEM;
+
+ return &pgdata->node_zones[hotadd_zone_offset];
+}
+
#include "setup_arch_post.h"
/*
* Local Variables:
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/i386/mm/init.c linux-2.6.16-rc1-mm3-106_zonechoose/arch/i386/mm/init.c
--- linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/i386/mm/init.c 2006-01-25 13:42:41.000000000 +0000
+++ linux-2.6.16-rc1-mm3-106_zonechoose/arch/i386/mm/init.c 2006-01-26 18:11:12.000000000 +0000
@@ -655,7 +655,7 @@ void __init mem_init(void)
int add_memory(u64 start, u64 size)
{
struct pglist_data *pgdata = &contig_page_data;
- struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+ struct zone *zone = get_zone_for_hotadd(pgdata);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/powerpc/mm/mem.c linux-2.6.16-rc1-mm3-106_zonechoose/arch/powerpc/mm/mem.c
--- linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/powerpc/mm/mem.c 2006-01-17 07:44:47.000000000 +0000
+++ linux-2.6.16-rc1-mm3-106_zonechoose/arch/powerpc/mm/mem.c 2006-01-26 18:11:12.000000000 +0000
@@ -129,7 +129,7 @@ int __devinit add_memory(u64 start, u64
create_section_mapping(start, start + size);
/* this should work for most non-highmem platforms */
- zone = pgdata->node_zones;
+ zone = get_zone_for_hotadd(pgdata);
return __add_pages(zone, start_pfn, nr_pages);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/powerpc/mm/numa.c linux-2.6.16-rc1-mm3-106_zonechoose/arch/powerpc/mm/numa.c
--- linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/powerpc/mm/numa.c 2006-01-26 18:10:29.000000000 +0000
+++ linux-2.6.16-rc1-mm3-106_zonechoose/arch/powerpc/mm/numa.c 2006-01-26 18:11:12.000000000 +0000
@@ -39,6 +39,7 @@ EXPORT_SYMBOL(node_data);
static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
+static int hotadd_zone_offset = -1;
/*
* We need somewhere to store start/end/node for each region until we have
@@ -736,6 +737,13 @@ void __init paging_init(void)
opt += 11;
core_mem_size = memparse(opt, &opt);
core_mem_pfn = core_mem_size >> PAGE_SHIFT;
+ hotadd_zone_offset = ZONE_EASYRCLM;
+ }
+
+ /* Check if the administrator requests only ZONE_DMA be used */
+ opt = strstr(cmd_line, "noeasyrclm");
+ if (opt) {
+ hotadd_zone_offset = ZONE_DMA;
}
for_each_online_node(nid) {
@@ -844,4 +852,11 @@ got_numa_domain:
}
return numa_domain;
}
+
+struct zone *get_zone_for_hotadd(struct pglist_data *pgdata) {
+ if (unlikely(hotadd_zone_offset == -1))
+ hotadd_zone_offset = ZONE_DMA;
+
+ return &pgdata->node_zones[hotadd_zone_offset];
+}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/x86_64/mm/init.c linux-2.6.16-rc1-mm3-106_zonechoose/arch/x86_64/mm/init.c
--- linux-2.6.16-rc1-mm3-104_ppc64coremem/arch/x86_64/mm/init.c 2006-01-25 13:42:42.000000000 +0000
+++ linux-2.6.16-rc1-mm3-106_zonechoose/arch/x86_64/mm/init.c 2006-01-26 18:11:12.000000000 +0000
@@ -495,7 +495,7 @@ void online_page(struct page *page)
int add_memory(u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(0);
- struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+ struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-3;
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-104_ppc64coremem/include/linux/memory_hotplug.h linux-2.6.16-rc1-mm3-106_zonechoose/include/linux/memory_hotplug.h
--- linux-2.6.16-rc1-mm3-104_ppc64coremem/include/linux/memory_hotplug.h 2006-01-17 07:44:47.000000000 +0000
+++ linux-2.6.16-rc1-mm3-106_zonechoose/include/linux/memory_hotplug.h 2006-01-26 18:11:12.000000000 +0000
@@ -57,6 +57,7 @@ extern void online_page(struct page *pag
extern int add_memory(u64 start, u64 size);
extern int remove_memory(u64 start, u64 size);
extern int online_pages(unsigned long, unsigned long);
+extern struct zone *get_zone_for_hotadd(struct pglist_data *);
/* reasonably generic interface to expand the physical pages in a zone */
extern int __add_pages(struct zone *zone, unsigned long start_pfn,
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 6/9] Allow HugeTLB allocations to use ZONE_EASYRCLM
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (4 preceding siblings ...)
2006-01-26 18:44 ` [PATCH 5/9] At boot, determine what zone memory will hot-add to Mel Gorman
@ 2006-01-26 18:45 ` Mel Gorman
2006-01-26 18:45 ` [PATCH 7/9] Add documentation for extra boot parameters Mel Gorman
` (3 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:45 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
On ppc64 at least, a HugeTLB is the same size as a memory section. Hence,
it causes no fragmentation that is worth caring about because a section can
still be offlined.
Once HugeTLB is allowed to use ZONE_EASYRCLM, the size of the zone becomes a
"soft" area where HugeTLB allocations may be satisified. For example, take
a situation where a system administrator is not willing to reserve HugeTLB
pages at boot time. In this case, he can use kernelcore to size the EasyRclm
zone which is still usable by normal processes. If a job starts that need
HugeTLB pages, one could dd a file the size of physical memory, delete it
and have a good chance of getting a number of HugeTLB pages. To get all of
EasyRclm as HugeTLB pages, the ability to drain per-cpu pages is required.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-106_zonechoose/mm/hugetlb.c linux-2.6.16-rc1-mm3-107_hugetlb_use_easyrclm/mm/hugetlb.c
--- linux-2.6.16-rc1-mm3-106_zonechoose/mm/hugetlb.c 2006-01-17 07:44:47.000000000 +0000
+++ linux-2.6.16-rc1-mm3-107_hugetlb_use_easyrclm/mm/hugetlb.c 2006-01-26 18:13:43.000000000 +0000
@@ -49,7 +49,7 @@ static struct page *dequeue_huge_page(st
for (z = zonelist->zones; *z; z++) {
nid = (*z)->zone_pgdat->node_id;
- if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+ if (cpuset_zone_allowed(*z, GFP_RCLMUSER) &&
!list_empty(&hugepage_freelists[nid]))
break;
}
@@ -68,7 +68,7 @@ static struct page *alloc_fresh_huge_pag
{
static int nid = 0;
struct page *page;
- page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+ page = alloc_pages_node(nid, GFP_RCLMUSER|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
nid = (nid + 1) % num_online_nodes();
if (page) {
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-106_zonechoose/mm/mempolicy.c linux-2.6.16-rc1-mm3-107_hugetlb_use_easyrclm/mm/mempolicy.c
--- linux-2.6.16-rc1-mm3-106_zonechoose/mm/mempolicy.c 2006-01-25 13:42:46.000000000 +0000
+++ linux-2.6.16-rc1-mm3-107_hugetlb_use_easyrclm/mm/mempolicy.c 2006-01-26 18:13:43.000000000 +0000
@@ -1169,7 +1169,7 @@ struct zonelist *huge_zonelist(struct vm
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+ return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_RCLMUSER);
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 7/9] Add documentation for extra boot parameters
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (5 preceding siblings ...)
2006-01-26 18:45 ` [PATCH 6/9] Allow HugeTLB allocations to use ZONE_EASYRCLM Mel Gorman
@ 2006-01-26 18:45 ` Mel Gorman
2006-01-26 18:45 ` [PATCH 8/9] ForTesting - Prevent OOM killer firing for high-order allocations Mel Gorman
` (2 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:45 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
Once all patches are applied, two new command-line parameters exist -
kernelcore and noeasyrclm. This patch adds the necessary documentation.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-107_hugetlb_use_easyrclm/Documentation/kernel-parameters.txt linux-2.6.16-rc1-mm3-108_docs/Documentation/kernel-parameters.txt
--- linux-2.6.16-rc1-mm3-107_hugetlb_use_easyrclm/Documentation/kernel-parameters.txt 2006-01-25 13:42:42.000000000 +0000
+++ linux-2.6.16-rc1-mm3-108_docs/Documentation/kernel-parameters.txt 2006-01-26 18:14:24.000000000 +0000
@@ -702,6 +702,16 @@ running once the system is up.
js= [HW,JOY] Analog joystick
See Documentation/input/joystick.txt.
+ kernelcore=nn[KMG] [KNL,IA-32,PPC] On the x86 and ppc64, this
+ parameter specifies the amount of memory usable
+ by the kernel and places the rest in an EasyRclm
+ zone. The EasyRclm zone is used for the allocation
+ of pages on behalf of a process and for HugeTLB
+ pages. On ppc64, it is likely that memory sections
+ on this zone can be offlined. Note that allocations
+ like PTEs-from-HighMem still use the HighMem zone
+ if it exists, and the Normal zone if it does not.
+
keepinitrd [HW,ARM]
kstack=N [IA-32,X86-64] Print N words from the kernel stack
@@ -1004,6 +1014,16 @@ running once the system is up.
nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
+ noeasyrclm [IA-32,PPC] If kernelcore= is specified, the default
+ zone to add memory to for IA-32 and PPC is EasyRclm. If
+ this is undesirable, noeasyrclm can be specified to
+ force the adding of memory on IA-32 to ZONE_HIGHMEM
+ and to ZONE_DMA on PPC. This is desirable when the
+ EasyRclm zone is setup as a "soft" area for HugeTLB
+ pages to be allocated from to give the chance for
+ administrators to grow the reserved number of Huge
+ pages when the system has been running for some time.
+
noexec [IA-64]
noexec [IA-32,X86-64]
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 8/9] ForTesting - Prevent OOM killer firing for high-order allocations
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (6 preceding siblings ...)
2006-01-26 18:45 ` [PATCH 7/9] Add documentation for extra boot parameters Mel Gorman
@ 2006-01-26 18:45 ` Mel Gorman
2006-01-26 18:46 ` [PATCH 9/9] ForTesting - Drain the per-cpu caches when high order allocations fail Mel Gorman
2006-01-27 0:29 ` [PATCH 0/9] Reducing fragmentation using zones v4 KAMEZAWA Hiroyuki
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:45 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
Stop going OOM for high-order allocations. During testing of high order
allocations, we do not want the OOM killing everything in sight.
For comparison between kernels during the high order allocation stress
test, this patch is applied to both the stock -mm kernel and the kernel
using ZONE_EASYRCLM.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-108_docs/mm/page_alloc.c linux-2.6.16-rc1-mm3-902_highorderoom/mm/page_alloc.c
--- linux-2.6.16-rc1-mm3-108_docs/mm/page_alloc.c 2006-01-26 18:10:29.000000000 +0000
+++ linux-2.6.16-rc1-mm3-902_highorderoom/mm/page_alloc.c 2006-01-26 18:15:07.000000000 +0000
@@ -1095,8 +1095,11 @@ rebalance:
if (page)
goto got_pg;
- out_of_memory(gfp_mask, order);
- goto restart;
+ /* Only go OOM for low-order allocations */
+ if (order <= 3) {
+ out_of_memory(gfp_mask, order);
+ goto restart;
+ }
}
/*
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 9/9] ForTesting - Drain the per-cpu caches when high order allocations fail
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (7 preceding siblings ...)
2006-01-26 18:45 ` [PATCH 8/9] ForTesting - Prevent OOM killer firing for high-order allocations Mel Gorman
@ 2006-01-26 18:46 ` Mel Gorman
2006-01-27 0:29 ` [PATCH 0/9] Reducing fragmentation using zones v4 KAMEZAWA Hiroyuki
9 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-26 18:46 UTC (permalink / raw)
To: linux-mm; +Cc: Mel Gorman, linux-kernel, lhms-devel
The presence of free per-cpu pages appears to cause fragmentation because
contiguous free blocks do not merge with their buddies. This can skew the
results between runs a lot because how many HugeTLB pages there are available
depends on luck. This patch was applied to both stock and anti-frag kernels
to give more consistent results.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.16-rc1-mm3-902_highorderoom/mm/page_alloc.c linux-2.6.16-rc1-mm3-903_drainpercpu/mm/page_alloc.c
--- linux-2.6.16-rc1-mm3-902_highorderoom/mm/page_alloc.c 2006-01-26 18:15:07.000000000 +0000
+++ linux-2.6.16-rc1-mm3-903_drainpercpu/mm/page_alloc.c 2006-01-26 18:15:49.000000000 +0000
@@ -623,7 +623,8 @@ void drain_remote_pages(void)
}
#endif
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+#if defined(CONFIG_PM) || \
+ defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
unsigned long flags;
@@ -685,6 +686,27 @@ void drain_local_pages(void)
__drain_pages(smp_processor_id());
local_irq_restore(flags);
}
+
+void smp_drain_local_pages(void *arg)
+{
+ drain_local_pages();
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ */
+void drain_all_local_pages(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __drain_pages(smp_processor_id());
+ local_irq_restore(flags);
+
+ smp_call_function(smp_drain_local_pages, NULL, 0, 1);
+}
+#else
+void drain_all_local_pages(void) {}
#endif /* CONFIG_PM */
static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
@@ -1073,6 +1095,9 @@ rebalance:
did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+ if (order > 3)
+ drain_all_local_pages();
+
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 0/9] Reducing fragmentation using zones v4
2006-01-26 18:43 [PATCH 0/9] Reducing fragmentation using zones v4 Mel Gorman
` (8 preceding siblings ...)
2006-01-26 18:46 ` [PATCH 9/9] ForTesting - Drain the per-cpu caches when high order allocations fail Mel Gorman
@ 2006-01-27 0:29 ` KAMEZAWA Hiroyuki
2006-01-27 0:41 ` [Lhms-devel] " KAMEZAWA Hiroyuki
9 siblings, 1 reply; 17+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-01-27 0:29 UTC (permalink / raw)
To: Mel Gorman; +Cc: linux-mm, linux-kernel, lhms-devel
Hi, Mel-san
Mel Gorman wrote:
> Changelog since v4
> o Minor bugs
> o ppc64 can specify kernelcore
> o Ability to disable use of ZONE_EASYRCLM at boot time
> o HugeTLB uses ZONE_EASYRCLM
> o Add drain-percpu caches for testing
> o boot-parameter documentation added
>
Could you add this patch to your set ?
This was needed to boot my x86 machine without HIGHMEM.
-- Kame
Index: linux-2.6.16-rc1-mm3/mm/highmem.c
===================================================================
--- linux-2.6.16-rc1-mm3.orig/mm/highmem.c
+++ linux-2.6.16-rc1-mm3/mm/highmem.c
@@ -225,9 +225,10 @@ static __init int init_emergency_pool(vo
struct sysinfo i;
si_meminfo(&i);
si_swapinfo(&i);
-
+#ifdef CONFIG_HIGHMEM /* we can add HIGHMEM after boot */
if (!i.totalhigh)
return 0;
+#endif
page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
if (!page_pool)
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [Lhms-devel] Re: [PATCH 0/9] Reducing fragmentation using zones v4
2006-01-27 0:29 ` [PATCH 0/9] Reducing fragmentation using zones v4 KAMEZAWA Hiroyuki
@ 2006-01-27 0:41 ` KAMEZAWA Hiroyuki
2006-01-27 10:29 ` Mel Gorman
0 siblings, 1 reply; 17+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-01-27 0:41 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: Mel Gorman, linux-mm, linux-kernel, lhms-devel
KAMEZAWA Hiroyuki wrote:
> Could you add this patch to your set ?
> This was needed to boot my x86 machine without HIGHMEM.
>
Sorry, I sent a wrong patch..
This is correct one.
-- Kame
Index: linux-2.6.16-rc1-mm3/mm/highmem.c
===================================================================
--- linux-2.6.16-rc1-mm3.orig/mm/highmem.c
+++ linux-2.6.16-rc1-mm3/mm/highmem.c
@@ -225,9 +225,6 @@ static __init int init_emergency_pool(vo
struct sysinfo i;
si_meminfo(&i);
si_swapinfo(&i);
-
- if (!i.totalhigh)
- return 0;
page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
if (!page_pool)
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [Lhms-devel] Re: [PATCH 0/9] Reducing fragmentation using zones v4
2006-01-27 0:41 ` [Lhms-devel] " KAMEZAWA Hiroyuki
@ 2006-01-27 10:29 ` Mel Gorman
2006-01-27 11:19 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 17+ messages in thread
From: Mel Gorman @ 2006-01-27 10:29 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-kernel, lhms-devel
On Fri, 27 Jan 2006, KAMEZAWA Hiroyuki wrote:
> KAMEZAWA Hiroyuki wrote:
> > Could you add this patch to your set ?
> > This was needed to boot my x86 machine without HIGHMEM.
> >
> Sorry, I sent a wrong patch..
> This is correct one.
I can add it although I would like to know more about the problem. I tried
booting with and without CONFIG_HIGHMEM both stock kernels and with
anti-frag and they all boot fine. What causes your machine to die? Does it
occur with stock -mm or just with anti-frag?
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [Lhms-devel] Re: [PATCH 0/9] Reducing fragmentation using zones v4
2006-01-27 10:29 ` Mel Gorman
@ 2006-01-27 11:19 ` KAMEZAWA Hiroyuki
2006-01-27 11:42 ` Mel Gorman
0 siblings, 1 reply; 17+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-01-27 11:19 UTC (permalink / raw)
To: Mel Gorman; +Cc: linux-mm, linux-kernel, lhms-devel
Mel Gorman wrote:
> On Fri, 27 Jan 2006, KAMEZAWA Hiroyuki wrote:
>
>> KAMEZAWA Hiroyuki wrote:
>>> Could you add this patch to your set ?
>>> This was needed to boot my x86 machine without HIGHMEM.
>>>
>> Sorry, I sent a wrong patch..
>> This is correct one.
>
> I can add it although I would like to know more about the problem. I tried
> booting with and without CONFIG_HIGHMEM both stock kernels and with
> anti-frag and they all boot fine. What causes your machine to die? Does it
> occur with stock -mm or just with anti-frag?
>
Sorry, it looks there is no problem with your newest set :(
This was problem of my tree...
Sigh, I should be more careful.
my note is attached.
Sorry,
-- Kame
== Note ==
I replaced si_meminfo() like following
==
#ifdef CONFIG_HIGHMEM
val->totalhigh = nr_total_zonetype_pages(ZONE_HIGHMEM);
val->freehigh = nr_free_zonetype_pages(ZONE_HIGHMEM);
#else
==
If ZONE_HIGHMEM has no pages, val->totalhigh is 0 and mempool for bounce buffer
is not initialized.
But, now
==
#ifdef CONFIG_HIGHMEM
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
#else
==
totalhigh_pages is defined by highstart_pfn and highend_pfn.
By Zone_EasyRclm, totalhigh_pages is not affected.
mempool for bounce buffer is properly initialized....
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [Lhms-devel] Re: [PATCH 0/9] Reducing fragmentation using zones v4
2006-01-27 11:19 ` KAMEZAWA Hiroyuki
@ 2006-01-27 11:42 ` Mel Gorman
0 siblings, 0 replies; 17+ messages in thread
From: Mel Gorman @ 2006-01-27 11:42 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-kernel, lhms-devel
On Fri, 27 Jan 2006, KAMEZAWA Hiroyuki wrote:
> Mel Gorman wrote:
> > On Fri, 27 Jan 2006, KAMEZAWA Hiroyuki wrote:
> >
> > > KAMEZAWA Hiroyuki wrote:
> > > > Could you add this patch to your set ?
> > > > This was needed to boot my x86 machine without HIGHMEM.
> > > >
> > > Sorry, I sent a wrong patch..
> > > This is correct one.
> >
> > I can add it although I would like to know more about the problem. I tried
> > booting with and without CONFIG_HIGHMEM both stock kernels and with
> > anti-frag and they all boot fine. What causes your machine to die? Does it
> > occur with stock -mm or just with anti-frag?
> >
> Sorry, it looks there is no problem with your newest set :(
Not a problem. If nothing else, testing CONFIG_HIGHMEM showed that there
is a compile bug when memory hotplug is set but highmem is not, so some
good came of this. At least I know you are trying the patches out :)
> This was problem of my tree...
>
> Sigh, I should be more carefull.
> my note is attached.
>
> Sorry,
Not to worry, thanks for the note.
> -- Kame
>
> == Note ==
>
> I replaced si_meminfo() like following
> ==
> #ifdef CONFIG_HIGHMEM
> val->totalhigh = nr_total_zonetype_pages(ZONE_HIGHMEM);
> val->freehigh = nr_free_zonetype_pages(ZONE_HIGHMEM);
> #else
> ==
> If ZONE_HIGHMEM has no pages, val->totalhigh is 0 and mempool for bounce
> buffer
> is not initialized.
>
> But, now
> ==
> #ifdef CONFIG_HIGHMEM
> val->totalhigh = totalhigh_pages;
> val->freehigh = nr_free_highpages();
> #else
> ==
>
> totalhigh_pages is defined by highstart_pfn and highend_pfn.
> By Zone_EasyRclm, totalhigh_pages is not affected.
> mempool for bounce buffer is properly initialized....
>
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
^ permalink raw reply [flat|nested] 17+ messages in thread