linux-kernel.vger.kernel.org archive mirror
* [RFC][PATCH] Sparse Memory Handling (hot-add foundation)
@ 2005-02-18  0:03 Dave Hansen
  2005-02-18  0:05 ` [RFC][PATCH] Memory Hotplug Dave Hansen
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Dave Hansen @ 2005-02-18  0:03 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: lhms, linux-mm, David C. Hansen [imap], Andy Whitcroft

[-- Attachment #1: Type: text/plain, Size: 943 bytes --]

The attached patch, largely written by Andy Whitcroft, implements a
memory model similar to DISCONTIGMEM, but with some added capabilities.
Instead of splitting up the mem_map per NUMA node, it splits the mem_map
into areas that represent fixed-size blocks (sections) of memory.  This
allows individual pieces of that memory to be easily added and removed.
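
To make the idea concrete, here is a rough, illustrative sketch of the
kind of lookup this enables.  This is not the code in the attached
patch; the names (sparse_pfn_to_page, mem_section, section_mem_map) are
made up for illustration, though the constants match the i386 values
used below:

	/*
	 * Physical memory is divided into fixed-size sections, each with
	 * its own piece of mem_map.  A section with nothing in it just
	 * has a NULL pointer, so holes are cheap and sections can be
	 * populated (hot-added) one at a time.
	 */
	#define SECTION_SIZE_BITS	28	/* 256MB sections */
	#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
	#define NR_MEM_SECTIONS		(1UL << (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS))

	struct mem_section {
		struct page *section_mem_map;	/* NULL => no memory here */
	};
	extern struct mem_section mem_section[NR_MEM_SECTIONS];

	static inline struct page *sparse_pfn_to_page(unsigned long pfn)
	{
		unsigned long nr  = pfn >> PFN_SECTION_SHIFT;
		unsigned long off = pfn & ((1UL << PFN_SECTION_SHIFT) - 1);

		return mem_section[nr].section_mem_map + off;
	}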

Because it is so similar to DISCONTIGMEM, it can actually be used in
place of it on NUMA systems such as NUMAQ or Summit.  This patch
includes i386 and ppc64 implementations; x86_64 and ia64 implementations
exist as well.
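
For the architectures, the hookup is small: report which pfn ranges
actually contain memory, then let the core build a mem_map per marked
section once bootmem is running.  A simplified, illustrative sequence
(the real i386 and ppc64 hunks below do these steps in their own setup
paths, with slightly different ordering):

	/* illustrative only: single node covering pfns [0, max_pfn) */
	static void __init example_sparse_hookup(void)
	{
		/* 1) mark the sections that actually contain memory */
		memory_present(0, 0, max_pfn);

		/* 2) with bootmem up, allocate a mem_map per marked section */
		sparse_init();
	}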

The attached patch is a roll-up of a number of individual patches (each
with its own description): all of the files up to and including
"G2-no-memory-at-high_memory-ppc64.patch" from this directory:
http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out/

I can post individual patches if anyone would like to comment on them.  

-- Dave

[-- Attachment #2: sparse-2.6.11-rc3.patch --]
[-- Type: text/x-patch, Size: 81708 bytes --]

--- sparse/arch/arm/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/arm/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -501,10 +501,6 @@
 				bdata->node_boot_start >> PAGE_SHIFT, zhole_size);
 	}
 
-#ifndef CONFIG_DISCONTIGMEM
-	mem_map = contig_page_data.node_mem_map;
-#endif
-
 	/*
 	 * finish off the bad pages once
 	 * the mem_map is initialised
--- sparse/arch/arm26/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/arm26/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -309,8 +309,6 @@
 	free_area_init_node(0, pgdat, zone_size,
 			bdata->node_boot_start >> PAGE_SHIFT, zhole_size);
 
-	mem_map = NODE_DATA(0)->node_mem_map;
-
 	/*
 	 * finish off the bad pages once
 	 * the mem_map is initialised
--- sparse/arch/cris/arch-v10/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/cris/arch-v10/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -184,7 +184,6 @@
 	 */
 
 	free_area_init_node(0, &contig_page_data, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0);
-	mem_map = contig_page_data.node_mem_map;
 }
 
 /* Initialize remaps of some I/O-ports. It is important that this
--- sparse/arch/i386/Kconfig~B-sparse-080-alloc_remap-i386	2005-02-17 15:47:43.000000000 -0800
+++ /arch/i386/Kconfig	2005-02-17 15:47:47.000000000 -0800
@@ -68,7 +68,7 @@
 
 config X86_NUMAQ
 	bool "NUMAQ (IBM/Sequent)"
-	select DISCONTIGMEM
+	#select DISCONTIGMEM
 	select NUMA
 	help
 	  This option is used for getting Linux to run on a (IBM/Sequent) NUMA
@@ -759,16 +759,22 @@
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 	depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
-config DISCONTIGMEM
+config HAVE_ARCH_BOOTMEM_NODE
 	bool
 	depends on NUMA
 	default y
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_ALLOC_REMAP
 	bool
 	depends on NUMA
 	default y
 
+config ARCH_SPARSEMEM_DEFAULT
+	bool
+	depends on (X86_NUMAQ || X86_SUMMIT)
+
+source "mm/Kconfig"
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM4G || HIGHMEM64G
--- sparse/arch/i386/kernel/numaq.c~B-sparse-140-abstract-discontig	2005-02-17 15:47:45.000000000 -0800
+++ /arch/i386/kernel/numaq.c	2005-02-17 15:47:45.000000000 -0800
@@ -32,7 +32,7 @@
 #include <asm/numaq.h>
 
 /* These are needed before the pgdat's are created */
-extern long node_start_pfn[], node_end_pfn[];
+extern long node_start_pfn[], node_end_pfn[], node_remap_size[];
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
@@ -59,6 +59,8 @@
 				eq->hi_shrd_mem_start - eq->priv_mem_size);
 			node_end_pfn[node] = MB_TO_PAGES(
 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+			node_remap_size[node] += memory_present(node,
+				node_start_pfn[node], node_end_pfn[node]);
 		}
 	}
 }
--- sparse/arch/i386/kernel/setup.c~FROM-MM-refactor-i386-memory-setup	2005-02-17 15:47:38.000000000 -0800
+++ /arch/i386/kernel/setup.c	2005-02-17 15:48:55.000000000 -0800
@@ -40,6 +40,8 @@
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/edd.h>
+#include <linux/nodemask.h>
+#include <linux/mmzone.h>
 #include <video/edid.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
@@ -951,8 +953,6 @@
 	return max_low_pfn;
 }
 
-#ifndef CONFIG_DISCONTIGMEM
-
 /*
  * Free all available memory for boot time allocation.  Used
  * as a callback function by efi_memory_walk()
@@ -1026,15 +1026,15 @@
 		reserve_bootmem(addr, PAGE_SIZE);	
 }
 
+#ifndef CONFIG_DISCONTIGMEM
+void __init setup_bootmem_allocator(void);
 static unsigned long __init setup_memory(void)
 {
-	unsigned long bootmap_size, start_pfn, max_low_pfn;
-
 	/*
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
 	 */
-	start_pfn = PFN_UP(init_pg_tables_end);
+	min_low_pfn = PFN_UP(init_pg_tables_end);
 
 	find_max_pfn();
 
@@ -1050,10 +1050,52 @@
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
+
+	setup_bootmem_allocator();
+
+	/*
+	 * This will only work for contiguous memory systems.
+	 *
+	 * Leave the evil #ifdef as a big FIXME until you do
+	 * this properly
+	 */
+#ifdef CONFIG_SPARSEMEM
+	memory_present(/*node*/0, /*start_pfn*/0, max_pfn);
+#endif
+	return max_low_pfn;
+}
+
+void __init zone_sizes_init(void)
+{
+	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned int max_dma, low;
+
+	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	low = max_low_pfn;
+
+	if (low < max_dma)
+		zones_size[ZONE_DMA] = low;
+	else {
+		zones_size[ZONE_DMA] = max_dma;
+		zones_size[ZONE_NORMAL] = low - max_dma;
+#ifdef CONFIG_HIGHMEM
+		zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+#endif
+	}
+	free_area_init(zones_size);
+}
+#else
+extern unsigned long __init setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_DISCONTIGMEM */
+
+void __init setup_bootmem_allocator(void)
+{
+	unsigned long bootmap_size;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
-	bootmap_size = init_bootmem(start_pfn, max_low_pfn);
+	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
 
 	register_bootmem_low_pages(max_low_pfn);
 
@@ -1063,7 +1105,7 @@
 	 * the (very unlikely) case of us accidentally initializing the
 	 * bootmem allocator with an invalid RAM area.
 	 */
-	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
+	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
 			 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
 
 	/*
@@ -1120,11 +1162,25 @@
 		}
 	}
 #endif
-	return max_low_pfn;
 }
-#else
-extern unsigned long setup_memory(void);
-#endif /* !CONFIG_DISCONTIGMEM */
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem.  node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+void __init remapped_pgdat_init(void)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		if (nid != 0)
+			memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+	}
+}
 
 /*
  * Request address space for all standard RAM and ROM resources
@@ -1395,6 +1451,9 @@
 	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
 #endif
 	paging_init();
+	remapped_pgdat_init();
+	sparse_init();
+	zone_sizes_init();
 
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
--- sparse/arch/i386/kernel/srat.c~B-sparse-140-abstract-discontig	2005-02-17 15:47:45.000000000 -0800
+++ /arch/i386/kernel/srat.c	2005-02-17 15:47:45.000000000 -0800
@@ -58,7 +58,7 @@
 static int zholes_size_init;
 static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
 
-extern unsigned long node_start_pfn[], node_end_pfn[];
+extern unsigned long node_start_pfn[], node_end_pfn[], node_remap_size[];
 
 extern void * boot_ioremap(unsigned long, unsigned long);
 
@@ -266,6 +266,10 @@
 		       j, node_memory_chunk[j].nid,
 		       node_memory_chunk[j].start_pfn,
 		       node_memory_chunk[j].end_pfn);
+		node_remap_size[node_memory_chunk[j].nid] += memory_present(
+						node_memory_chunk[j].nid,
+						node_memory_chunk[j].start_pfn,
+						node_memory_chunk[j].end_pfn);
 	}
  
 	/*calculate node_start_pfn/node_end_pfn arrays*/
--- sparse/arch/i386/mm/Makefile~B-sparse-160-sparsemem-i386	2005-02-17 15:47:47.000000000 -0800
+++ /arch/i386/mm/Makefile	2005-02-17 15:47:47.000000000 -0800
@@ -4,7 +4,7 @@
 
 obj-y	:= init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o
 
-obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
+obj-$(CONFIG_NUMA) += discontig.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_HIGHMEM) += highmem.o
 obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
--- sparse/arch/i386/mm/boot_ioremap.c~FROM-MM-mostly-i386-mm-cleanup	2005-02-17 15:47:39.000000000 -0800
+++ /arch/i386/mm/boot_ioremap.c	2005-02-17 15:47:39.000000000 -0800
@@ -61,8 +61,8 @@
 /* the virtual space we're going to remap comes from this array */
 #define BOOT_IOREMAP_PAGES 4
 #define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
-__initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] 
-		__attribute__ ((aligned (PAGE_SIZE)));
+static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
+		       __attribute__ ((aligned (PAGE_SIZE)));
 
 /*
  * This only applies to things which need to ioremap before paging_init()
--- sparse/arch/i386/mm/discontig.c~FROM-MM-consolidate-set_max_mapnr_init-implementations	2005-02-17 15:47:37.000000000 -0800
+++ /arch/i386/mm/discontig.c	2005-02-17 15:47:49.000000000 -0800
@@ -42,12 +42,17 @@
  *                  populated the following initialisation.
  *
  * 1) node_online_map  - the map of all nodes configured (online) in the system
- * 2) physnode_map     - the mapping between a pfn and owning node
- * 3) node_start_pfn   - the starting page frame number for a node
+ * 2) node_start_pfn   - the starting page frame number for a node
  * 3) node_end_pfn     - the ending page fram number for a node
  */
+unsigned long node_start_pfn[MAX_NUMNODES];
+unsigned long node_end_pfn[MAX_NUMNODES];
+
+#ifdef CONFIG_DISCONTIGMEM
+/* XXX: this chunk is really the correct contents of discontig.c */
 
 /*
+ * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
  * numa node on a 256Mb break (each element of the array will
  * represent 256Mb of memory and will be marked by the node id.  so,
@@ -60,8 +65,23 @@
  */
 s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 
-unsigned long node_start_pfn[MAX_NUMNODES];
-unsigned long node_end_pfn[MAX_NUMNODES];
+unsigned long memory_present(int nid, unsigned long start, unsigned long end)
+{
+	unsigned long pfn;
+
+	printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+			nid, start, end);
+	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
+	printk(KERN_DEBUG "  ");
+	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
+		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+		printk(KERN_DEBUG "%ld ", pfn);
+	}
+	printk(KERN_DEBUG "\n");
+
+	return (end - start + 1) * sizeof(struct page);
+}
+#endif
 
 extern unsigned long find_max_low_pfn(void);
 extern void find_max_pfn(void);
@@ -82,6 +102,9 @@
 void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
+void *node_remap_end_vaddr[MAX_NUMNODES];
+void *node_remap_alloc_vaddr[MAX_NUMNODES];
+
 /*
  * FLAT - support for basic PC memory model with discontig enabled, essentially
  *        a single node with all available processors in it with a flat
@@ -119,6 +142,18 @@
 		BUG();
 }
 
+/* Find the owning node for a pfn. */
+int early_pfn_to_nid(unsigned long pfn)
+{
+	int nid;
+
+	for (nid = 0; nid < MAX_NUMNODES && node_end_pfn[nid] != 0; nid++)
+		if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn)
+			return nid;
+
+	return 0;
+}
+
 /* 
  * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
  * method.  For node zero take this from the bottom of memory, for
@@ -133,48 +168,22 @@
 	else {
 		NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
 		min_low_pfn += PFN_UP(sizeof(pg_data_t));
-		memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
 	}
 }
 
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-static void __init register_bootmem_low_pages(unsigned long system_max_low_pfn)
+void *alloc_remap(int nid, unsigned long size)
 {
-	int i;
+	void *allocation = node_remap_alloc_vaddr[nid];
 
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long curr_pfn, last_pfn, size;
-		/*
-		 * Reserve usable low memory
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		/*
-		 * We are rounding up the start address of usable memory:
-		 */
-		curr_pfn = PFN_UP(e820.map[i].addr);
-		if (curr_pfn >= system_max_low_pfn)
-			continue;
-		/*
-		 * ... and at the end of the usable range downwards:
-		 */
-		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+	size = ALIGN(size, L1_CACHE_BYTES);
 
-		if (last_pfn > system_max_low_pfn)
-			last_pfn = system_max_low_pfn;
+	if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+		return 0;
 
-		/*
-		 * .. finally, did all the rounding and playing
-		 * around just make the area go away?
-		 */
-		if (last_pfn <= curr_pfn)
-			continue;
+	node_remap_alloc_vaddr[nid] += size;
+	memset(allocation, 0, size);
 
-		size = last_pfn - curr_pfn;
-		free_bootmem_node(NODE_DATA(0), PFN_PHYS(curr_pfn), PFN_PHYS(size));
-	}
+	return allocation;
 }
 
 void __init remap_numa_kva(void)
@@ -184,8 +193,6 @@
 	int node;
 
 	for_each_online_node(node) {
-		if (node == 0)
-			continue;
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
 			set_pmd_pfn((ulong) vaddr, 
@@ -199,22 +206,44 @@
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
+	unsigned long pfn;
 
 	for_each_online_node(nid) {
-		if (nid == 0)
+		/*
+		 * The acpi/srat node info can show hot-add memory zones
+		 * where memory could be added but is not currently present.
+		 */
+		if (node_start_pfn[nid] > max_pfn)
 			continue;
-		/* calculate the size of the mem_map needed in bytes */
-		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) 
-			* sizeof(struct page) + sizeof(pg_data_t);
+
+		if (node_end_pfn[nid] > max_pfn)
+			node_end_pfn[nid] = max_pfn;
+
+		/* ensure the remap includes space for the pgdat. */
+		size = node_remap_size[nid] + sizeof(pg_data_t);
+
 		/* convert size to large (pmd size) pages, rounding up */
 		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
+
+		/*
+		 * Validate the region we are allocating only contains valid
+		 * pages.
+		 */
+		for (pfn = node_end_pfn[nid] - size;
+		     pfn < node_end_pfn[nid]; pfn++)
+			if (!page_is_ram(pfn))
+				break;
+
+		if (pfn != node_end_pfn[nid])
+			size = 0;
+
 		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
 				size, nid);
 		node_remap_size[nid] = size;
-		reserve_pages += size;
 		node_remap_offset[nid] = reserve_pages;
+		reserve_pages += size;
 		printk("Shrinking node %d from %ld pages to %ld pages\n",
 			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
 		node_end_pfn[nid] -= size;
@@ -225,22 +254,12 @@
 	return reserve_pages;
 }
 
-/*
- * workaround for Dell systems that neglect to reserve EBDA
- */
-static void __init reserve_ebda_region_node(void)
-{
-	unsigned int addr;
-	addr = get_bios_ebda();
-	if (addr)
-		reserve_bootmem_node(NODE_DATA(0), addr, PAGE_SIZE);
-}
-
+extern void setup_bootmem_allocator(void);
 unsigned long __init setup_memory(void)
 {
 	int nid;
-	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
-	unsigned long reserve_pages, pfn;
+	unsigned long system_start_pfn, system_max_low_pfn;
+	unsigned long reserve_pages;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -251,26 +270,11 @@
 	 */
 	get_memcfg_numa();
 
-	/* Fill in the physnode_map */
-	for_each_online_node(nid) {
-		printk("Node: %d, start_pfn: %ld, end_pfn: %ld\n",
-				nid, node_start_pfn[nid], node_end_pfn[nid]);
-		printk("  Setting physnode_map array to node %d for pfns:\n  ",
-				nid);
-		for (pfn = node_start_pfn[nid]; pfn < node_end_pfn[nid];
-	       				pfn += PAGES_PER_ELEMENT) {
-			physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-			printk("%ld ", pfn);
-		}
-		printk("\n");
-	}
-
 	reserve_pages = calculate_numa_remap_pages();
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	find_max_pfn();
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
 	printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
 			reserve_pages, max_low_pfn + reserve_pages);
@@ -291,12 +295,18 @@
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
 		node_remap_start_vaddr[nid] = pfn_to_kaddr(
-			(highstart_pfn + reserve_pages) - node_remap_offset[nid]);
+				highstart_pfn + node_remap_offset[nid]);
+		/* Init the node remap allocator */
+		node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+			(node_remap_size[nid] * PAGE_SIZE);
+		node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+			ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
 		allocate_pgdat(nid);
 		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
 			(ulong) node_remap_start_vaddr[nid],
-			(ulong) pfn_to_kaddr(highstart_pfn + reserve_pages
-			    - node_remap_offset[nid] + node_remap_size[nid]));
+			(ulong) pfn_to_kaddr(highstart_pfn
+			   + node_remap_offset[nid] + node_remap_size[nid]));
 	}
 	printk("High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
@@ -304,70 +314,10 @@
 	for_each_online_node(nid)
 		find_max_pfn_node(nid);
 
+	memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
 	NODE_DATA(0)->bdata = &node0_bdata;
-
-	/*
-	 * Initialize the boot-time allocator (with low memory only):
-	 */
-	bootmap_size = init_bootmem_node(NODE_DATA(0), min_low_pfn, 0, system_max_low_pfn);
-
-	register_bootmem_low_pages(system_max_low_pfn);
-
-	/*
-	 * Reserve the bootmem bitmap itself as well. We do this in two
-	 * steps (first step was init_bootmem()) because this catches
-	 * the (very unlikely) case of us accidentally initializing the
-	 * bootmem allocator with an invalid RAM area.
-	 */
-	reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
-		 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
-
-	/*
-	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-	 * enabling clean reboots, SMP operation, laptop functions.
-	 */
-	reserve_bootmem_node(NODE_DATA(0), 0, PAGE_SIZE);
-
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	reserve_bootmem_node(NODE_DATA(0), PAGE_SIZE, PAGE_SIZE);
-
-	/* reserve EBDA region, it's a 4K region */
-	reserve_ebda_region_node();
-
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
-#endif
-
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (LOADER_TYPE && INITRD_START) {
-		if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) {
-			reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
-			initrd_end = initrd_start+INITRD_SIZE;
-		}
-		else {
-			printk(KERN_ERR "initrd extends beyond end of memory "
-			    "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-			    INITRD_START + INITRD_SIZE,
-			    system_max_low_pfn << PAGE_SHIFT);
-			initrd_start = 0;
-		}
-	}
-#endif
-	return system_max_low_pfn;
+	setup_bootmem_allocator();
+	return max_low_pfn;
 }
 
 void __init zone_sizes_init(void)
@@ -382,8 +332,6 @@
 	for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) {
 		if (!node_online(nid))
 			continue;
-		if (nid)
-			memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
 		NODE_DATA(nid)->pgdat_next = pgdat_list;
 		pgdat_list = NODE_DATA(nid);
 	}
@@ -418,23 +366,9 @@
 			}
 		}
 		zholes_size = get_zholes_size(nid);
-		/*
-		 * We let the lmem_map for node 0 be allocated from the
-		 * normal bootmem allocator, but other nodes come from the
-		 * remapped KVA area - mbligh
-		 */
-		if (!nid)
-			free_area_init_node(nid, NODE_DATA(nid),
-					zones_size, start, zholes_size);
-		else {
-			unsigned long lmem_map;
-			lmem_map = (unsigned long)node_remap_start_vaddr[nid];
-			lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1;
-			lmem_map &= PAGE_MASK;
-			NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map;
-			free_area_init_node(nid, NODE_DATA(nid), zones_size,
-				start, zholes_size);
-		}
+
+		free_area_init_node(nid, NODE_DATA(nid), zones_size, start,
+				zholes_size);
 	}
 	return;
 }
@@ -443,35 +377,34 @@
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
+	struct page *page;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, node_high_size, zone_start_pfn;
-		struct page * zone_mem_map;
-		
+		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+
 		if (!is_highmem(zone))
 			continue;
 
-		printk("Initializing %s for node %d\n", zone->name,
-			zone->zone_pgdat->node_id);
-
-		node_high_size = zone->spanned_pages;
-		zone_mem_map = zone->zone_mem_map;
 		zone_start_pfn = zone->zone_start_pfn;
+		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+		printk("Initializing %s for node %d (%08lx:%08lx)\n",
+				zone->name, zone->zone_pgdat->node_id,
+				zone_start_pfn, zone_end_pfn);
 
-		for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) {
-			one_highpage_init((struct page *)(zone_mem_map + node_pfn),
-					  zone_start_pfn + node_pfn, bad_ppro);
+		/*
+		 * Make use of the guarantee that *_mem_map will be
+		 * contiguous in sections aligned at MAX_ORDER.
+		 */
+		page = pfn_to_page(zone_start_pfn);
+		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++, page++) {
+			if (!pfn_valid(node_pfn))
+				continue;
+			if ((node_pfn & ((1 << MAX_ORDER) - 1)) == 0)
+				page = pfn_to_page(node_pfn);
+			one_highpage_init(page, node_pfn, bad_ppro);
 		}
 	}
 	totalram_pages += totalhigh_pages;
 #endif
 }
-
-void __init set_max_mapnr_init(void)
-{
-#ifdef CONFIG_HIGHMEM
-	num_physpages = highend_pfn;
-#else
-	num_physpages = max_low_pfn;
-#endif
-}
--- sparse/arch/i386/mm/init.c~FROM-MM-consolidate-set_max_mapnr_init-implementations	2005-02-17 15:47:37.000000000 -0800
+++ /arch/i386/mm/init.c	2005-02-17 15:48:56.000000000 -0800
@@ -191,7 +191,7 @@
 
 extern int is_available_memory(efi_memory_desc_t *);
 
-static inline int page_is_ram(unsigned long pagenr)
+int page_is_ram(unsigned long pagenr)
 {
 	int i;
 	unsigned long addr, end;
@@ -239,7 +239,7 @@
 #define kmap_get_fixmap_pte(vaddr)					\
 	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
 
-void __init kmap_init(void)
+static void __init kmap_init(void)
 {
 	unsigned long kmap_vstart;
 
@@ -250,7 +250,7 @@
 	kmap_prot = PAGE_KERNEL;
 }
 
-void __init permanent_kmaps_init(pgd_t *pgd_base)
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -280,17 +280,17 @@
 		SetPageReserved(page);
 }
 
-#ifndef CONFIG_DISCONTIGMEM
-void __init set_highmem_pages_init(int bad_ppro) 
+#ifdef CONFIG_NUMA
+extern void set_highmem_pages_init(int);
+#else
+static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
 	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
 		one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
 	totalram_pages += totalhigh_pages;
 }
-#else
-extern void set_highmem_pages_init(int);
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_FLATMEM */
 
 #else
 #define kmap_init() do { } while (0)
@@ -301,10 +301,10 @@
 unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
 unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
 
-#ifndef CONFIG_DISCONTIGMEM
-#define remap_numa_kva() do {} while (0)
-#else
+#ifdef CONFIG_NUMA
 extern void __init remap_numa_kva(void);
+#else
+#define remap_numa_kva() do {} while (0)
 #endif
 
 static void __init pagetable_init (void)
@@ -394,31 +394,6 @@
 	flush_tlb_all();
 }
 
-#ifndef CONFIG_DISCONTIGMEM
-void __init zone_sizes_init(void)
-{
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-	unsigned int max_dma, high, low;
-	
-	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	low = max_low_pfn;
-	high = highend_pfn;
-	
-	if (low < max_dma)
-		zones_size[ZONE_DMA] = low;
-	else {
-		zones_size[ZONE_DMA] = max_dma;
-		zones_size[ZONE_NORMAL] = low - max_dma;
-#ifdef CONFIG_HIGHMEM
-		zones_size[ZONE_HIGHMEM] = high - low;
-#endif
-	}
-	free_area_init(zones_size);	
-}
-#else
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_DISCONTIGMEM */
-
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask = ~_PAGE_NX;
 
@@ -519,7 +494,6 @@
 	__flush_tlb_all();
 
 	kmap_init();
-	zone_sizes_init();
 }
 
 /*
@@ -529,7 +503,7 @@
  * but fortunately the switch to using exceptions got rid of all that.
  */
 
-void __init test_wp_bit(void)
+static void __init test_wp_bit(void)
 {
 	printk("Checking if this processor honours the WP bit even in supervisor mode... ");
 
@@ -548,20 +522,17 @@
 	}
 }
 
-#ifndef CONFIG_DISCONTIGMEM
 static void __init set_max_mapnr_init(void)
 {
 #ifdef CONFIG_HIGHMEM
-	max_mapnr = num_physpages = highend_pfn;
+	num_physpages = highend_pfn;
 #else
-	max_mapnr = num_physpages = max_low_pfn;
+	num_physpages = max_low_pfn;
+#endif
+#ifdef CONFIG_FLATMEM
+	max_mapnr = num_physpages;
 #endif
 }
-#define __free_all_bootmem() free_all_bootmem()
-#else
-#define __free_all_bootmem() free_all_bootmem_node(NODE_DATA(0))
-extern void set_max_mapnr_init(void);
-#endif /* !CONFIG_DISCONTIGMEM */
 
 static struct kcore_list kcore_mem, kcore_vmalloc; 
 
@@ -572,7 +543,7 @@
 	int tmp;
 	int bad_ppro;
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 	if (!mem_map)
 		BUG();
 #endif
@@ -592,13 +563,13 @@
 	set_max_mapnr_init();
 
 #ifdef CONFIG_HIGHMEM
-	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
+	high_memory = (char *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+	high_memory = (char *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 
 	/* this will put all low memory onto the freelists */
-	totalram_pages += __free_all_bootmem();
+	totalram_pages += free_all_bootmem();
 
 	reservedpages = 0;
 	for (tmp = 0; tmp < max_low_pfn; tmp++)
--- sparse/arch/ia64/mm/contig.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/ia64/mm/contig.c	2005-02-17 15:47:42.000000000 -0800
@@ -283,7 +283,7 @@
 		vmem_map = (struct page *) vmalloc_end;
 		efi_memmap_walk(create_mem_map_page_table, NULL);
 
-		mem_map = contig_page_data.node_mem_map = vmem_map;
+		NODE_DATA(0)->node_mem_map = vmem_map;
 		free_area_init_node(0, &contig_page_data, zones_size,
 				    0, zholes_size);
 
--- sparse/arch/m32r/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/m32r/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -121,8 +121,6 @@
 
 	free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0);
 
-	mem_map = contig_page_data.node_mem_map;
-
 	return 0;
 }
 #else	/* CONFIG_DISCONTIGMEM */
--- sparse/arch/ppc64/Kconfig~B-sparse-170-sparsemem-ppc64	2005-02-17 15:47:48.000000000 -0800
+++ /arch/ppc64/Kconfig	2005-02-17 15:47:51.000000000 -0800
@@ -192,17 +192,32 @@
 	depends on SMP
 	default "32"
 
+config ARCH_HAS_BOOTPA
+	bool
+	default y
+
 config HMT
 	bool "Hardware multithreading"
 	depends on SMP && PPC_PSERIES
 
-config DISCONTIGMEM
-	bool "Discontiguous Memory Support"
-	depends on SMP && PPC_PSERIES
+source "mm/Kconfig"
+
+config ARCH_SPARSEMEM_DEFAULT
+	bool
+	depends on PPC_PSERIES
+
+config ARCH_DISCONTIGMEM_DISABLE
+	bool
+	depends on !SMP || !PPC_PSERIES
+
+config ARCH_SPARSEMEM_DISABLE
+	bool
+	depends on !SMP || !PPC_PSERIES
 
 config NUMA
 	bool "NUMA support"
-	depends on DISCONTIGMEM
+	default y if (DISCONTIGMEM)
+	default y if (SPARSEMEM)
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
--- sparse/arch/ppc64/kernel/prom_init.c~G0-ppc64-__boot-fixes	2005-02-17 15:47:51.000000000 -0800
+++ /arch/ppc64/kernel/prom_init.c	2005-02-17 15:47:51.000000000 -0800
@@ -913,11 +913,11 @@
 	extern unsigned long __secondary_hold_spinloop;
 	extern unsigned long __secondary_hold_acknowledge;
 	unsigned long *spinloop
-		= (void *)virt_to_abs(&__secondary_hold_spinloop);
+		= (void *)boot_virt_to_abs(&__secondary_hold_spinloop);
 	unsigned long *acknowledge
-		= (void *)virt_to_abs(&__secondary_hold_acknowledge);
+		= (void *)boot_virt_to_abs(&__secondary_hold_acknowledge);
 	unsigned long secondary_hold
-		= virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold));
+		= boot_virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold));
 	struct prom_t *_prom = PTRRELOC(&prom);
 
 	prom_debug("prom_hold_cpus: start...\n");
@@ -1563,7 +1563,7 @@
 	if ( r3 && r4 && r4 != 0xdeadbeef) {
 		u64 val;
 
-		RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __pa(r3) : r3;
+		RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __boot_pa(r3) : r3;
 		RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4;
 
 		val = (u64)RELOC(prom_initrd_start);
--- sparse/arch/ppc64/kernel/rtas.c~G0-ppc64-__boot-fixes	2005-02-17 15:47:51.000000000 -0800
+++ /arch/ppc64/kernel/rtas.c	2005-02-17 15:47:51.000000000 -0800
@@ -36,6 +36,7 @@
 struct rtas_t rtas = { 
 	.lock = SPIN_LOCK_UNLOCKED
 };
+static unsigned long rtas_args_paddr;
 
 EXPORT_SYMBOL(rtas);
 
@@ -192,8 +193,7 @@
 	for (i = 0; i < nret; ++i)
 		rtas_args->rets[i] = 0;
 
-	PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n",
-		__pa(rtas_args));
+	PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n", rtas_args_paddr);
 	enter_rtas(__pa(rtas_args));
 	PPCDBG(PPCDBG_RTAS, "\treturned from rtas ...\n");
 
@@ -605,6 +605,8 @@
 #endif /* CONFIG_HOTPLUG_CPU */
 	}
 
+	/* Get and save off phys address of rtas structure argument field */
+	rtas_args_paddr = __boot_pa(&rtas.args);
 }
 
 
--- sparse/arch/ppc64/kernel/setup.c~G0-ppc64-__boot-fixes	2005-02-17 15:47:51.000000000 -0800
+++ /arch/ppc64/kernel/setup.c	2005-02-17 15:47:51.000000000 -0800
@@ -411,7 +411,7 @@
 	 * tree, like retreiving the physical memory map or
 	 * calculating/retreiving the hash table size
 	 */
-	early_init_devtree(__va(dt_ptr));
+	early_init_devtree(__boot_va(dt_ptr));
 
 	/*
 	 * Iterate all ppc_md structures until we find the proper
@@ -544,10 +544,10 @@
 
 	prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL);
 	if (prop != NULL) {
-		initrd_start = (unsigned long)__va(*prop);
+		initrd_start = (unsigned long)__boot_va(*prop);
 		prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL);
 		if (prop != NULL) {
-			initrd_end = (unsigned long)__va(*prop);
+			initrd_end = (unsigned long)__boot_va(*prop);
 			initrd_below_start_ok = 1;
 		} else
 			initrd_start = 0;
@@ -954,9 +954,9 @@
 	 * SLB misses on them.
 	 */
 	for_each_cpu(i) {
-		softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE,
+		softirq_ctx[i] = (struct thread_info *)__boot_va(lmb_alloc_base(THREAD_SIZE,
 					THREAD_SIZE, 0x10000000));
-		hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE,
+		hardirq_ctx[i] = (struct thread_info *)__boot_va(lmb_alloc_base(THREAD_SIZE,
 					THREAD_SIZE, 0x10000000));
 	}
 }
@@ -985,7 +985,7 @@
 	limit = min(0x10000000UL, lmb.rmo_size);
 
 	for_each_cpu(i)
-		paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128,
+		paca[i].emergency_sp = __boot_va(lmb_alloc_base(PAGE_SIZE, 128,
 						limit)) + PAGE_SIZE;
 }
 
@@ -1026,6 +1026,10 @@
 
 	/* set up the bootmem stuff with available memory */
 	do_init_bootmem();
+#ifdef CONFIG_SPARSEMEM
+	sparse_init();
+#endif
+
 
 	ppc_md.setup_arch();
 
--- sparse/arch/ppc64/mm/Makefile~B-sparse-170-sparsemem-ppc64	2005-02-17 15:47:48.000000000 -0800
+++ /arch/ppc64/mm/Makefile	2005-02-17 15:47:49.000000000 -0800
@@ -6,6 +6,6 @@
 
 obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \
 	slb_low.o slb.o stab.o mmap.o
-obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_NUMA) += numa.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o
--- sparse/arch/ppc64/mm/hash_utils.c~G0-ppc64-__boot-fixes	2005-02-17 15:47:51.000000000 -0800
+++ /arch/ppc64/mm/hash_utils.c	2005-02-17 15:47:51.000000000 -0800
@@ -119,12 +119,12 @@
 #ifdef CONFIG_PPC_PSERIES
 		if (systemcfg->platform & PLATFORM_LPAR)
 			ret = pSeries_lpar_hpte_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
+				boot_virt_to_abs(addr) >> PAGE_SHIFT,
 				0, mode, 1, large);
 		else
 #endif /* CONFIG_PPC_PSERIES */
 			ret = native_hpte_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
+				boot_virt_to_abs(addr) >> PAGE_SHIFT,
 				0, mode, 1, large);
 
 		if (ret == -1) {
--- sparse/arch/ppc64/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/ppc64/mm/init.c	2005-02-17 15:48:56.000000000 -0800
@@ -593,13 +593,24 @@
  * Initialize the bootmem system and give it all the memory we
  * have available.
  */
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NUMA
 void __init do_init_bootmem(void)
 {
 	unsigned long i;
 	unsigned long start, bootmap_pages;
 	unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
 	int boot_mapsize;
+#ifdef CONFIG_SPARSEMEM
+	unsigned long start_pfn, end_pfn;
+
+	/*
+	 * Note presence of first (logical/coalesced) LMB which will
+	 * contain RMO region
+	 */
+	start_pfn = lmb.memory.region[0].physbase >> PAGE_SHIFT;
+	end_pfn = start_pfn + (lmb.memory.region[0].size >> PAGE_SHIFT);
+	memory_present(0, start_pfn, end_pfn);
+#endif
 
 	/*
 	 * Find an area to use for the bootmem bitmap.  Calculate the size of
@@ -615,12 +626,21 @@
 
 	max_pfn = max_low_pfn;
 
-	/* add all physical memory to the bootmem map. Also find the first */
+	/* add all physical memory to the bootmem map.  Also, note the
+	 * presence of all LMBs */
 	for (i=0; i < lmb.memory.cnt; i++) {
 		unsigned long physbase, size;
 
 		physbase = lmb.memory.region[i].physbase;
 		size = lmb.memory.region[i].size;
+#ifdef CONFIG_SPARSEMEM
+		if (i) { /* already created mappings for first LMB */
+			start_pfn = physbase >> PAGE_SHIFT;
+			end_pfn = start_pfn + (size >> PAGE_SHIFT);
+		}
+		memory_present(0, start_pfn, end_pfn);
+#endif
+
 		free_bootmem(physbase, size);
 	}
 
@@ -658,9 +678,8 @@
 
 	free_area_init_node(0, &contig_page_data, zones_size,
 			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
-	mem_map = contig_page_data.node_mem_map;
 }
-#endif /* CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_NUMA */
 
 static struct kcore_list kcore_vmem;
 
@@ -691,7 +710,7 @@
 
 void __init mem_init(void)
 {
-#ifdef CONFIG_DISCONTIGMEM
+#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM)
 	int nid;
 #endif
 	pg_data_t *pgdat;
@@ -703,7 +722,7 @@
 	/* The strange -1 +1 is to avoid calling __va on an invalid address */
 	high_memory = (void *) (__va(max_low_pfn * PAGE_SIZE - 1) + 1);
 
-#ifdef CONFIG_DISCONTIGMEM
+#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM)
         for_each_online_node(nid) {
 		if (NODE_DATA(nid)->node_spanned_pages != 0) {
 			printk("freeing bootmem node %x\n", nid);
@@ -718,7 +737,7 @@
 
 	for_each_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			page = pgdat->node_mem_map + i;
+			page = pfn_to_page(i);
 			if (PageReserved(page))
 				reservedpages++;
 		}
@@ -901,3 +920,80 @@
 	if (!zero_cache)
 		panic("pgtable_cache_init(): could not create zero_cache!\n");
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	free_cold_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+/*
+ * This works only for the non-NUMA case.  Later, we'll need a lookup
+ * to convert from real physical addresses to nid, that doesn't use
+ * pfn_to_nid().
+ */
+int __devinit add_memory(u64 start, u64 size, unsigned long attr)
+{
+	struct pglist_data *pgdata = NODE_DATA(0);
+	struct zone *zone;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	/* this should work for most non-highmem platforms */
+	zone = pgdata->node_zones;
+
+	return __add_pages(zone, start_pfn, nr_pages, attr);
+
+	return 0;
+}
+
+/*
+ * First pass at this code will check to determine if the remove
+ * request is within the RMO.  Do not allow removal within the RMO.
+ */
+int __devinit remove_memory(u64 start, u64 size, unsigned long attr)
+{
+	struct zone *zone;
+	unsigned long start_pfn, end_pfn, nr_pages;
+
+	start_pfn = start >> PAGE_SHIFT;
+	nr_pages = size >> PAGE_SHIFT;
+	end_pfn = start_pfn + nr_pages;
+
+	printk("%s(): Attempting to remove memory in range "
+			"%lx to %lx\n", __func__, start, start+size);
+	/*
+	 * check for range within RMO
+	 */
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	printk("%s(): memory will be removed from "
+			"the %s zone\n", __func__, zone->name);
+
+	/*
+	 * not handling removing memory ranges that
+	 * overlap multiple zones yet
+	 */
+	if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
+		goto overlap;
+
+	/* make sure it is NOT in RMO */
+	if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
+		printk("%s(): range to be removed must NOT be in RMO!\n",
+			__func__);
+		goto in_rmo;
+	}
+
+	return __remove_pages(zone, start_pfn, nr_pages, attr);
+
+overlap:
+	printk("%s(): memory range to be removed overlaps "
+		"multiple zones!!!\n", __func__);
+in_rmo:
+	return -1;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
--- sparse/arch/ppc64/mm/numa.c~B-sparse-130-add-early_pfn_to_nid	2005-02-17 15:47:44.000000000 -0800
+++ /arch/ppc64/mm/numa.c	2005-02-17 15:47:49.000000000 -0800
@@ -58,6 +58,22 @@
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
 EXPORT_SYMBOL(nr_cpus_in_node);
 
+#ifdef CONFIG_DISCONTIGMEM
+unsigned long memory_present(int nid, unsigned long start, unsigned long end)
+{
+	unsigned long i;
+
+	/* XXX/APW: fix the loop instead ... */
+	start <<= PAGE_SHIFT;
+	end <<= PAGE_SHIFT;
+
+	for (i = start ; i < end; i += MEMORY_INCREMENT)
+		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = nid;
+
+	return 0;
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
 static inline void map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
@@ -276,9 +292,12 @@
 		return -1;
 	}
 
+	/* XXX/APW this is another memmodel thing, like memmodel_init() */
+	/* XXX/APW this is DISCONTIG */
 	numa_memory_lookup_table =
 		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
 	memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+	/* XXX/APW we should be allocating the phys_section[] here. */
 
 	for (i = 0; i < entries ; i++)
 		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
@@ -378,9 +397,8 @@
 				size / PAGE_SIZE;
 		}
 
-		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
-			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
-				numa_domain;
+		memory_present(numa_domain, start >> PAGE_SHIFT,
+					       (start + size) >> PAGE_SHIFT);
 
 		ranges--;
 		if (ranges)
@@ -428,8 +446,7 @@
 	init_node_data[0].node_start_pfn = 0;
 	init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
 
-	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
-		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
+	memory_present(0, 0, init_node_data[0].node_spanned_pages);
 
 	node0_io_hole_size = top_of_ram - total_ram;
 }
@@ -628,6 +645,8 @@
 	memset(zones_size, 0, sizeof(zones_size));
 	memset(zholes_size, 0, sizeof(zholes_size));
 
+	memmodel_init();
+
 	for_each_online_node(nid) {
 		unsigned long start_pfn;
 		unsigned long end_pfn;
@@ -662,3 +681,20 @@
 	return 0;
 }
 early_param("numa", early_numa);
+
+/* Find the owning node for a pfn. */
+int early_pfn_to_nid(unsigned long pfn)
+{
+	int nid;
+
+	for (nid = 0; nid < MAX_NUMNODES &&
+			init_node_data[nid].node_spanned_pages; nid++) {
+		unsigned long start = init_node_data[nid].node_start_pfn;
+		unsigned long end = start +
+				init_node_data[nid].node_spanned_pages;
+		if (start <= pfn && pfn <= end)
+			return nid;
+	}
+
+	return 0;
+}
--- sparse/arch/sh/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/sh/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -216,8 +216,6 @@
 #endif
 	NODE_DATA(0)->node_mem_map = NULL;
 	free_area_init_node(0, NODE_DATA(0), zones_size, __MEMORY_START >> PAGE_SHIFT, 0);
-	/* XXX: MRB-remove - this doesn't seem sane, should this be done somewhere else ?*/
-	mem_map = NODE_DATA(0)->node_mem_map;
 
 #ifdef CONFIG_DISCONTIGMEM
 	/*
--- sparse/arch/sh64/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/sh64/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -124,9 +124,6 @@
 	zones_size[ZONE_DMA] = MAX_LOW_PFN - START_PFN;
 	NODE_DATA(0)->node_mem_map = NULL;
 	free_area_init_node(0, NODE_DATA(0), zones_size, __MEMORY_START >> PAGE_SHIFT, 0);
-
-	/* XXX: MRB-remove - this doesn't seem sane, should this be done somewhere else ?*/
-	mem_map = NODE_DATA(0)->node_mem_map;
 }
 
 void __init mem_init(void)
--- sparse/arch/sparc/mm/srmmu.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/sparc/mm/srmmu.c	2005-02-17 15:47:42.000000000 -0800
@@ -1343,7 +1343,6 @@
 
 		free_area_init_node(0, &contig_page_data, zones_size,
 				    pfn_base, zholes_size);
-		mem_map = contig_page_data.node_mem_map;
 	}
 }
 
--- sparse/arch/sparc/mm/sun4c.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/sparc/mm/sun4c.c	2005-02-17 15:47:42.000000000 -0800
@@ -2116,7 +2116,6 @@
 
 		free_area_init_node(0, &contig_page_data, zones_size,
 				    pfn_base, zholes_size);
-		mem_map = contig_page_data.node_mem_map;
 	}
 
 	cnt = 0;
--- sparse/arch/sparc64/mm/init.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/sparc64/mm/init.c	2005-02-17 15:47:42.000000000 -0800
@@ -1512,7 +1512,6 @@
 
 		free_area_init_node(0, &contig_page_data, zones_size,
 				    phys_base >> PAGE_SHIFT, zholes_size);
-		mem_map = contig_page_data.node_mem_map;
 	}
 
 	device_scan();
--- sparse/arch/um/kernel/mem.c~FROM-MM-mostly-i386-mm-cleanup	2005-02-17 15:47:39.000000000 -0800
+++ /arch/um/kernel/mem.c	2005-02-17 15:47:39.000000000 -0800
@@ -138,7 +138,7 @@
 	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
  			  (vaddr)), (vaddr))
 
-void __init kmap_init(void)
+static void __init kmap_init(void)
 {
 	unsigned long kmap_vstart;
 
--- sparse/arch/um/kernel/physmem.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/um/kernel/physmem.c	2005-02-17 15:47:42.000000000 -0800
@@ -294,7 +294,6 @@
 		INIT_LIST_HEAD(&p->lru);
 	}
 
-	mem_map = map;
 	max_mapnr = total_pages;
 	return(0);
 }
--- sparse/arch/v850/kernel/setup.c~A6-no_arch_mem_map_init	2005-02-17 15:47:42.000000000 -0800
+++ /arch/v850/kernel/setup.c	2005-02-17 15:47:42.000000000 -0800
@@ -283,5 +283,4 @@
 	NODE_DATA(0)->node_mem_map = NULL;
 	free_area_init_node (0, NODE_DATA(0), zones_size,
 			     ADDR_TO_PAGE (PAGE_OFFSET), 0);
-	mem_map = NODE_DATA(0)->node_mem_map;
 }
--- sparse/include/asm-frv/highmem.h~FROM-MM-mostly-i386-mm-cleanup	2005-02-17 15:47:39.000000000 -0800
+++ /include/asm-frv/highmem.h	2005-02-17 15:47:39.000000000 -0800
@@ -44,8 +44,6 @@
 #define kmap_pte ______kmap_pte_in_TLB
 extern pte_t *pkmap_page_table;
 
-extern void kmap_init(void);
-
 #define flush_cache_kmaps()  do { } while (0)
 
 /*
--- sparse/include/asm-i386/highmem.h~FROM-MM-mostly-i386-mm-cleanup	2005-02-17 15:47:39.000000000 -0800
+++ /include/asm-i386/highmem.h	2005-02-17 15:48:56.000000000 -0800
@@ -33,8 +33,6 @@
 extern pgprot_t kmap_prot;
 extern pte_t *pkmap_page_table;
 
-extern void kmap_init(void);
-
 /*
  * Right now we initialize only a single pte table. It can be extended
  * easily, subsequent pte tables have to be allocated in one physical
--- sparse/include/asm-i386/mmzone.h~A2.1-re-memset-i386-pgdats	2005-02-17 15:47:39.000000000 -0800
+++ /include/asm-i386/mmzone.h	2005-02-17 15:47:47.000000000 -0800
@@ -8,7 +8,9 @@
 
 #include <asm/smp.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid)	(node_data[nid])
 
 #ifdef CONFIG_NUMA
 	#ifdef CONFIG_X86_NUMAQ
@@ -21,8 +23,28 @@
 	#define get_zholes_size(n) (0)
 #endif /* CONFIG_NUMA */
 
-extern struct pglist_data *node_data[];
-#define NODE_DATA(nid)		(node_data[nid])
+extern int get_memcfg_numa_flat(void );
+/*
+ * This allows any one NUMA architecture to be compiled
+ * for, and still fall back to the flat function if it
+ * fails.
+ */
+static inline void get_memcfg_numa(void)
+{
+#ifdef CONFIG_X86_NUMAQ
+	if (get_memcfg_numaq())
+		return;
+#elif CONFIG_ACPI_SRAT
+	if (get_memcfg_from_srat())
+		return;
+#endif
+
+	get_memcfg_numa_flat();
+}
+
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DISCONTIGMEM
 
 /*
  * generic node memory support, the following assumptions apply:
@@ -124,24 +146,28 @@
 }
 #endif
 
-extern int get_memcfg_numa_flat(void );
+#endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_SPARSEMEM
+
 /*
- * This allows any one NUMA architecture to be compiled
- * for, and still fall back to the flat function if it
- * fails.
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the
+ *    flags field of the struct page
  */
-static inline void get_memcfg_numa(void)
-{
-#ifdef CONFIG_X86_NUMAQ
-	if (get_memcfg_numaq())
-		return;
-#elif CONFIG_ACPI_SRAT
-	if (get_memcfg_from_srat())
-		return;
-#endif
 
-	get_memcfg_numa_flat();
-}
+/*
+ * SECTION_SIZE_BITS		2^N: how big each section will be
+ * MAX_PHYSADDR_BITS		2^N: how much physical address space we have
+ * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
+ */
+#define SECTION_SIZE_BITS       28
+#define MAX_PHYSADDR_BITS       36
+#define MAX_PHYSMEM_BITS	36
 
-#endif /* CONFIG_DISCONTIGMEM */
+/* XXX: FIXME -- wli */
+#define kern_addr_valid(kaddr)  (0)
+
+#endif /* CONFIG_SPARSEMEM */
 #endif /* _ASM_MMZONE_H_ */
--- sparse/include/asm-i386/page.h~B-sparse-075-validate-remap-pages	2005-02-17 15:47:42.000000000 -0800
+++ /include/asm-i386/page.h	2005-02-17 15:47:49.000000000 -0800
@@ -119,6 +119,8 @@
 
 extern int sysctl_legacy_va_layout;
 
+extern int page_is_ram(unsigned long pagenr);
+
 #endif /* __ASSEMBLY__ */
 
 #ifdef __ASSEMBLY__
@@ -131,14 +133,16 @@
 #define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
 #define VMALLOC_RESERVE		((unsigned long)__VMALLOC_RESERVE)
 #define MAXMEM			(-__PAGE_OFFSET-__VMALLOC_RESERVE)
-#define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
-#define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define __boot_pa(x)		((unsigned long)(x)-PAGE_OFFSET)
+#define __boot_va(x)		((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define __pa(x)			__boot_pa(x)
+#define __va(x)			__boot_va(x)
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_FLATMEM */
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
--- sparse/include/asm-i386/pgtable.h~B-sparse-160-sparsemem-i386	2005-02-17 15:47:47.000000000 -0800
+++ /include/asm-i386/pgtable.h	2005-02-17 15:47:47.000000000 -0800
@@ -396,9 +396,9 @@
 
 #endif /* !__ASSEMBLY__ */
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 #define kern_addr_valid(addr)	(1)
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_FLATMEM */
 
 #define io_remap_page_range(vma, vaddr, paddr, size, prot)		\
 		remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot)
--- sparse/include/asm-ppc/highmem.h~FROM-MM-mostly-i386-mm-cleanup	2005-02-17 15:47:39.000000000 -0800
+++ /include/asm-ppc/highmem.h	2005-02-17 15:47:39.000000000 -0800
@@ -35,8 +35,6 @@
 extern pgprot_t kmap_prot;
 extern pte_t *pkmap_page_table;
 
-extern void kmap_init(void) __init;
-
 /*
  * Right now we initialize only a single pte table. It can be extended
  * easily, subsequent pte tables have to be allocated in one physical
--- sparse/include/asm-ppc64/abs_addr.h~G1-kravetz-ppc64-fixes	2005-02-17 15:47:51.000000000 -0800
+++ /include/asm-ppc64/abs_addr.h	2005-02-17 15:47:51.000000000 -0800
@@ -104,5 +104,7 @@
 /* Convenience macros */
 #define virt_to_abs(va) phys_to_abs(__pa(va))
 #define abs_to_virt(aa) __va(abs_to_phys(aa))
+#define boot_virt_to_abs(va) phys_to_abs(__boot_pa(va))
+#define boot_abs_to_virt(aa) __boot_va(abs_to_phys(aa))
 
 #endif /* _ABS_ADDR_H */
--- sparse/include/asm-ppc64/dma.h~G0-ppc64-__boot-fixes	2005-02-17 15:47:51.000000000 -0800
+++ /include/asm-ppc64/dma.h	2005-02-17 15:47:51.000000000 -0800
@@ -26,6 +26,7 @@
 /* The maximum address that we can perform a DMA transfer to on this platform */
 /* Doesn't really apply... */
 #define MAX_DMA_ADDRESS  (~0UL)
+#define MAX_DMA_PHYSADDR MAX_DMA_ADDRESS
 
 #define dma_outb	outb
 #define dma_inb		inb
--- sparse/include/asm-ppc64/mmzone.h~B-sparse-170-sparsemem-ppc64	2005-02-17 15:47:49.000000000 -0800
+++ /include/asm-ppc64/mmzone.h	2005-02-17 15:47:51.000000000 -0800
@@ -10,9 +10,34 @@
 #include <linux/config.h>
 #include <asm/smp.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_SPARSEMEM
+
+/* generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the
+ *    flags field of the struct page
+ */
+
+/*
+ * SECTION_SIZE_BITS		2^N: how big each section will be
+ * MAX_PHYSADDR_BITS		2^N: how much physical address space we have
+ * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
+ */
+#define SECTION_SIZE_BITS	24
+#define MAX_PHYSADDR_BITS	38
+#define MAX_PHYSMEM_BITS	36
+
+#endif /* CONFIG_SPARSEMEM */
+
+#if defined(CONFIG_NUMA)
+
+#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM)
 
 extern struct pglist_data *node_data[];
+/*
+ * Return a pointer to the node data for node n.
+ */
+#define NODE_DATA(nid)		(node_data[nid])
 
 /*
  * Following are specific to this numa platform.
@@ -27,6 +52,10 @@
 #define MEMORY_INCREMENT_SHIFT 24
 #define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT)
 
+#endif /* CONFIG_DISCONTIGMEM || CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_DISCONTIGMEM
+
 /* NUMA debugging, will not work on a DLPAR machine */
 #undef DEBUG_NUMA
 
@@ -49,11 +78,6 @@
 
 #define pfn_to_nid(pfn)		pa_to_nid((pfn) << PAGE_SHIFT)
 
-/*
- * Return a pointer to the node data for node n.
- */
-#define NODE_DATA(nid)		(node_data[nid])
-
 #define node_localnr(pfn, nid)	((pfn) - NODE_DATA(nid)->node_start_pfn)
 
 /*
@@ -91,4 +115,16 @@
 #define discontigmem_pfn_valid(pfn)		((pfn) < num_physpages)
 
 #endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_SPARSEMEM
+
+#define pa_to_nid(pa)							\
+({									\
+	pfn_to_nid(pa >> PAGE_SHIFT);					\
+})
+
+#endif /* CONFIG_SPARSEMEM */
+
+#endif /* CONFIG_NUMA */
+
 #endif /* _ASM_MMZONE_H_ */
--- sparse/include/asm-ppc64/page.h~B-sparse-170-sparsemem-ppc64	2005-02-17 15:47:49.000000000 -0800
+++ /include/asm-ppc64/page.h	2005-02-17 15:47:50.000000000 -0800
@@ -179,7 +179,10 @@
 	return order;
 }
 
-#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
+#define __boot_pa(x)   ((unsigned long)(x)-PAGE_OFFSET)
+#define __boot_va(x)   ((void *)((unsigned long)(x) + KERNELBASE))
+#define __pa(x)		__boot_pa(x)
+#define __va(x)		__boot_va(x)
 
 extern int page_is_ram(unsigned long pfn);
 
@@ -215,13 +218,13 @@
 #define __bpn_to_ba(x) ((((unsigned long)(x))<<PAGE_SHIFT) + KERNELBASE)
 #define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT)
 
-#define __va(x) ((void *)((unsigned long)(x) + KERNELBASE))
-
 #ifdef CONFIG_DISCONTIGMEM
 #define page_to_pfn(page)	discontigmem_page_to_pfn(page)
 #define pfn_to_page(pfn)	discontigmem_pfn_to_page(pfn)
 #define pfn_valid(pfn)		discontigmem_pfn_valid(pfn)
-#else
+#endif
+/* XXX/APW: why is SPARSEMEM not here */
+#ifdef CONFIG_FLATMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
--- sparse/include/linux/bootmem.h~B-sparse-080-alloc_remap-i386	2005-02-17 15:47:43.000000000 -0800
+++ /include/linux/bootmem.h	2005-02-17 15:47:50.000000000 -0800
@@ -36,6 +36,10 @@
 					 * up searching */
 } bootmem_data_t;
 
+#ifndef MAX_DMA_PHYSADDR
+#define MAX_DMA_PHYSADDR (__boot_pa(MAX_DMA_ADDRESS))
+#endif
+
 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
 extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend);
 extern void __init free_bootmem (unsigned long addr, unsigned long size);
@@ -43,11 +47,11 @@
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
-	__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem((x), SMP_CACHE_BYTES, MAX_DMA_PHYSADDR)
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem((x), SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
-	__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem((x), PAGE_SIZE, MAX_DMA_PHYSADDR)
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem((x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
@@ -60,13 +64,22 @@
 extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, MAX_DMA_PHYSADDR)
 #define alloc_bootmem_pages_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, MAX_DMA_PHYSADDR)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
+extern void *alloc_remap(int nid, unsigned long size);
+#else
+static inline void *alloc_remap(int nid, unsigned long size)
+{
+	return NULL;
+}
+#endif
+
 extern unsigned long __initdata nr_kernel_pages;
 extern unsigned long __initdata nr_all_pages;
 
--- sparse/include/linux/mm.h~B-sparse-100-cleanup-node-zone	2005-02-17 15:47:43.000000000 -0800
+++ /include/linux/mm.h	2005-02-17 15:47:45.000000000 -0800
@@ -398,19 +398,93 @@
 /*
  * The zone field is never updated after free_area_init_core()
  * sets it, so none of the operations on it need to be atomic.
- * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total,
- * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits.
  */
-#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT)
-#define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
+
+
+/*
+ * page->flags layout:
+ *
+ * There are three possibilities for how page->flags get
+ * laid out.  The first is for the normal case, without
+ * sparsemem.  The second is for sparsemem when there is
+ * plenty of space for node and section.  The last is when
+ * we have run out of space and have to fall back to an
+ * alternate (slower) way of determining the node.
+ *
+ *        No sparsemem: |       NODE     | ZONE | ... | FLAGS |
+ * with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
+ *   no space for node: | SECTION |     ZONE    | ... | FLAGS |
+ */
+#if SECTIONS_SHIFT+NODES_SHIFT+ZONES_SHIFT <= FLAGS_RESERVED
+#define NODES_WIDTH		NODES_SHIFT
+#else
+#define NODES_WIDTH		0
+#endif
+
+#ifdef CONFIG_SPARSEMEM
+#define SECTIONS_WIDTH		SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH		0
+#endif
+
+#define ZONES_WIDTH		ZONES_SHIFT
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+#define SECTIONS_PGOFF		((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+
+/*
+ * We are going to use the flags for the page to node mapping if it's in
+ * there.  This includes the case where there is no node, so it is implicit.
+ */
+#define FLAGS_HAS_NODE		(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+
+#ifndef PFN_SECTION_SHIFT
+#define PFN_SECTION_SHIFT 0
+#endif
+
+/*
+ * Define the bit shifts to access each section.  For non-existent
+ * sections we define the shift as 0; that plus a 0 mask ensures
+ * the compiler will optimise away references to them.
+ */
+#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
+
+/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
+#if FLAGS_HAS_NODE
+#define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#else
+#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#endif
+#define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
+
+#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#endif
+
+#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
+#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
+#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
+#define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
 static inline unsigned long page_zonenum(struct page *page)
 {
-	return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT));
+	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
+static inline struct zone *page_zone(struct page *page);
 static inline unsigned long page_to_nid(struct page *page)
 {
-	return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT));
+	if (FLAGS_HAS_NODE)
+		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+	else
+		return page_zone(page)->zone_pgdat->node_id;
+}
+static inline unsigned long page_to_section(struct page *page)
+{
+	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
 
 struct zone;
@@ -418,13 +492,32 @@
 
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[page->flags >> NODEZONE_SHIFT];
+	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
+			ZONETABLE_MASK];
+}
+
+static inline void set_page_zone(struct page *page, unsigned long zone)
+{
+	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
+	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+}
+static inline void set_page_node(struct page *page, unsigned long node)
+{
+	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
+	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
+}
+static inline void set_page_section(struct page *page, unsigned long section)
+{
+	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long nodezone_num)
+static inline void set_page_links(struct page *page, unsigned long zone,
+	unsigned long node, unsigned long pfn)
 {
-	page->flags &= ~(~0UL << NODEZONE_SHIFT);
-	page->flags |= nodezone_num << NODEZONE_SHIFT;
+	set_page_zone(page, zone);
+	set_page_node(page, node);
+	set_page_section(page, pfn >> PFN_SECTION_SHIFT);
 }
 
 #ifndef CONFIG_DISCONTIGMEM
--- sparse/include/linux/mmzone.h~B-sparse-100-cleanup-node-zone	2005-02-17 15:47:43.000000000 -0800
+++ /include/linux/mmzone.h	2005-02-17 15:48:56.000000000 -0800
@@ -372,44 +372,165 @@
 /* Returns the number of the current Node. */
 #define numa_node_id()		(cpu_to_node(_smp_processor_id()))
 
-#ifndef CONFIG_DISCONTIGMEM
-
-extern struct pglist_data contig_page_data;
+#ifndef CONFIG_NUMA
 #define NODE_DATA(nid)		(&contig_page_data)
+extern struct pglist_data contig_page_data;
+#endif
+
+#ifdef CONFIG_FLATMEM
+
 #define NODE_MEM_MAP(nid)	mem_map
 #define MAX_NODES_SHIFT		1
 #define pfn_to_nid(pfn)		(0)
 
-#else /* CONFIG_DISCONTIGMEM */
+#else /* !CONFIG_FLATMEM */
 
 #include <asm/mmzone.h>
 
+#endif /* CONFIG_FLATMEM */
+
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
  * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
  */
-#define MAX_NODES_SHIFT		6
+#define FLAGS_RESERVED		10
+
 #elif BITS_PER_LONG == 64
 /*
  * with 64 bit flags field, there's plenty of room.
  */
-#define MAX_NODES_SHIFT		10
+#define FLAGS_RESERVED		32
+
+#else
+
+#error BITS_PER_LONG not defined
+
 #endif
 
-#endif /* !CONFIG_DISCONTIGMEM */
+#ifdef CONFIG_SPARSEMEM
+
+/*
+ * SECTIONS_SHIFT		#bits space required to store a section #
+ *
+ * PA_SECTION_SHIFT		physical address to/from section number
+ * PFN_SECTION_SHIFT		pfn to/from section number
+ */
+#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
 
-#if NODES_SHIFT > MAX_NODES_SHIFT
-#error NODES_SHIFT > MAX_NODES_SHIFT
+#define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
+#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
+
+#define NR_MEM_SECTIONS	(1 << SECTIONS_SHIFT)
+
+#define PAGES_PER_SECTION       (1 << PFN_SECTION_SHIFT)
+#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
+
+#if MAX_ORDER > SECTION_SIZE_BITS
+#error MAX_ORDER exceeds SECTION_SIZE_BITS
 #endif
 
-/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
-#define MAX_ZONES_SHIFT		2
+struct page;
+struct mem_section {
+	/*
+	 * This is, logically, a pointer to an array of struct
+	 * pages.  However, it is stored with some other magic.
+	 * (see sparse.c::sparse_init_one_section())
+	 *
+	 * Making it a UL at least makes someone do a cast
+	 * before using it wrong.
+	 */
+	unsigned long section_mem_map;
+};
+
+extern struct mem_section mem_section[NR_MEM_SECTIONS];
 
-#if ZONES_SHIFT > MAX_ZONES_SHIFT
-#error ZONES_SHIFT > MAX_ZONES_SHIFT
+/*
+ * We use the lower bits of the mem_map pointer to store
+ * a little bit of information.  There should be at least
+ * 3 bits here due to 32-bit alignment.
+ */
+#define	SECTION_MARKED_PRESENT	(1UL<<0)
+#define SECTION_HAS_MEM_MAP	(1UL<<1)
+#define SECTION_MAP_LAST_BIT	(1UL<<2)
+#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
+
+static inline struct page *__section_mem_map_addr(struct mem_section *section)
+{
+	unsigned long map = section->section_mem_map;
+	map &= SECTION_MAP_MASK;
+	return (struct page *)map;
+}
+
+static inline int valid_section(struct mem_section *section)
+{
+	return (section->section_mem_map & SECTION_MARKED_PRESENT);
+}
+
+static inline int section_has_mem_map(struct mem_section *section)
+{
+	return (section->section_mem_map & SECTION_HAS_MEM_MAP);
+}
+
+/*
+ * Given a kernel address, find the home node of the underlying memory.
+ */
+#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
+
+static inline struct mem_section *__pfn_to_section(unsigned long pfn)
+{
+	return &mem_section[pfn >> PFN_SECTION_SHIFT];
+}
+
+#define pfn_to_page(pfn) 						\
+({ 									\
+	unsigned long __pfn = (pfn);					\
+	__section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn;	\
+})
+#define page_to_pfn(page)						\
+({									\
+	page - __section_mem_map_addr(&mem_section[page_to_section(page)]);	\
+})
+
+static inline int pfn_valid(unsigned long pfn)
+{
+	if ((pfn >> PFN_SECTION_SHIFT) >= NR_MEM_SECTIONS)
+		return 0;
+	return valid_section(&mem_section[pfn >> PFN_SECTION_SHIFT]);
+}
+
+/*
+ * APW/XXX: these are _only_ used during initialisation, therefore they
+ * can use __initdata ... they should have names to indicate this
+ * restriction.
+ */
+#ifdef CONFIG_NUMA
+#define pfn_to_nid		early_pfn_to_nid
+#else
+#define pfn_to_nid(pfn) 0
+#define early_pfn_to_nid(pfn)	0
 #endif
 
+#define pfn_to_pgdat(pfn)						\
+({									\
+	NODE_DATA(pfn_to_nid(pfn));					\
+})
+
+#define early_pfn_valid(pfn)	pfn_valid(pfn)
+void sparse_init(void);
+
+#else
+
+#define sparse_init()	do {} while (0)
+
+#endif /* CONFIG_SPARSEMEM */
+
+#ifndef early_pfn_valid
+#define early_pfn_valid(pfn)	(1)
+#endif
+
+unsigned long memory_present(int nid, unsigned long start, unsigned long end);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
--- sparse/include/linux/numa.h~B-sparse-150-sparsemem	2005-02-17 15:47:45.000000000 -0800
+++ /include/linux/numa.h	2005-02-17 15:47:45.000000000 -0800
@@ -3,7 +3,7 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_FLATMEM
 #include <asm/numnodes.h>
 #endif
 
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /mm/Kconfig	2005-02-17 15:48:56.000000000 -0800
@@ -0,0 +1,19 @@
+choice
+	prompt "Memory model"
+	default SPARSEMEM if ARCH_SPARSEMEM_DEFAULT
+	default FLATMEM
+
+config DISCONTIGMEM
+	bool "Discontigious Memory"
+	depends on !ARCH_DISCONTIGMEM_DISABLE
+
+config SPARSEMEM
+	bool "Sparse Memory"
+	depends on !ARCH_SPARSEMEM_DISABLE
+
+config FLATMEM
+	bool "Flat Memory"
+
+endchoice
+
+
--- sparse/mm/Makefile~B-sparse-150-sparsemem	2005-02-17 15:47:45.000000000 -0800
+++ /mm/Makefile	2005-02-17 15:48:56.000000000 -0800
@@ -15,6 +15,7 @@
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
+obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 
--- sparse/mm/bootmem.c~B-sparse-150-sparsemem	2005-02-17 15:47:45.000000000 -0800
+++ /mm/bootmem.c	2005-02-17 15:47:49.000000000 -0800
@@ -256,6 +256,7 @@
 static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 {
 	struct page *page;
+	unsigned long pfn;
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long i, count, total = 0;
 	unsigned long idx;
@@ -266,15 +267,29 @@
 
 	count = 0;
 	/* first extant page of the node */
-	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+	pfn = bdata->node_boot_start >> PAGE_SHIFT;
 	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	map = bdata->node_bootmem_map;
 	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
 	if (bdata->node_boot_start == 0 ||
 	    ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
 		gofast = 1;
+
+	/*
+	 * APW/XXX: we are making an assumption that our node_boot_start
+	 * is aligned to BITS_PER_LONG ... is this valid/enforced.
+	 */
+	/*
+	 * Make use of the guarantee that *_mem_map will be
+	 * contiguous in sections aligned at MAX_ORDER.
+	 */
+	page = pfn_to_page(pfn);
 	for (i = 0; i < idx; ) {
 		unsigned long v = ~map[i / BITS_PER_LONG];
+
+		if ((pfn & ((1 << MAX_ORDER) - 1)) == 0)
+			page = pfn_to_page(pfn);
+
 		if (gofast && v == ~0UL) {
 			int j, order;
 
@@ -304,6 +319,7 @@
 			i+=BITS_PER_LONG;
 			page += BITS_PER_LONG;
 		}
+		pfn += BITS_PER_LONG;
 	}
 	total += count;
 
--- sparse/mm/memory.c~B-sparse-150-sparsemem	2005-02-17 15:47:45.000000000 -0800
+++ /mm/memory.c	2005-02-17 15:47:45.000000000 -0800
@@ -59,7 +59,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
 struct page *mem_map;
--- sparse/mm/page_alloc.c~A1-pcp_zone_init	2005-02-17 15:47:38.000000000 -0800
+++ /mm/page_alloc.c	2005-02-17 15:48:56.000000000 -0800
@@ -61,7 +61,7 @@
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -191,6 +191,35 @@
 }
 
 /*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy1) and the combined O(n+1) page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ *     B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (buddy1) is #8, its order
+ * 1 buddy is #10:
+ *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ *     P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+static inline struct page *__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
+{
+	unsigned long buddy_idx = page_idx ^ (1 << order);
+
+	return page + (buddy_idx - page_idx);
+}
+
+static inline unsigned long __find_combined_index(unsigned long page_idx, unsigned int order)
+{
+	return (page_idx & ~(1 << order));
+}
+
+/*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
  * (a) the buddy is free &&
@@ -233,44 +262,43 @@
  * -- wli
  */
 
-static inline void __free_pages_bulk (struct page *page, struct page *base,
+static inline void __free_pages_bulk (struct page *page,
 		struct zone *zone, unsigned int order)
 {
 	unsigned long page_idx;
-	struct page *coalesced;
 	int order_size = 1 << order;
 
 	if (unlikely(order))
 		destroy_compound_page(page, order);
 
-	page_idx = page - base;
+	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
 	BUG_ON(page_idx & (order_size - 1));
 	BUG_ON(bad_range(zone, page));
 
 	zone->free_pages += order_size;
 	while (order < MAX_ORDER-1) {
+		unsigned long combined_idx;
 		struct free_area *area;
 		struct page *buddy;
-		int buddy_idx;
 
-		buddy_idx = (page_idx ^ (1 << order));
-		buddy = base + buddy_idx;
+		combined_idx = __find_combined_index(page_idx, order);
+		buddy = __page_find_buddy(page, page_idx, order);
+
 		if (bad_range(zone, buddy))
 			break;
 		if (!page_is_buddy(buddy, order))
-			break;
-		/* Move the buddy up one level. */
+			break;		/* Move the buddy up one level. */
 		list_del(&buddy->lru);
 		area = zone->free_area + order;
 		area->nr_free--;
 		rmv_page_order(buddy);
-		page_idx &= buddy_idx;
+		page = page + (combined_idx - page_idx);
+		page_idx = combined_idx;
 		order++;
 	}
-	coalesced = base + page_idx;
-	set_page_order(coalesced, order);
-	list_add(&coalesced->lru, &zone->free_area[order].free_list);
+	set_page_order(page, order);
+	list_add(&page->lru, &zone->free_area[order].free_list);
 	zone->free_area[order].nr_free++;
 }
 
@@ -309,10 +337,9 @@
 		struct list_head *list, unsigned int order)
 {
 	unsigned long flags;
-	struct page *base, *page = NULL;
+	struct page *page = NULL;
 	int ret = 0;
 
-	base = zone->zone_mem_map;
 	spin_lock_irqsave(&zone->lock, flags);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
@@ -320,7 +347,7 @@
 		page = list_entry(list->prev, struct page, lru);
 		/* have to delete it as __free_pages_bulk list manipulates */
 		list_del(&page->lru);
-		__free_pages_bulk(page, base, zone, order);
+		__free_pages_bulk(page, zone, order);
 		ret++;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -1370,7 +1397,6 @@
 	/* initialize zonelists */
 	for (i = 0; i < GFP_ZONETYPES; i++) {
 		zonelist = pgdat->node_zonelists + i;
-		memset(zonelist, 0, sizeof(*zonelist));
 		zonelist->zones[0] = NULL;
 	}
 
@@ -1417,7 +1443,6 @@
 		struct zonelist *zonelist;
 
 		zonelist = pgdat->node_zonelists + i;
-		memset(zonelist, 0, sizeof(*zonelist));
 
 		j = 0;
 		k = ZONE_NORMAL;
@@ -1532,11 +1557,20 @@
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
-	struct page *start = pfn_to_page(start_pfn);
 	struct page *page;
+	int pfn;
 
-	for (page = start; page < (start + size); page++) {
-		set_page_zone(page, NODEZONE(nid, zone));
+	/*
+	 * Make use of the guarantee that *_mem_map will be
+	 * contiguous in sections aligned at MAX_ORDER.
+	 */
+	page = pfn_to_page(start_pfn);
+	for (pfn = start_pfn; pfn < (start_pfn + size); pfn++, page++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		if ((pfn & ((1 << MAX_ORDER) - 1)) == 0)
+			page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 0);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
@@ -1560,11 +1594,106 @@
 	}
 }
 
+#define ZONETABLE_INDEX(x, zone_nr)	(((x) << ZONES_SHIFT) | (zone_nr))
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn >> PFN_SECTION_SHIFT;
+	unsigned long end = (pfn + size) >> PFN_SECTION_SHIFT;
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+	unsigned long batch;
+	int cpu;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		struct per_cpu_pages *pcp;
+
+		pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+		pcp->count = 0;
+		pcp->low = 2 * batch;
+		pcp->high = 6 * batch;
+		pcp->batch = 1 * batch;
+		INIT_LIST_HEAD(&pcp->list);
+
+		pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+		pcp->count = 0;
+		pcp->low = 0;
+		pcp->high = 2 * batch;
+		pcp->batch = 1 * batch;
+		INIT_LIST_HEAD(&pcp->list);
+	}
+	printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+			zone->name, zone->present_pages, batch);
+}
+
+static __devinit void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int table_size_bytes;
+	int i;
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits =
+		wait_table_bits(zone->wait_table_size);
+	table_size_bytes = zone->wait_table_size * sizeof(wait_queue_head_t);
+	if (system_state < SYSTEM_RUNNING)
+		zone->wait_table = alloc_bootmem_node(zone->zone_pgdat,
+						      table_size_bytes);
+	else
+		zone->wait_table = kmalloc(table_size_bytes, GFP_KERNEL);
+
+	for(i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+static void init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size)
+{
+	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nid = pgdat->node_id;
+
+	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+	zone->zone_start_pfn = zone_start_pfn;
+
+	if ((zone_start_pfn) & (zone_required_alignment-1))
+		printk("BUG: wrong zone alignment, it will crash\n");
+
+	memmap_init(size, nid, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+	pgdat->nr_zones++;
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -1574,9 +1703,8 @@
 static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
-	unsigned long i, j;
-	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-	int cpu, nid = pgdat->node_id;
+	unsigned long j;
+	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
 	pgdat->nr_zones = 0;
@@ -1586,9 +1714,7 @@
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
-		unsigned long batch;
 
-		zone_table[NODEZONE(nid, j)] = zone;
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
@@ -1606,40 +1732,7 @@
 		zone->free_pages = 0;
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone.  But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is.  So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
-			pcp->count = 0;
-			pcp->low = 2 * batch;
-			pcp->high = 6 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
-			pcp->count = 0;
-			pcp->low = 0;
-			pcp->high = 2 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-		}
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-				zone_names[j], realsize, batch);
+		zone_pcp_init(zone);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		zone->nr_scan_active = 0;
@@ -1649,44 +1742,40 @@
 		if (!size)
 			continue;
 
-		/*
-		 * The per-page waitqueue mechanism uses hashed waitqueues
-		 * per zone.
-		 */
-		zone->wait_table_size = wait_table_size(size);
-		zone->wait_table_bits =
-			wait_table_bits(zone->wait_table_size);
-		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, zone->wait_table_size
-						* sizeof(wait_queue_head_t));
-
-		for(i = 0; i < zone->wait_table_size; ++i)
-			init_waitqueue_head(zone->wait_table + i);
-
-		pgdat->nr_zones = j+1;
-
-		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-		zone->zone_start_pfn = zone_start_pfn;
-
-		if ((zone_start_pfn) & (zone_required_alignment-1))
-			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
 		memmap_init(size, nid, j, zone_start_pfn);
 
-		zone_start_pfn += size;
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
 
-		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+		zone_wait_table_init(zone, size);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
+		zone_start_pfn += size;
 	}
 }
 
-void __init node_alloc_mem_map(struct pglist_data *pgdat)
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
 	unsigned long size;
+	struct page *map;
+
+	/*
+	 * Make sure that the architecture hasn't already allocated
+	 * a node_mem_map, and that the node contains memory.
+	 */
+	if (pgdat->node_mem_map || !pgdat->node_spanned_pages)
+		return;
 
 	size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-	pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
-#ifndef CONFIG_DISCONTIGMEM
-	mem_map = contig_page_data.node_mem_map;
+	map = alloc_remap(pgdat->node_id, size);
+	if (!map)
+		map = alloc_bootmem_node(pgdat, size);
+	pgdat->node_mem_map = map;
+
+#ifdef CONFIG_FLATMEM
+	/*
+	 * With FLATMEM, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0))
+		mem_map = NODE_DATA(0)->node_mem_map;
 #endif
 }
 
@@ -1698,8 +1787,7 @@
 	pgdat->node_start_pfn = node_start_pfn;
 	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
 
-	if (!pfn_to_page(node_start_pfn))
-		node_alloc_mem_map(pgdat);
+	alloc_node_mem_map(pgdat);
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /mm/sparse.c	2005-02-17 15:47:46.000000000 -0800
@@ -0,0 +1,115 @@
+/*
+ * Non-linear memory mappings.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+
+/*
+ * Permanent non-linear data:
+ *
+ * 1) mem_section	- memory sections, mem_map's for valid memory
+ */
+struct mem_section mem_section[NR_MEM_SECTIONS];
+EXPORT_SYMBOL(mem_section);
+
+/* Record a memory area against a node. */
+unsigned long memory_present(int nid, unsigned long start, unsigned long end)
+{
+	unsigned long pfn = start;
+	unsigned long size = 0;
+
+	start &= PAGE_SECTION_MASK;
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+		int section = pfn >> PFN_SECTION_SHIFT;
+		if (!mem_section[section].section_mem_map) {
+			mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
+			size += (PAGES_PER_SECTION * sizeof (struct page));
+		}
+	}
+
+	return size;
+}
+
+/*
+ * Subtle, we encode the real pfn into the mem_map such that
+ * the identity pfn - section_mem_map will return the actual
+ * physical page frame number.
+ */
+static unsigned long sparse_encode_mem_map(struct page *mem_map, int pnum)
+{
+	return (unsigned long)(mem_map - (pnum << PFN_SECTION_SHIFT));
+}
+
+static __attribute__((unused))
+struct page *sparse_decode_mem_map(unsigned long coded_mem_map, int pnum)
+{
+	return ((struct page *)coded_mem_map) + (pnum << PFN_SECTION_SHIFT);
+}
+
+static int sparse_init_one_section(struct mem_section *ms, int pnum, struct page *mem_map)
+{
+	if (!valid_section(ms))
+		return -EINVAL;
+
+	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+
+	return 1;
+}
+
+static struct page *sparse_early_mem_map_alloc(int pnum)
+{
+	struct page *map;
+	int nid = early_pfn_to_nid(pnum << PFN_SECTION_SHIFT);
+
+	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+	if (map)
+		return map;
+
+	map = alloc_bootmem_node(NODE_DATA(nid),
+			sizeof(struct page) * PAGES_PER_SECTION);
+	if (map)
+		return map;
+
+	printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+	mem_section[pnum].section_mem_map = 0;
+	return NULL;
+}
+
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void sparse_init(void)
+{
+	int pnum;
+	struct page *map;
+
+	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+		if (!valid_section_nr(pnum))
+			continue;
+
+		map = sparse_early_mem_map_alloc(pnum);
+		if (map)
+			sparse_init_one_section(&mem_section[pnum], pnum, map);
+	}
+}
+
+/*
+ * returns the number of sections whose mem_maps were properly
+ * set.  If this is zero, then that means that the passed-in
+ * map was not consumed and must be freed.
+ */
+int sparse_add_one_section(int phys_start_pfn, int nr_pages, struct page *map)
+{
+	struct mem_section *ms = __pfn_to_section(phys_start_pfn);
+
+	if (ms->section_mem_map & SECTION_MARKED_PRESENT)
+		return -EEXIST;
+
+	ms->section_mem_map |= SECTION_MARKED_PRESENT;
+
+	return sparse_init_one_section(ms, phys_start_pfn >> PFN_SECTION_SHIFT, map);
+}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH] Memory Hotplug
  2005-02-18  0:03 [RFC][PATCH] Sparse Memory Handling (hot-add foundation) Dave Hansen
@ 2005-02-18  0:05 ` Dave Hansen
  2005-02-18 21:52   ` Rik van Riel
  2005-02-18  5:16 ` [RFC][PATCH] Sparse Memory Handling (hot-add foundation) Mike Kravetz
  2005-02-18 10:04 ` Andi Kleen
  2 siblings, 1 reply; 9+ messages in thread
From: Dave Hansen @ 2005-02-18  0:05 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: lhms, linux-mm, Andy Whitcroft

[-- Attachment #1: Type: text/plain, Size: 1398 bytes --]

The attached patch is a prototype implementation of memory hot-add.  It
allows you to boot your system and add memory to it later.  Why would
you want to do this?  Well, it's a step toward memory removal, which can
help cope with things like bad RAM.  Hot-add itself is primarily useful
for a machine that you don't want to reboot during a memory upgrade.

For instance, on my 1GB laptop, I booted with mem=512M on the kernel
command-line.  Once I had booted, I did the following:

cd /sys/devices/system/memory
echo 0x20000000 > probe
echo 0x30000000 > probe
echo online > memory2/state
echo online > memory3/state

and the last 512MB of my laptop's memory was onlined.  The onlining
operations can occur from an /etc/hotplug script if desired.
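
For example, a minimal script for that last step could look like the
following.  This is only a sketch: it assumes the memoryN/state files and
the "online"/"offline" state strings created by drivers/base/memory.c in
the attached patch, and the script itself is hypothetical rather than part
of the patch:

#!/bin/sh
# Online every memory block the kernel still reports as "offline".
# Sketch only: relies on the sysfs "memory" class from the attached patch.
for state in /sys/devices/system/memory/memory*/state; do
	if [ "$(cat $state)" = "offline" ]; then
		echo online > $state
	fi
done

Each probe/online step operates on one section; if I have read the sysdev
class code right, the section size should also be readable at runtime (in
hex) from /sys/devices/system/memory/block_size_bytes.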

Here's the config file that I used:
http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/configs/config-i386-T41-laptop

The important config options are:
CONFIG_MEMORY_HOTPLUG=y 
CONFIG_SPARSEMEM=y
CONFIG_SIMULATED_MEM_HOTPLUG=y

This patch depends on the previously posted "Sparse Memory Handling
(hot-add foundation)" patch.

There are a number of individual patches (with descriptions) which are
rolled up in the attached patch: all of the files listed after
"G2-no-memory-at-high_memory-ppc64.patch" from this directory:
http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out/

I can post individual patches if anyone would like to comment on them.

-- Dave

[-- Attachment #2: memory-hot-add-2.6.11-rc3.patch --]
[-- Type: text/x-patch, Size: 55181 bytes --]

--- memhotplug/arch/i386/kernel/setup.c~Y2-page_is_ram_hotplug	2005-02-17 15:51:10.000000000 -0800
+++ /arch/i386/kernel/setup.c	2005-02-17 15:51:10.000000000 -0800
@@ -118,6 +118,7 @@
 struct edid_info edid_info;
 struct ist_info ist_info;
 struct e820map e820;
+struct e820map bios_e820;
 
 unsigned char aux_device_present;
 
@@ -1417,6 +1418,7 @@
 	else {
 		printk(KERN_INFO "BIOS-provided physical RAM map:\n");
 		print_memory_map(machine_specific_memory_setup());
+		bios_e820 = e820;
 	}
 
 	copy_edd();
--- memhotplug/arch/i386/mm/init.c~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /arch/i386/mm/init.c	2005-02-17 15:51:10.000000000 -0800
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/efi.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -191,38 +192,42 @@
 
 extern int is_available_memory(efi_memory_desc_t *);
 
-int page_is_ram(unsigned long pagenr)
+static int page_is_ram_efi(unsigned long pagenr)
 {
+#ifdef CONFIG_EFI
 	int i;
 	unsigned long addr, end;
+	efi_memory_desc_t *md;
 
-	if (efi_enabled) {
-		efi_memory_desc_t *md;
-
-		for (i = 0; i < memmap.nr_map; i++) {
-			md = &memmap.map[i];
-			if (!is_available_memory(md))
-				continue;
-			addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-			end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
-
-			if ((pagenr >= addr) && (pagenr < end))
-				return 1;
-		}
-		return 0;
+	for (i = 0; i < memmap.nr_map; i++) {
+		md = &memmap.map[i];
+		if (!is_available_memory(md))
+			continue;
+		addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+		end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
+		if ((pagenr >= addr) && (pagenr < end))
+			return 1;
 	}
+#endif /* CONFIG_EFI */
+	return 0;
+}
 
-	for (i = 0; i < e820.nr_map; i++) {
+int page_is_ram_e820(unsigned long pagenr, struct e820map *local_e820)
+{
+	int i;
+	unsigned long addr, end;
+
+	for (i = 0; i < local_e820->nr_map; i++) {
 
-		if (e820.map[i].type != E820_RAM)	/* not usable memory */
+		if (local_e820->map[i].type != E820_RAM) /* not usable memory */
 			continue;
 		/*
 		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
 		 *	are not. Notably the 640->1Mb area. We need a sanity
 		 *	check here.
 		 */
-		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
+		addr = (local_e820->map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+		end = (local_e820->map[i].addr+local_e820->map[i].size) >> PAGE_SHIFT;
 		if  ((pagenr >= addr) && (pagenr < end))
 			return 1;
 	}
@@ -543,6 +548,7 @@
 	int tmp;
 	int bad_ppro;
 
+
 #ifdef CONFIG_FLATMEM
 	if (!mem_map)
 		BUG();
@@ -617,6 +623,104 @@
 #endif
 }
 
+int add_one_highpage(struct page *page, int pfn, int bad_ppro)
+{
+	/*
+	 * there's no page_is_ram() check because that only covers ram
+	 * from boot-time.  We learned about this ram later
+	 */
+	if ( !(bad_ppro && page_kills_ppro(pfn))) {
+		set_bit(PG_highmem, &page->flags);
+		set_page_count(page, 1);
+		__free_page(page);
+		totalhigh_pages++;
+	} else {
+		SetPageReserved(page);
+		BUG(); /* for debugging.  remove later */
+	}
+	totalram_pages++;
+#ifdef CONFIG_FLATMEM
+	max_mapnr++;
+#endif
+	num_physpages++;
+	return 0;
+}
+
+
+/*
+ * Not currently handling the NUMA case.
+ * Assuming single node and all memory that
+ * has been added dynamically that would be
+ * onlined here is in HIGHMEM
+ */
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	add_one_highpage(page, page_to_pfn(page), 0);
+}
+
+/*
+ * this is for the non-NUMA, single node SMP system case.
+ * Specifically, in the case of x86, we will always add
+ * memory to the highmem for now.
+ */
+#ifndef CONFIG_NUMA
+int add_memory(u64 start, u64 size, unsigned long attr)
+{
+	struct pglist_data *pgdata = &contig_page_data;
+	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	return __add_pages(zone, start_pfn, nr_pages, attr);
+}
+
+int remove_memory(u64 start, u64 size, unsigned long attr)
+{
+	struct zone *zone;
+	unsigned long start_pfn, end_pfn, nr_pages;
+
+	start_pfn = start >> PAGE_SHIFT;
+	nr_pages = size >> PAGE_SHIFT;
+	end_pfn = start_pfn + nr_pages;
+
+	/*
+	 * check to see which zone the page range is in. If
+	 * not in a zone where we allow hotplug (i.e. highmem),
+	 * just fail it right now.
+	 */
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	printk(KERN_DEBUG "%s(): memory will be removed from "
+			"the %s zone\n", __func__, zone->name);
+
+	/*
+	 * not handling removing memory ranges that
+	 * overlap multiple zones yet
+	 */
+	if (zone != page_zone(pfn_to_page(end_pfn-1)))
+		goto overlap;
+
+	/* make sure it is in highmem */
+	if (!is_highmem(zone)) {
+		printk(KERN_DEBUG "%s(): range to be removed must be in highmem!\n",
+			__func__);
+		goto not_highmem;
+	}
+
+	return __remove_pages(zone, start_pfn, nr_pages, attr);
+
+overlap:
+	printk(KERN_DEBUG "%s(): memory range to be removed overlaps "
+		"multiple zones!!!\n", __func__);
+not_highmem:
+	return -EINVAL;
+}
+#endif
+
+
+
 kmem_cache_t *pgd_cache;
 kmem_cache_t *pmd_cache;
 
@@ -697,3 +801,10 @@
 	}
 }
 #endif
+
+int page_is_ram(unsigned long pagenr)
+{
+	if (efi_enabled)
+		return page_is_ram_efi(pagenr);
+	return page_is_ram_e820(pagenr, &e820);
+}
--- memhotplug/arch/ia64/mm/init.c~L2-ia64-hotplug-stubs	2005-02-17 15:51:09.000000000 -0800
+++ /arch/ia64/mm/init.c	2005-02-17 15:51:09.000000000 -0800
@@ -618,3 +618,24 @@
 	ia32_mem_init();
 #endif
 }
+
+#ifdef	CONFIG_MEMORY_HOTPLUG
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	set_page_count(page, 1);
+	__free_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+int add_memory(u64 start, u64 size, unsigned long attr)
+{
+	return -ENOSYS;
+}
+
+int remove_memory(u64 start, u64 size, unsigned long attr)
+{
+	return -ENOSYS;
+}
+#endif
--- memhotplug/arch/ppc64/mm/init.c~I0-nonlinear-types	2005-02-17 15:51:06.000000000 -0800
+++ /arch/ppc64/mm/init.c	2005-02-17 15:51:06.000000000 -0800
@@ -677,7 +677,7 @@
 	zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
 
 	free_area_init_node(0, &contig_page_data, zones_size,
-			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
+			    __pa((void *)PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
 }
 #endif /* CONFIG_NUMA */
 
--- memhotplug/drivers/acpi/Kconfig~Q-ACPI-hotplug-driver	2005-02-17 15:51:10.000000000 -0800
+++ /drivers/acpi/Kconfig	2005-02-17 15:51:10.000000000 -0800
@@ -342,4 +342,11 @@
 	 	This is the ACPI generic container driver which supports
 		ACPI0004, PNP0A05 and PNP0A06 devices
 
+config ACPI_HOTPLUG_MEMORY
+	tristate "Memory Hotplug"
+	depends on ACPI
+	depends on MEMORY_HOTPLUG
+	default m
+	help
+	  This driver adds support for ACPI Memory Hotplug.
 endmenu
--- memhotplug/drivers/acpi/Makefile~Q-ACPI-hotplug-driver	2005-02-17 15:51:10.000000000 -0800
+++ /drivers/acpi/Makefile	2005-02-17 15:51:10.000000000 -0800
@@ -55,3 +55,4 @@
 obj-$(CONFIG_ACPI_IBM)		+= ibm_acpi.o
 obj-$(CONFIG_ACPI_TOSHIBA)	+= toshiba_acpi.o
 obj-$(CONFIG_ACPI_BUS)		+= scan.o motherboard.o
+obj-$(CONFIG_ACPI_HOTPLUG_MEMORY)	+= acpi_memhotplug.o
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /drivers/acpi/acpi_memhotplug.c	2005-02-17 15:51:10.000000000 -0800
@@ -0,0 +1,542 @@
+/*
+ * Copyright (C) 2004 Intel Corporation <naveen.b.s@intel.com>
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *
+ * ACPI based HotPlug driver that supports Memory Hotplug
+ * This driver fields notifications from firmware for memory add
+ * and remove operations and alerts the VM of the affected memory
+ * ranges.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/memory_hotplug.h>
+#include <acpi/acpi_drivers.h>
+
+
+#define ACPI_MEMORY_DEVICE_COMPONENT		0x08000000UL
+#define ACPI_MEMORY_DEVICE_CLASS		"memory"
+#define ACPI_MEMORY_DEVICE_HID			"PNP0C80"
+#define ACPI_MEMORY_DEVICE_DRIVER_NAME		"Hotplug Mem Driver"
+#define ACPI_MEMORY_DEVICE_NAME			"Hotplug Mem Device"
+
+#define _COMPONENT		ACPI_MEMORY_DEVICE_COMPONENT
+
+ACPI_MODULE_NAME		("acpi_memory")
+MODULE_AUTHOR("Naveen B S <naveen.b.s@intel.com>");
+MODULE_DESCRIPTION(ACPI_MEMORY_DEVICE_DRIVER_NAME);
+MODULE_LICENSE("GPL");
+
+/* ACPI _STA method values */
+#define ACPI_MEMORY_STA_PRESENT		(0x00000001UL)
+#define ACPI_MEMORY_STA_ENABLED		(0x00000002UL)
+#define ACPI_MEMORY_STA_FUNCTIONAL	(0x00000008UL)
+
+/* Memory Device States */
+#define MEMORY_INVALID_STATE	0
+#define MEMORY_POWER_ON_STATE	1
+#define MEMORY_POWER_OFF_STATE	2
+
+static int acpi_memory_device_add (struct acpi_device *device);
+static int acpi_memory_device_remove (struct acpi_device *device, int type);
+
+static struct acpi_driver acpi_memory_device_driver = {
+	.name =		ACPI_MEMORY_DEVICE_DRIVER_NAME,
+	.class =	ACPI_MEMORY_DEVICE_CLASS,
+	.ids =		ACPI_MEMORY_DEVICE_HID,
+	.ops =		{
+				.add =		acpi_memory_device_add,
+				.remove =	acpi_memory_device_remove,
+			},
+};
+
+struct acpi_memory_device {
+	acpi_handle handle;
+	unsigned int state;		/* State of the memory device */
+	unsigned short cache_attribute;	/* memory cache attribute */
+	unsigned short read_write_attribute;/* memory read/write attribute */
+	u64 start_addr;	/* Memory Range start physical addr */
+	u64 end_addr;	/* Memory Range end physical addr */
+};
+
+
+static int
+acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
+{
+	acpi_status status;
+	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+	struct acpi_resource *resource = NULL;
+	struct acpi_resource_address64 address64;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_get_device_resources");
+
+	/* Get the range from the _CRS */
+	status = acpi_get_current_resources(mem_device->handle, &buffer);
+	if (ACPI_FAILURE(status))
+		return_VALUE(-EINVAL);
+
+	resource = (struct acpi_resource *) buffer.pointer;
+	status = acpi_resource_to_address64(resource, &address64);
+	if (ACPI_SUCCESS(status)) {
+		if (address64.resource_type == ACPI_MEMORY_RANGE) {
+			/* Populate the structure */
+			mem_device->cache_attribute =
+				address64.attribute.memory.cache_attribute;
+			mem_device->read_write_attribute =
+			address64.attribute.memory.read_write_attribute;
+			mem_device->start_addr = address64.min_address_range;
+			mem_device->end_addr = address64.max_address_range;
+		}
+	}
+
+	acpi_os_free(buffer.pointer);
+	return_VALUE(0);
+}
+
+static int
+acpi_memory_get_device(acpi_handle handle,
+	struct acpi_memory_device **mem_device)
+{
+	acpi_status status;
+	acpi_handle phandle;
+	struct acpi_device *device = NULL;
+	struct acpi_device *pdevice = NULL;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_get_device");
+
+	if (!acpi_bus_get_device(handle, &device) && device)
+		goto end;
+
+	status = acpi_get_parent(handle, &phandle);
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+			"Error in acpi_get_parent\n"));
+		return_VALUE(-EINVAL);
+	}
+
+	/* Get the parent device */
+	status = acpi_bus_get_device(phandle, &pdevice);
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+			"Error in acpi_bus_get_device\n"));
+		return_VALUE(-EINVAL);
+	}
+
+	/*
+	 * Now add the notified device.  This creates the acpi_device
+	 * and invokes .add function
+	 */
+	status = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE);
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+			"Error in acpi_bus_add\n"));
+		return_VALUE(-EINVAL);
+	}
+
+end:
+	*mem_device = acpi_driver_data(device);
+	if (!(*mem_device)) {
+		printk(KERN_ERR "driver data not found\n");
+		return_VALUE(-ENODEV);
+	}
+
+	return_VALUE(0);
+}
+
+static int
+acpi_memory_check_device(struct acpi_memory_device *mem_device)
+{
+	unsigned long current_status;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_check_device");
+
+	/* Get device present/absent information from the _STA */
+	if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->handle, "_STA",
+		NULL, &current_status)))
+		return_VALUE(-ENODEV);
+	/*
+	 * Check for device status. Device should be
+	 * present/enabled/functioning.
+	 */
+	if (!((current_status & ACPI_MEMORY_STA_PRESENT)
+		&& (current_status & ACPI_MEMORY_STA_ENABLED)
+		&& (current_status & ACPI_MEMORY_STA_FUNCTIONAL)))
+		return_VALUE(-ENODEV);
+
+	return_VALUE(0);
+}
+
+static int
+acpi_memory_enable_device(struct acpi_memory_device *mem_device)
+{
+	int result;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_enable_device");
+
+	/* Get the range from the _CRS */
+	result = acpi_memory_get_device_resources(mem_device);
+	if (result) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+			"\nget_device_resources failed\n"));
+		mem_device->state = MEMORY_INVALID_STATE;
+		return result;
+	}
+
+	/*
+	 * Tell the VM there is more memory here...
+	 * Note: Assume that this function returns zero on success
+	 */
+	result = add_memory(mem_device->start_addr,
+			(mem_device->end_addr - mem_device->start_addr) + 1,
+			mem_device->read_write_attribute);
+	if (result) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+			"\nadd_memory failed\n"));
+		mem_device->state = MEMORY_INVALID_STATE;
+		return result;
+	}
+
+	return result;
+}
+
+static int
+acpi_memory_powerdown_device(struct acpi_memory_device *mem_device)
+{
+	acpi_status status;
+	struct acpi_object_list	arg_list;
+	union acpi_object arg;
+	unsigned long current_status;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_powerdown_device");
+
+	/* Issue the _EJ0 command */
+	arg_list.count = 1;
+	arg_list.pointer = &arg;
+	arg.type = ACPI_TYPE_INTEGER;
+	arg.integer.value = 1;
+	status = acpi_evaluate_object(mem_device->handle,
+			"_EJ0", &arg_list, NULL);
+	/* Return on _EJ0 failure */
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,"_EJ0 failed.\n"));
+		return_VALUE(-ENODEV);
+	}
+
+	/* Evaluate _STA to check if the device is disabled */
+	status = acpi_evaluate_integer(mem_device->handle, "_STA",
+		NULL, &current_status);
+	if (ACPI_FAILURE(status))
+		return_VALUE(-ENODEV);
+
+	/* Check for device status.  Device should be disabled */
+	if (current_status & ACPI_MEMORY_STA_ENABLED)
+		return_VALUE(-EINVAL);
+
+	return_VALUE(0);
+}
+
+static int
+acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+	int result;
+	u64 start = mem_device->start_addr;
+	u64 len = mem_device->end_addr - start + 1;
+	unsigned long attr = mem_device->read_write_attribute;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_disable_device");
+
+	/*
+	 * Ask the VM to offline this memory range.
+	 * Note: Assume that this function returns zero on success
+	 */
+	result = remove_memory(start, len, attr);
+	if (result) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
+		return_VALUE(result);
+	}
+
+	/* Power-off and eject the device */
+	result = acpi_memory_powerdown_device(mem_device);
+	if (result) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+					"Device Power Down failed.\n"));
+		/* Set the status of the device to invalid */
+		mem_device->state = MEMORY_INVALID_STATE;
+		return result;
+	}
+
+	mem_device->state = MEMORY_POWER_OFF_STATE;
+	return result;
+}
+
+static void
+acpi_memory_device_notify(acpi_handle handle, u32 event, void *data)
+{
+	struct acpi_memory_device *mem_device;
+	struct acpi_device *device;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_device_notify");
+
+	switch (event) {
+	case ACPI_NOTIFY_BUS_CHECK:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"\nReceived BUS CHECK notification for device\n"));
+		/* Fall Through */
+	case ACPI_NOTIFY_DEVICE_CHECK:
+		if (event == ACPI_NOTIFY_DEVICE_CHECK)
+			ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"\nReceived DEVICE CHECK notification for device\n"));
+		if (acpi_memory_get_device(handle, &mem_device)) {
+			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+				"Error in finding driver data\n"));
+			return_VOID;
+		}
+
+		if (!acpi_memory_check_device(mem_device)) {
+			if (acpi_memory_enable_device(mem_device))
+				ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+				"Error in acpi_memory_enable_device\n"));
+		}
+		break;
+	case ACPI_NOTIFY_EJECT_REQUEST:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"\nReceived EJECT REQUEST notification for device\n"));
+
+		if (acpi_bus_get_device(handle, &device)) {
+			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+					"Device doesn't exist\n"));
+			break;
+		}
+		mem_device = acpi_driver_data(device);
+		if (!mem_device) {
+			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+					"Driver Data is NULL\n"));
+			break;
+		}
+
+		/*
+		 * Currently disabling memory device from kernel mode
+		 * TBD: Can also be disabled from user mode scripts
+		 * TBD: Can also be disabled by Callback registration
+		 * 	with generic sysfs driver
+		 */
+		if (acpi_memory_disable_device(mem_device))
+			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+				"Error in acpi_memory_disable_device\n"));
+		/*
+		 * TBD: Invoke acpi_bus_remove to cleanup data structures
+		 */
+		break;
+	default:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"Unsupported event [0x%x]\n", event));
+		break;
+	}
+
+	return_VOID;
+}
+
+static int
+acpi_memory_device_add(struct acpi_device *device)
+{
+	int result;
+	struct acpi_memory_device *mem_device = NULL;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_device_add");
+
+	if (!device)
+		return_VALUE(-EINVAL);
+
+	mem_device = kmalloc(sizeof(struct acpi_memory_device), GFP_KERNEL);
+	if (!mem_device)
+		return_VALUE(-ENOMEM);
+	memset(mem_device, 0, sizeof(struct acpi_memory_device));
+
+	mem_device->handle = device->handle;
+	sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
+	sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
+	acpi_driver_data(device) = mem_device;
+
+	/* Get the range from the _CRS */
+	result = acpi_memory_get_device_resources(mem_device);
+	if (result) {
+		kfree(mem_device);
+		return_VALUE(result);
+	}
+
+	/* Set the device state */
+	mem_device->state = MEMORY_POWER_ON_STATE;
+
+	printk(KERN_INFO "%s \n", acpi_device_name(device));
+
+	return_VALUE(result);
+}
+
+static int
+acpi_memory_device_remove (struct acpi_device *device, int type)
+{
+	struct acpi_memory_device *mem_device = NULL;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_device_remove");
+
+	if (!device || !acpi_driver_data(device))
+		return_VALUE(-EINVAL);
+
+	mem_device = (struct acpi_memory_device *) acpi_driver_data(device);
+	kfree(mem_device);
+
+	return_VALUE(0);
+}
+
+/*
+ * Helper function to check for memory device
+ */
+static acpi_status
+is_memory_device(acpi_handle handle)
+{
+	char *hardware_id;
+	acpi_status status;
+	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+	struct acpi_device_info *info;
+
+	ACPI_FUNCTION_TRACE("is_memory_device");
+
+	status = acpi_get_object_info(handle, &buffer);
+	if (ACPI_FAILURE(status))
+		return_ACPI_STATUS(AE_ERROR);
+
+	info = buffer.pointer;
+	if (!(info->valid & ACPI_VALID_HID)) {
+		acpi_os_free(buffer.pointer);
+		return_ACPI_STATUS(AE_ERROR);
+	}
+
+	hardware_id = info->hardware_id.value;
+	if ((hardware_id == NULL) ||
+		(strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID)))
+		status = AE_ERROR;
+
+	acpi_os_free(buffer.pointer);
+	return_ACPI_STATUS(status);
+}
+
+static acpi_status
+acpi_memory_register_notify_handler (acpi_handle handle,
+	u32 level, void *ctxt, void **retv)
+{
+	acpi_status status;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_register_notify_handler");
+
+	status = is_memory_device(handle);
+	if (ACPI_FAILURE(status))
+		return_ACPI_STATUS(AE_OK);	/* continue */
+
+	status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
+			acpi_memory_device_notify, NULL);
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+			"Error installing notify handler\n"));
+		return_ACPI_STATUS(AE_OK);	/* continue */
+	}
+
+	return_ACPI_STATUS(status);
+}
+
+static acpi_status
+acpi_memory_deregister_notify_handler (acpi_handle handle,
+			       u32 level, void *ctxt, void **retv)
+{
+	acpi_status status;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_deregister_notify_handler");
+
+	status = is_memory_device(handle);
+	if (ACPI_FAILURE(status))
+		return_ACPI_STATUS(AE_OK);	/* continue */
+
+	status = acpi_remove_notify_handler(handle,
+			ACPI_SYSTEM_NOTIFY, acpi_memory_device_notify);
+	if (ACPI_FAILURE(status)) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+				"Error removing notify handler\n"));
+		return_ACPI_STATUS(AE_OK);	/* continue */
+	}
+
+	return_ACPI_STATUS(status);
+}
+
+static int __init
+acpi_memory_device_init (void)
+{
+	int result;
+	acpi_status status;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_device_init");
+
+	result = acpi_bus_register_driver(&acpi_memory_device_driver);
+
+	if (result < 0)
+		return_VALUE(-ENODEV);
+
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+				ACPI_UINT32_MAX,
+				acpi_memory_register_notify_handler,
+				NULL, NULL);
+
+	if (ACPI_FAILURE (status)) {
+		ACPI_DEBUG_PRINT ((ACPI_DB_ERROR, "walk_namespace failed\n"));
+		acpi_bus_unregister_driver(&acpi_memory_device_driver);
+		return_VALUE(-ENODEV);
+	}
+
+	return_VALUE(0);
+}
+
+static void __exit
+acpi_memory_device_exit (void)
+{
+	acpi_status status;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_device_exit");
+
+	/*
+	 * Adding this to un-install notification handlers for all the device
+	 * handles.
+	 */
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+			ACPI_UINT32_MAX,
+			acpi_memory_deregister_notify_handler,
+			NULL, NULL);
+
+	if (ACPI_FAILURE (status))
+		ACPI_DEBUG_PRINT ((ACPI_DB_ERROR, "walk_namespace failed\n"));
+
+	acpi_bus_unregister_driver(&acpi_memory_device_driver);
+
+	return_VOID;
+}
+
+module_init(acpi_memory_device_init);
+module_exit(acpi_memory_device_exit);
+
+
--- memhotplug/drivers/base/Makefile~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /drivers/base/Makefile	2005-02-17 15:51:08.000000000 -0800
@@ -7,6 +7,7 @@
 obj-y			+= power/
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG
--- memhotplug/drivers/base/init.c~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /drivers/base/init.c	2005-02-17 15:51:08.000000000 -0800
@@ -9,6 +9,7 @@
 
 #include <linux/device.h>
 #include <linux/init.h>
+#include <linux/memory.h>
 
 extern int devices_init(void);
 extern int buses_init(void);
@@ -39,5 +40,6 @@
 	platform_bus_init();
 	system_bus_init();
 	cpu_dev_init();
+	memory_dev_init();
 	attribute_container_init();
 }
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /drivers/base/memory.c	2005-02-17 15:51:10.000000000 -0800
@@ -0,0 +1,525 @@
+/*
+ * drivers/base/memory.c - basic Memory class support
+ */
+
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>	/* capable() */
+#include <linux/topology.h>
+#include <linux/device.h>
+#include <linux/memory.h>
+#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define MEMORY_CLASS_NAME	"memory"
+
+struct sysdev_class memory_sysdev_class = {
+	set_kset_name(MEMORY_CLASS_NAME),
+};
+EXPORT_SYMBOL(memory_sysdev_class);
+
+/*
+ * With these ops structures, we can override actions for things
+ * like merging or splitting
+ */
+static int memory_hotplug_filter(struct kset *kset, struct kobject *kobj)
+{
+/*	struct kobj_type *ktype = get_ktype(kobj); */
+	return 1;
+}
+
+static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+{
+	return MEMORY_CLASS_NAME;
+}
+
+static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+			int num_envp, char *buffer, int buffer_size)
+{
+	int retval = 0;
+
+	return retval;
+}
+
+static struct kset_hotplug_ops memory_hotplug_ops = {
+	.filter = memory_hotplug_filter,
+	.name	= memory_hotplug_name,
+	.hotplug	= memory_hotplug,
+};
+
+
+/*
+ * register_memory - Setup a sysfs device for a memory block
+ */
+int
+register_memory(struct memory_block *memory, struct mem_section *section,
+		struct node *root)
+{
+	int error;
+
+	memory->sysdev.cls = &memory_sysdev_class;
+	memory->sysdev.id = __section_nr(section);
+
+	error = sysdev_register(&memory->sysdev);
+
+	if (root && !error)
+		error = sysfs_create_link(&root->sysdev.kobj,
+					  &memory->sysdev.kobj,
+					  kobject_name(&memory->sysdev.kobj));
+
+	return error;
+}
+
+void
+unregister_memory(struct memory_block *memory, struct mem_section *section,
+		struct node *root)
+{
+	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+	BUG_ON(memory->sysdev.id != __section_nr(section));
+
+	sysdev_unregister(&memory->sysdev);
+	if (root)
+		sysfs_remove_link(&root->sysdev.kobj, kobject_name(&memory->sysdev.kobj));
+}
+
+/*
+ * use this as the physical section index that this memsection
+ * uses.
+ */
+
+static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	return sprintf(buf, "%08lx\n", mem->phys_index);
+}
+
+/*
+ * online, offline, going offline, etc.
+ */
+static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	ssize_t len = 0;
+
+	/*
+	 * We can probably put these states in a nice little array
+	 * so that they're not open-coded
+	 */
+	switch (mem->state) {
+		case MEM_ONLINE:
+			len = sprintf(buf, "online\n");
+			break;
+		case MEM_OFFLINE:
+			len = sprintf(buf, "offline\n");
+			break;
+		case MEM_GOING_OFFLINE:
+			len = sprintf(buf, "going-offline\n");
+			break;
+		case MEM_INVALID:
+			len = sprintf(buf, "invalid\n");
+			break;
+		default:
+			len = sprintf(buf, "ERROR\n");
+			break;
+	}
+
+	return len;
+}
+
+#ifdef CONFIG_SPARSEMEM
+/* this can't stay here.  it needs to go into nonlinear.c or something */
+static int
+memory_block_action(struct memory_block *mem, unsigned long action)
+{
+	int i;
+	unsigned long psection;
+	unsigned long start_pfn, start_paddr;
+	struct page *first_page;
+	int ret;
+	int old_state = mem->state;
+
+	/*
+	 * this eventually needs to be a loop so that a memory_block
+	 * can contain more than a single section
+	 */
+	psection = mem->phys_index;	/* pfn_to_section()?? */
+	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+	printk(KERN_DEBUG "%s()\n"
+	       KERN_DEBUG "\tpsection: %ld\n"
+	       KERN_DEBUG "\tfirst_page: %p\n"
+	       KERN_DEBUG "\tphys_index: %08lx\n",
+		__func__, psection, first_page, mem->phys_index);
+	for (i = 0; i < PAGES_PER_SECTION; i++) {
+		if ((action == MEM_ONLINE) && !PageReserved(first_page+i)) {
+			printk(KERN_WARNING "%s: section number %ld page number %d "
+				"not reserved, was it already online? \n",
+				__func__, psection, i);
+			return -EBUSY;
+		}
+	}
+
+	switch (action) {
+		case MEM_ONLINE:
+			start_pfn = page_to_pfn(first_page);
+			ret = online_pages(start_pfn, PAGES_PER_SECTION);
+			break;
+		case MEM_OFFLINE:
+			mem->state = MEM_GOING_OFFLINE;
+			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
+			ret = remove_memory(start_paddr, PAGES_PER_SECTION<<PAGE_SHIFT, 0);
+			printk(KERN_DEBUG "%s(%p, %ld) remove_memory() res: %d\n",
+					__func__, mem, action, ret);
+			if (ret)
+				mem->state = old_state;
+			break;
+		default:
+			printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", __func__,
+				mem, action, action);
+			ret = -EINVAL;
+	}
+
+	return ret;
+}
+#else
+static int
+memory_block_action(struct memory_block *mem, unsigned long action)
+{
+	printk(KERN_WARNING "%s() failed to perform action: %d, SPAARSE is "
+			"compiled out\n", __FUNCTION__, action);
+	return -ENOSYS;
+}
+#endif
+
+/*
+ * These to_state and from_state things really are just state
+ * machine changes.  It might just be better to declare them
+ * all in a table instead of in code like this.
+ */
+static int memory_block_change_state(struct memory_block *mem,
+		unsigned long to_state, unsigned long from_state_req)
+{
+	int ret = 0;
+	down(&mem->state_sem);
+
+	if (mem->state != from_state_req) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = memory_block_action(mem, to_state);
+	if (!ret)
+		mem->state = to_state;
+
+out:
+	up(&mem->state_sem);
+	return ret;
+}
+
+static ssize_t
+store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	unsigned int phys_section_nr = mem->phys_index;
+	int ret = -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!valid_section_nr(phys_section_nr)) {
+		printk(KERN_DEBUG "%s: section (%d) is not valid\n",
+			__func__, phys_section_nr);
+		goto out;
+	}
+
+	if (!strncmp(buf, "online", min((int)count, 6)))
+		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+	else if (!strncmp(buf, "offline", min((int)count, 7)))
+		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+
+out:
+	if (ret)
+		return ret;
+	return count;
+}
+
+/*
+ * phys_device is a bad name for this.  What I really want
+ * is a way to differentiate between memory ranges that
+ * are part of physical devices that constitute
+ * a complete removable unit or fru.
+ * i.e. do these ranges belong to the same physical device,
+ * s.t. if I offline all of these sections I can then
+ * remove the physical device?
+ */
+static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	return sprintf(buf, "%d\n", mem->phys_device);
+}
+
+SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
+SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
+
+#define mem_create_simple_file(mem, attr_name)	\
+	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+#define mem_remove_simple_file(mem, attr_name)	\
+	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+
+/*
+ * Block size attribute stuff
+ */
+
+static ssize_t
+print_block_size(struct class *class, char *buf)
+{
+	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION*PAGE_SIZE);
+}
+
+static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+
+static int block_size_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_block_size_bytes.attr);
+	return 0;
+}
+
+/*
+ * All the probe stuff here
+ */
+
+extern int page_is_hotpluggable_ram(unsigned long pfn);
+/* define this off in some header somewhere ... */
+#ifdef CONFIG_ARCH_MEMORY_PROBE
+static ssize_t
+memory_probe_store(struct class *class, const char *buf, size_t count)
+{
+	u64 phys_addr;
+	unsigned long offset;
+	int ret;
+	/*
+	 * Hmmm... what do we really want this to do?
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	phys_addr = simple_strtoull(buf, NULL, 0);
+
+	/* a hardware check for the ram? */
+	for (offset = 0; offset < PAGES_PER_SECTION; offset++) {
+		unsigned long page_nr = (phys_addr >> PAGE_SHIFT) + offset;
+		if (page_is_hotpluggable_ram(page_nr))
+			break;
+	}
+	if (offset == PAGES_PER_SECTION)
+		return -EINVAL;
+
+	ret = add_memory(phys_addr, (PAGES_PER_SECTION << PAGE_SHIFT), 0);
+
+	if (ret)
+		count = ret;
+
+	return count;
+}
+static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
+
+static int memory_probe_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_probe.attr);
+	return 0;
+}
+#else
+#define memory_probe_init(...) (1)
+#endif
+
+/*
+ * Note that phys_device is optional.  It is here to allow for
+ * differentiation between which *physical* devices each
+ * section belongs to...
+ */
+
+int add_memory_block(unsigned long node_id, struct mem_section *section,
+		     unsigned long state, int phys_device)
+{
+	size_t size = sizeof(struct memory_block);
+	struct memory_block *mem = kmalloc(size, GFP_KERNEL);
+	int ret = 0;
+
+	if (!mem)
+		return -ENOMEM;
+
+	memset(mem, 0, size);
+
+	mem->phys_index = __section_nr(section);
+	mem->state = state;
+	init_MUTEX(&mem->state_sem);
+	mem->phys_device = phys_device;
+
+#if 0
+	/* not yet sure how this can be optimally structured
+	 * to get the fru information from hw/fw specific drivers
+	 */
+	if (mem->callback)
+		callback(mem);
+#endif
+
+	ret = register_memory(mem, section, NULL);
+	if (!ret)
+		ret = mem_create_simple_file(mem, phys_index);
+	if (!ret)
+		ret = mem_create_simple_file(mem, state);
+	if (!ret)
+		ret = mem_create_simple_file(mem, phys_device);
+
+	return ret;
+}
+
+#define online_section(...) 	do {} while(0)
+
+static ssize_t
+online_store(struct class *class, const char *buf, size_t count)
+{
+	unsigned int section = simple_strtoul(buf, NULL, 10);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!valid_section(&mem_section[section])) {
+		printk(KERN_WARNING "memory state store: section %d is "
+				    "not currently mapped\n", section);
+		return -EINVAL;
+	}
+
+	/*
+	 * Crude section based onlining; probably need random
+	 * address onlining...
+	 */
+	online_section(section);
+
+	return count;
+}
+static CLASS_ATTR(online, 0700, NULL, online_store);
+
+static int online_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_online.attr);
+	return 0;
+}
+
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+struct memory_block *find_memory_block(struct mem_section *section)
+{
+	struct kobject *kobj;
+	struct sys_device *sysdev;
+	struct memory_block *mem;
+	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+	/*
+	 * This only works because we know that section == sysdev->id
+	 * slightly redundant with sysdev_register()
+	 */
+	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+	printk(KERN_DEBUG "%s() looking for name: \"%s\"\n", __func__, name);
+
+	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+	if (!kobj)
+		return NULL;
+
+	sysdev = container_of(kobj, struct sys_device, kobj);
+	mem = container_of(sysdev, struct memory_block, sysdev);
+
+	return mem;
+}
+
+int remove_memory_block(unsigned long node_id, struct mem_section *section,
+		int phys_device)
+{
+	struct memory_block *mem;
+
+	mem = find_memory_block(section);
+
+#if 0
+	/* not yet sure how this can be optimally structured
+	 * to get the fru information from hw/fw specific drivers
+	 */
+	if (mem->callback)
+		callback(mem);
+#endif
+
+	mem_remove_simple_file(mem, phys_index);
+	mem_remove_simple_file(mem, state);
+	mem_remove_simple_file(mem, phys_device);
+	unregister_memory(mem, section, NULL);
+
+	return 0;
+}
+
+/*
+ * need an interface for the VM to add new memory regions,
+ * but without onlining it.
+ */
+int register_new_memory(struct mem_section *section)
+{
+	printk(KERN_DEBUG "%s(%p)\n", __func__, section);
+
+	/* need some node info here and some sort of callback .... */
+	return add_memory_block(0, section, MEM_OFFLINE, 0);
+}
+
+int unregister_memory_section(struct mem_section *section)
+{
+	if (!valid_section(section)) {
+		printk(KERN_WARNING "%s: section %d is already invalid\n",
+					__func__, __section_nr(section));
+		return -EINVAL;
+	}
+
+	/* need some node info here and some sort of callback .... */
+	return remove_memory_block(0, section, 0);
+}
+
+/*
+ * Initialize the sysfs support for memory devices...
+ */
+int __init memory_dev_init(void)
+{
+	unsigned int i;
+	int ret;
+
+	memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+	ret = sysdev_class_register(&memory_sysdev_class);
+
+	/*
+	 * Create entries for memory sections that were found
+	 * during boot and have been initialized
+	 */
+	for (i = 0; i < NR_MEM_SECTIONS; i++) {
+		if (!valid_section_nr(i))
+			break;
+		add_memory_block(0, &mem_section[i], MEM_ONLINE, 0);
+	}
+
+	memory_probe_init();
+	block_size_init();
+	online_init();
+
+	return ret;
+}
--- memhotplug/include/asm-i386/highmem.h~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /include/asm-i386/highmem.h	2005-02-17 15:51:08.000000000 -0800
@@ -65,6 +65,7 @@
 
 extern void * FASTCALL(kmap_high(struct page *page));
 extern void FASTCALL(kunmap_high(struct page *page));
+extern void flush_all_zero_pkmaps(void);
 
 void *kmap(struct page *page);
 void kunmap(struct page *page);
--- memhotplug/include/linux/highmem.h~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /include/linux/highmem.h	2005-02-17 15:51:08.000000000 -0800
@@ -29,6 +29,7 @@
 #define kmap_atomic(page, idx)		page_address(page)
 #define kunmap_atomic(addr, idx)	do { } while (0)
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
+#define flush_all_zero_pkmaps()		do { } while (0)
 
 #endif /* CONFIG_HIGHMEM */
 
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /include/linux/memory.h	2005-02-17 15:51:10.000000000 -0800
@@ -0,0 +1,78 @@
+/*
+ * include/linux/memory.h - generic memory definition
+ *
+ * This is mainly for topological representation. We define the
+ * basic "struct memory_block" here, which can be embedded in per-arch
+ * definitions or NUMA information.
+ *
+ * Basic handling of the devices is done in drivers/base/memory.c
+ * and system devices are handled in drivers/base/sys.c.
+ *
+ * Memory blocks are exported via sysfs in the class/memory/devices/
+ * directory.
+ *
+ */
+#ifndef _LINUX_MEMORY_H_
+#define _LINUX_MEMORY_H_
+
+#include <linux/sysdev.h>
+#include <linux/node.h>
+#include <linux/compiler.h>
+
+#include <asm/semaphore.h>
+
+struct memory_block {
+	unsigned long phys_index;
+	unsigned long state;
+	struct semaphore state_sem;
+	int phys_device;		/* to which fru does this belong? */
+	void *hw;			/* optional pointer to fw/hw data */
+	int (*phys_callback)(struct memory_block *);
+	struct sys_device sysdev;
+};
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline int memory_dev_init(void)
+{
+	return 0;
+}
+#else
+extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
+extern int register_new_memory(struct mem_section *);
+extern int unregister_memory_section(struct mem_section *);
+extern int memory_dev_init(void);
+#endif
+
+#ifndef CONFIG_SPARSEMEM
+#define CONFIG_MEM_BLOCK_SIZE	(1<<27)
+#else /* tie this to nonlinear */
+#define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
+#endif
+
+#define CONFIG_ARCH_MEMORY_PROBE 1
+
+#define	MEM_ONLINE		(1<<0)
+#define	MEM_OFFLINE		(1<<1)
+#define	MEM_GOING_OFFLINE	(1<<2)
+#define MEM_INVALID		(1<<3)
+#define MEM_BROKEN		(1<<4)
+
+extern int invalidate_phys_mapping(unsigned long, unsigned long);
+extern int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages);
+struct notifier_block;
+
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+extern struct sysdev_class memory_sysdev_class;
+extern struct semaphore memory_sem;
+
+#define lock_memory_hotplug()	down(&memory_sem)
+#define unlock_memory_hotplug()	up(&memory_sem)
+#define lock_memory_hotplug_interruptible() down_interruptible(&memory_sem)
+#define hot_memory_notifier(fn, pri) {				\
+	static struct notifier_block fn##_nb = { fn, pri };	\
+	register_memory_notifier(&fn##_nb);			\
+}
+
+#endif /* _LINUX_MEMORY_H_ */
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /include/linux/memory_hotplug.h	2005-02-17 15:51:08.000000000 -0800
@@ -0,0 +1,41 @@
+#ifndef __MEMORY_HOTPLUG_H
+#define __MEMORY_HOTPLUG_H
+
+extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
+extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
+extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+/* need some defines for these for archs that don't support it */
+extern void online_page(struct page *page);
+/* VM interface that may be used by firmware interface */
+extern int add_memory(u64 start, u64 size, unsigned long attr);
+extern int remove_memory(u64 start, u64 size, unsigned long attr);
+extern int online_pages(unsigned long, unsigned long);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* reasonably generic interface to expand the physical pages in a zone  */
+extern int __add_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages, unsigned long attr);
+extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages, unsigned long attr);
+#else
+static inline int mhp_notimplemented(const char *func)
+{
+	printk(KERN_WARNING "%s() called with CONFIG_MEMORY_HOTPLUG disabled\n", func);
+	dump_stack();
+	return -ENOSYS;
+}
+
+static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages, unsigned long attr)
+{
+	return mhp_notimplemented(__FUNCTION__);
+}
+static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages, unsigned long attr)
+{
+	return mhp_notimplemented(__FUNCTION__);
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#endif
--- memhotplug/include/linux/mmzone.h~J-zone_resize_sem	2005-02-17 15:51:08.000000000 -0800
+++ /include/linux/mmzone.h	2005-02-17 15:51:09.000000000 -0800
@@ -12,6 +12,7 @@
 #include <linux/threads.h>
 #include <linux/numa.h>
 #include <asm/atomic.h>
+#include <asm/semaphore.h>
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_FORCE_MAX_ZONEORDER
@@ -204,6 +205,7 @@
 
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
+	struct semaphore	resize_sem;
 
 	/*
 	 * rarely used fields:
@@ -277,6 +279,7 @@
 void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		int alloc_type, int can_try_harder, int gfp_high);
+void setup_per_zone_pages_min(void);
 
 /*
  * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
@@ -472,16 +475,25 @@
 	return (section->section_mem_map & SECTION_HAS_MEM_MAP);
 }
 
+static inline int valid_section_nr(int nr)
+{
+	return valid_section(&mem_section[nr]);
+}
+
 /*
  * Given a kernel address, find the home node of the underlying memory.
  */
 #define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
-
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
 	return &mem_section[pfn >> PFN_SECTION_SHIFT];
 }
 
+static inline int __section_nr(struct mem_section* ms)
+{
+	return ms - &mem_section[0];
+}
+
 #define pfn_to_page(pfn) 						\
 ({ 									\
 	unsigned long __pfn = (pfn);					\
--- memhotplug/mm/Kconfig~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /mm/Kconfig	2005-02-17 15:51:10.000000000 -0800
@@ -1,3 +1,7 @@
+config MEMORY_HOTPLUG
+	bool "Allow for memory hot-add"
+	depends on SPARSEMEM && HOTPLUG
+
 choice
 	prompt "Memory model"
 	default SPARSEMEM if ARCH_SPARSEMEM_DEFAULT
@@ -16,4 +20,7 @@
 
 endchoice
 
+config SIMULATED_MEM_HOTPLUG
+	bool "Simulate memory hotplug on non-hotplug hardware"
+	depends on X86 && !X86_64
 
--- memhotplug/mm/Makefile~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /mm/Makefile	2005-02-17 15:51:08.000000000 -0800
@@ -18,4 +18,5 @@
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 
--- memhotplug/mm/highmem.c~L1-sysfs-memory-class	2005-02-17 15:51:08.000000000 -0800
+++ /mm/highmem.c	2005-02-17 15:51:08.000000000 -0800
@@ -59,7 +59,7 @@
 
 static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 
-static void flush_all_zero_pkmaps(void)
+void flush_all_zero_pkmaps(void)
 {
 	int i;
 
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ /mm/memory_hotplug.c	2005-02-17 15:51:10.000000000 -0800
@@ -0,0 +1,222 @@
+/*
+ *  linux/mm/memory_hotplug.c
+ *
+ *  Copyright (C)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
+
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+	struct page *page, *ret;
+	unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+	page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+	if (page)
+		goto got_map_page;
+
+	ret = vmalloc(memmap_size);
+	if (ret)
+		goto got_map_ptr;
+
+	return NULL;
+got_map_page:
+	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+	memset(ret, 0, memmap_size);
+
+	return ret;
+}
+
+extern int sparse_add_one_section(int, int, struct page *); /* FIXME: move to a header */
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, unsigned long size);
+int __add_section(struct zone *zone, unsigned long phys_start_pfn,
+		  unsigned long attr)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	struct page *memmap;
+	int zone_type;
+	int nid = 0;
+	int ret;
+
+	printk(KERN_DEBUG "%s(%p, %08lx, %08lx)\n", __func__, zone,
+		phys_start_pfn, attr);
+
+	/*
+	 * don't check this for failure because it is possible that the
+	 * section already has a mem_map.  The sparse code will fix this up
+	 */
+	memmap = __kmalloc_section_memmap(nr_pages);
+
+	down(&zone->resize_sem);
+
+	printk(KERN_DEBUG "%s() phys_start_pfn: %08lx\n", __func__, phys_start_pfn);
+	ret = sparse_add_one_section(phys_start_pfn, nr_pages, memmap);
+
+	if (ret <= 0) {
+		/* the mem_map didn't get used */
+		if (memmap >= (struct page *)VMALLOC_START &&
+		    memmap < (struct page *)VMALLOC_END)
+			vfree(memmap);
+		else
+			free_pages((unsigned long)memmap,
+				   get_order(sizeof(struct page) * nr_pages));
+	}
+
+	if (zone->zone_start_pfn > phys_start_pfn) {
+		zone->spanned_pages += zone->zone_start_pfn - phys_start_pfn;
+		zone->zone_start_pfn = phys_start_pfn;
+	}
+	if (phys_start_pfn + nr_pages > zone->zone_start_pfn + zone->spanned_pages) {
+		zone->spanned_pages = (phys_start_pfn + nr_pages) -
+					zone->zone_start_pfn;
+	}
+
+	hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
+	up(&zone->resize_sem);
+
+	if (ret < 0) {
+		printk(KERN_WARNING "%s(): error onlining section: %d\n",
+			__func__, ret);
+		return ret;
+	}
+
+	zone_type = zone - pgdat->node_zones;
+	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+
+	/*
+	 * Actually, we don't want to online the pages here at all.  We
+	 * will enable the new regions to be available via sysfs and thus
+	 * onlined from user space.
+	 */
+	{
+		struct mem_section *ms = __pfn_to_section(phys_start_pfn);
+		register_new_memory(ms);
+	}
+
+	return 0;
+}
+
+/*
+ * Reasonably generic function for adding memory.  It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+		 unsigned long nr_pages, unsigned long attr)
+{
+	unsigned long i;
+	int err = 0;
+
+	printk(KERN_DEBUG "%s(%p, %08lx, %ld, %08lx)\n", __func__,
+			zone, phys_start_pfn, nr_pages, attr);
+
+	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
+		 printk(KERN_DEBUG "\tfor: i: %ld\n", i);
+		 err = __add_section(zone, phys_start_pfn + i, attr);
+
+		 if (err)
+			break;
+	}
+
+	/*
+	 * Should we back the ones out that succeeded if any part of
+	 * the addition fails?
+	 */
+
+	return err;
+}
+
+#ifdef CONFIG_SIMULATED_MEM_HOTPLUG
+int page_is_hotpluggable_ram(unsigned long pfn)
+{
+	extern struct e820map bios_e820;
+	extern int page_is_ram_e820(unsigned long, struct e820map*);
+
+	return page_is_ram_e820(pfn, &bios_e820);
+}
+#else
+int page_is_hotpluggable_ram(unsigned long pfn)
+{
+	return 1;
+}
+#endif
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+	int i;
+
+	printk(KERN_DEBUG "%s: onlining 0x%lx pages starting from pfn: 0x%lx\n",
+		__func__, nr_pages, pfn);
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		if (page_is_hotpluggable_ram(pfn + i))
+			online_page(page);
+	}
+
+	page_zone(pfn_to_page(pfn))->present_pages += nr_pages;
+
+	setup_per_zone_pages_min();
+
+	return 0;
+}
+
+extern void flush_all_zero_pkmaps(void);
+int __remove_pages(struct zone *zone, unsigned long start_pfn,
+		unsigned long nr_pages, unsigned long attr)
+{
+	int order = get_order(nr_pages<<PAGE_SHIFT);
+	struct mem_section *ms = __pfn_to_section(start_pfn);
+	/*
+	 * for now, only handle 2^x sized areas
+	 */
+	if (nr_pages != 1<<order)
+		return -EINVAL;
+
+#ifdef CONFIG_MEMORY_REMOVE
+	if (capture_page_range(start_pfn, order)) {
+		printk(KERN_WARNING "%s(): failed to capture page range: %ld -> %ld\n",
+				__func__, start_pfn, start_pfn + nr_pages);
+
+		return -EAGAIN;
+	}
+#else
+	return -EINVAL;
+#endif
+
+	unregister_memory_section(ms);
+
+	/*
+	 * Permanent kmaps keep ptes to a page long after a kunmap() to
+	 * keep global tlb flushes to a minimum.  When it flushes, it
+	 * works out a pfn and a struct page from that pte which can be
+	 * long after the page is removed.  Flush before removal.
+	 */
+	flush_all_zero_pkmaps();
+//	invalidate_phys_mapping(start_pfn, nr_pages);
+	ms->section_mem_map &= ~SECTION_MARKED_PRESENT;
+	return 0;
+}
--- memhotplug/mm/page_alloc.c~I0-nonlinear-types	2005-02-17 15:51:06.000000000 -0800
+++ /mm/page_alloc.c	2005-02-17 15:51:10.000000000 -0800
@@ -1296,7 +1296,7 @@
 /*
  * Builds allocation fallback zone lists.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+int __devinit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
 {
 	switch (k) {
 		struct zone *zone;
@@ -1304,7 +1304,12 @@
 		BUG();
 	case ZONE_HIGHMEM:
 		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+		/*
+		 * with mem hotplug we don't increment present_pages
+		 * until the pages are actually freed into the zone,
+		 * but we increment spanned pages much earlier
+		 */
+		if (zone->spanned_pages) {
 #ifndef CONFIG_HIGHMEM
 			BUG();
 #endif
@@ -1312,11 +1317,11 @@
 		}
 	case ZONE_NORMAL:
 		zone = pgdat->node_zones + ZONE_NORMAL;
-		if (zone->present_pages)
+		if (zone->spanned_pages)
 			zonelist->zones[j++] = zone;
 	case ZONE_DMA:
 		zone = pgdat->node_zones + ZONE_DMA;
-		if (zone->present_pages)
+		if (zone->spanned_pages)
 			zonelist->zones[j++] = zone;
 	}
 
@@ -1387,7 +1392,7 @@
 	return best_node;
 }
 
-static void __init build_zonelists(pg_data_t *pgdat)
+void __devinit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 	int prev_node, load;
@@ -1434,7 +1439,7 @@
 
 #else	/* CONFIG_NUMA */
 
-static void __init build_zonelists(pg_data_t *pgdat)
+void __devinit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 
@@ -1554,7 +1559,7 @@
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
 	struct page *page;
@@ -1613,7 +1618,7 @@
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
-static __devinit void zone_pcp_init(struct zone *zone)
+void zone_pcp_init(struct zone *zone)
 {
 	unsigned long batch;
 	int cpu;
@@ -1653,7 +1658,7 @@
 			zone->name, zone->present_pages, batch);
 }
 
-static __devinit void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
 	int table_size_bytes;
 	int i;
@@ -1679,7 +1684,6 @@
 {
 	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	int nid = pgdat->node_id;
 
 	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
 	zone->zone_start_pfn = zone_start_pfn;
@@ -1687,9 +1691,8 @@
 	if ((zone_start_pfn) & (zone_required_alignment-1))
 		printk("BUG: wrong zone alignment, it will crash\n");
 
-	memmap_init(size, nid, zone_idx(zone), zone_start_pfn);
-
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+	zone->spanned_pages = size;
 
 	pgdat->nr_zones++;
 }
@@ -1723,11 +1726,11 @@
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 
-		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
 		spin_lock_init(&zone->lru_lock);
+		init_MUTEX(&zone->resize_sem);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 
@@ -1748,6 +1751,7 @@
 
 		zone_wait_table_init(zone, size);
 		init_currently_empty_zone(zone, zone_start_pfn, size);
+		//memmap_init(size, nid, zone_idx(zone), zone_start_pfn);
 		zone_start_pfn += size;
 	}
 }
@@ -1801,7 +1805,7 @@
 void __init free_area_init(unsigned long *zones_size)
 {
 	free_area_init_node(0, &contig_page_data, zones_size,
-			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+			__pa((void*)PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 #endif
 
@@ -2032,7 +2036,7 @@
  *	that the pages_{min,low,high} values for each zone are set correctly 
  *	with respect to min_free_kbytes.
  */
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
@@ -2241,3 +2245,32 @@
 
 	return table;
 }
+
+static inline int zone_previously_initialized(struct zone *zone)
+{
+	if (zone->wait_table_size)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages)
+{
+	if (zone_previously_initialized(zone))
+		return -EEXIST;
+
+	zone_wait_table_init(zone, PAGES_PER_SECTION);
+	init_currently_empty_zone(zone, phys_start_pfn, PAGES_PER_SECTION);
+	zone_pcp_init(zone);
+
+	/*
+	 * FIXME: there is no locking at all for the zonelists.
+	 * Least impactful (codewise) way to do this is probably
+	 * to freeze all the CPUs for a sec while this is done.
+	 */
+	build_zonelists(zone->zone_pgdat);
+
+	return 0;
+}
+#endif


* Re: [RFC][PATCH] Sparse Memory Handling (hot-add foundation)
  2005-02-18  0:03 [RFC][PATCH] Sparse Memory Handling (hot-add foundation) Dave Hansen
  2005-02-18  0:05 ` [RFC][PATCH] Memory Hotplug Dave Hansen
@ 2005-02-18  5:16 ` Mike Kravetz
  2005-02-18 15:31   ` Dave Hansen
  2005-02-18 10:04 ` Andi Kleen
  2 siblings, 1 reply; 9+ messages in thread
From: Mike Kravetz @ 2005-02-18  5:16 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Linux Kernel Mailing List, lhms, linux-mm, Andy Whitcroft

On Thu, Feb 17, 2005 at 04:03:53PM -0800, Dave Hansen wrote:
> The attached patch

Just tried to compile this and noticed that there is no definition
of valid_section_nr(), referenced in sparse_init().

-- 
Mike


* Re: [RFC][PATCH] Sparse Memory Handling (hot-add foundation)
  2005-02-18  0:03 [RFC][PATCH] Sparse Memory Handling (hot-add foundation) Dave Hansen
  2005-02-18  0:05 ` [RFC][PATCH] Memory Hotplug Dave Hansen
  2005-02-18  5:16 ` [RFC][PATCH] Sparse Memory Handling (hot-add foundation) Mike Kravetz
@ 2005-02-18 10:04 ` Andi Kleen
  2005-02-18 15:42   ` Dave Hansen
  2 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2005-02-18 10:04 UTC (permalink / raw)
  To: Dave Hansen; +Cc: lhms, linux-mm, Andy Whitcroft, linux-kernel

Dave Hansen <haveblue@us.ibm.com> writes:

> The attached patch, largely written by Andy Whitcroft, implements a
> feature which is similar to DISCONTIGMEM, but has some added features.
> Instead of splitting up the mem_map for each NUMA node, this splits it
> up into areas that represent fixed blocks of memory.  This allows
> individual pieces of that memory to be easily added and removed.

[...]

I'm curious - how does this affect .text size for a i386 or x86-64 NUMA
kernel? One area I wanted to improve on x86-64 for a long time was
to shrink the big virt_to_page() etc. inline macros. Your new code
actually looks a bit smaller.

-Andi


* Re: [RFC][PATCH] Sparse Memory Handling (hot-add foundation)
  2005-02-18  5:16 ` [RFC][PATCH] Sparse Memory Handling (hot-add foundation) Mike Kravetz
@ 2005-02-18 15:31   ` Dave Hansen
  0 siblings, 0 replies; 9+ messages in thread
From: Dave Hansen @ 2005-02-18 15:31 UTC (permalink / raw)
  To: Mike Kravetz; +Cc: Linux Kernel Mailing List, lhms, linux-mm, Andy Whitcroft

On Thu, 2005-02-17 at 21:16 -0800, Mike Kravetz wrote:
> On Thu, Feb 17, 2005 at 04:03:53PM -0800, Dave Hansen wrote:
> > The attached patch
> 
> Just tried to compile this and noticed that there is no definition
> of valid_section_nr(),  referenced in sparse_init.

What's your .config?  I didn't actually try it on ppc64, and I may have
missed one of the necessary patches.  I trimmed it down to very near the
minimum set on x86.

-- Dave



* Re: [RFC][PATCH] Sparse Memory Handling (hot-add foundation)
  2005-02-18 10:04 ` Andi Kleen
@ 2005-02-18 15:42   ` Dave Hansen
  0 siblings, 0 replies; 9+ messages in thread
From: Dave Hansen @ 2005-02-18 15:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: lhms, linux-mm, Andy Whitcroft, Linux Kernel Mailing List,
	Matthew E Tolentino

On Fri, 2005-02-18 at 11:04 +0100, Andi Kleen wrote:
> Dave Hansen <haveblue@us.ibm.com> writes:
> 
> > The attached patch, largely written by Andy Whitcroft, implements a
> > feature which is similar to DISCONTIGMEM, but has some added features.
> > Instead of splitting up the mem_map for each NUMA node, this splits it
> > up into areas that represent fixed blocks of memory.  This allows
> > individual pieces of that memory to be easily added and removed.
>
> I'm curious - how does this affect .text size for a i386 or x86-64 NUMA
> kernel? One area I wanted to improve on x86-64 for a long time was
> to shrink the big virt_to_page() etc. inline macros. Your new code
> actually looks a bit smaller.

On x86, it looks like a 3k increase in text size.  I know Matt Tolentino
has been testing it on x86_64, he might have a comparison there for you.

$ size i386-T41-laptop*/vmlinux
   text    data     bss     dec     hex filename
2897131  580592  204252 3681975  382eb7 i386-T41-laptop.sparse/vmlinux
2894166  581832  203228 3679226  3823fa i386-T41-laptop/vmlinux

BTW, this is with PAE on, using 36 bits of physaddr space.

-- Dave



* Re: [RFC][PATCH] Memory Hotplug
  2005-02-18  0:05 ` [RFC][PATCH] Memory Hotplug Dave Hansen
@ 2005-02-18 21:52   ` Rik van Riel
  2005-02-18 22:20     ` Dave Hansen
  0 siblings, 1 reply; 9+ messages in thread
From: Rik van Riel @ 2005-02-18 21:52 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Linux Kernel Mailing List, lhms, linux-mm, Andy Whitcroft

On Thu, 17 Feb 2005, Dave Hansen wrote:

> The attached patch is a prototype implementation of memory hot-add.  It
> allows you to boot your system, and add memory to it later.  Why would
> you want to do this?

I want it so I can grow Xen guests after they have been booted
up.  Being able to hot-add memory is essential for dynamically
resizing the memory of various guest OSes, to readjust them for
the workload.

Memory hot-remove isn't really needed with Xen, the balloon
driver takes care of that.

> I can post individual patches if anyone would like to comment on them.

I'm interested.  I want to get this stuff working with Xen ;)

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


* Re: [RFC][PATCH] Memory Hotplug
  2005-02-18 21:52   ` Rik van Riel
@ 2005-02-18 22:20     ` Dave Hansen
  2005-02-19  1:48       ` Rik van Riel
  0 siblings, 1 reply; 9+ messages in thread
From: Dave Hansen @ 2005-02-18 22:20 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Linux Kernel Mailing List, lhms, linux-mm, Andy Whitcroft

On Fri, 2005-02-18 at 16:52 -0500, Rik van Riel wrote:
> On Thu, 17 Feb 2005, Dave Hansen wrote:
> > The attached patch is a prototype implementation of memory hot-add.  It
> > allows you to boot your system, and add memory to it later.  Why would
> > you want to do this?
> 
> I want it so I can grow Xen guests after they have been booted
> up.  Being able to hot-add memory is essential for dynamically
> resizing the memory of various guest OSes, to readjust them for
> the workload.

That's the same thing we like about it on ppc64 partitions.

> Memory hot-remove isn't really needed with Xen, the balloon
> driver takes care of that.

You can free up individual pages back to the hypervisor, but you might
also want the opportunity to free up some unused mem_map if you shrink
the partition by a large amount.

> > I can post individual patches if anyone would like to comment on them.
> 
> I'm interested.  I want to get this stuff working with Xen ;)

You can either pull them from here:

	http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out/

or grab the whole tarball:

http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out-2.6.11-rc3-mhp1.tar.gz

Or, I could always post the whole bunch to lhms.  Nobody there should
mind too much. :)

The largest part of porting hot-add to a new architecture is usually the
sparsemem portion.  You'll pretty much have to #ifdef pfn_to_page() and
friends, declare a few macros, and then do a bit of debugging.  Here's
ppc64 as an example:

http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out/B-sparse-170-sparsemem-ppc64.patch
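
Roughly, the sparsemem flavour of those macros boils down to something
like the sketch below (simplified, not the exact code from the patch;
__section_mem_map() is just shorthand here for "this section's chunk
of mem_map"):

	#define pfn_to_page(pfn)					\
	({								\
		unsigned long __pfn = (pfn);				\
		/* look up the section, then offset within it */	\
		__section_mem_map(__pfn_to_section(__pfn)) +		\
			(__pfn & (PAGES_PER_SECTION - 1));		\
	})

	#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)

Instead of the DISCONTIGMEM dance of working out the owning node and
then indexing into that node's mem_map, it's one lookup in the
mem_section[] array plus an offset.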

-- Dave



* Re: [RFC][PATCH] Memory Hotplug
  2005-02-18 22:20     ` Dave Hansen
@ 2005-02-19  1:48       ` Rik van Riel
  0 siblings, 0 replies; 9+ messages in thread
From: Rik van Riel @ 2005-02-19  1:48 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Linux Kernel Mailing List, lhms, linux-mm, Andy Whitcroft

On Fri, 18 Feb 2005, Dave Hansen wrote:

>> Memory hot-remove isn't really needed with Xen, the balloon
>> driver takes care of that.
>
> You can free up individual pages back to the hypervisor, but you might
> also want the opportunity to free up some unused mem_map if you shrink
> the partition by a large amount.

Agreed, though I rather like the fact that the code can
be introduced bit by bit, so the memory hot-remove code
(probably the most complex part) doesn't need to be
maintained out-of-tree for Xen, but can wait until it
is upstream.

>>> I can post individual patches if anyone would like to comment on them.
>>
>> I'm interested.  I want to get this stuff working with Xen ;)
>
> You can either pull them from here:
>
> 	http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out/

Thanks, I'll take a stab at porting this functionality to Xen.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
