From: Yasunori Goto <ygoto@fsw.fujitsu.com>
To: "Pavel Machek" <pavel@suse.cz>
Cc: <linux-kernel@vger.kernel.org>,
"Luck, Tony" <tony.luck@intel.com>,
IWAMOTO Toshihiro <iwamoto@valinux.co.jp>,
Hirokazu Takahashi <taka@valinux.co.jp>,
Linux Hotplug Memory Support <lhms-devel@lists.sourceforge.net>
Subject: Re: memory hotremove prototype, take 3
Date: Tue, 02 Dec 2003 14:26:10 -0800
Message-ID: <20031202111944.57B2.YGOTO@fsw.fujitsu.com>
In-Reply-To: <B8E391BBE9FE384DAA4C5C003888BE6F4FAED7@scsmsx401.sc.intel.com>
[-- Attachment #1: Type: text/plain, Size: 919 bytes --]
Hello.
> Pavel Machek wrote:
>
> > hotunplug seems cool... How do you deal with kernel data structures in
> > memory "to be removed"? Or you simply don't allow kmalloc() to
> > allocate there?
>
> You guessed right. Hot removeable memory can only be allocated
> for uses that we can easily re-allocate. So kmalloc() etc. have
> to get memory from some area that we promise not to ever try to
> remove.
IMHO, to hot-remove memory, memory should be divided into two
attributes, hotpluggable and non-hotpluggable, and memory of each
attribute should be allocated in its own unit (e.g. a node).
(I posted the following mail two months ago.)
http://marc.theaimsgroup.com/?l=linux-kernel&m=106506389406876&w=2
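
To sketch the idea in code (purely illustrative -- these names are
hypothetical and do not appear in the attached patch):

    /* Each node carries an attribute, e.g. reported by firmware. */
    enum mem_attr { MEM_FIXED, MEM_HOTPLUGGABLE };
    static enum mem_attr node_attr[8 /* MAX_NUMNODES */];

    /*
     * Allocations the kernel cannot relocate (kmalloc(), page tables,
     * mem_map, ...) must come from a MEM_FIXED node; only user and
     * page cache pages would live on MEM_HOTPLUGGABLE nodes.
     */
    static int node_ok_for_kernel_alloc(int nid)
    {
            return node_attr[nid] == MEM_FIXED;
    }

This way a hotpluggable unit never holds data which cannot be moved off.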
Now I'm working on a memory hot-ADD trial patch, but it doesn't work yet.
(The kernel panics when the memory enable command is executed.)
Once the patch works, I will post it again.
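
For reference, once it works, the /proc interface in the attached patch
can be driven like this (a hypothetical user-space example; mhtest_write()
parses "<command> <index>", and node 1 is just an arbitrary choice):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/memhotplug", "w");

            if (f == NULL)
                    return 1;
            fputs("enable 1", f);   /* ends up in node_enable(1) */
            return fclose(f) ? 1 : 0;
    }

Reading the file back (e.g. "cat /proc/memhotplug") prints each zone's
enabled/free/active/present counters via mhtest_read().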
Thanks.
--
Yasunori Goto <ygoto at fsw.fujitsu.com>
[-- Attachment #2: 20031125.patch --]
[-- Type: application/octet-stream, Size: 34776 bytes --]
diff -duprb linux-2.6.0-test7/Makefile testdir/Makefile
--- linux-2.6.0-test7/Makefile Wed Oct 8 12:24:17 2003
+++ testdir/Makefile Sat Nov 22 17:55:21 2003
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 0
-EXTRAVERSION = -test7
+EXTRAVERSION = -test7-mem-hotplug
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"
diff -duprb linux-2.6.0-test7/arch/i386/Kconfig testdir/arch/i386/Kconfig
--- linux-2.6.0-test7/arch/i386/Kconfig Wed Oct 8 12:24:02 2003
+++ testdir/arch/i386/Kconfig Sat Nov 22 17:52:36 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
+config MEMHOTPLUGTEST
+ bool "Memory hotplug test"
+ default n
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y
config HAVE_ARCH_BOOTMEM_NODE
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y
config HIGHPTE
diff -duprb linux-2.6.0-test7/arch/i386/kernel/setup.c testdir/arch/i386/kernel/setup.c
--- linux-2.6.0-test7/arch/i386/kernel/setup.c Wed Oct 8 12:24:05 2003
+++ testdir/arch/i386/kernel/setup.c Sat Nov 22 17:52:36 2003
@@ -114,6 +114,8 @@ extern void generic_apic_probe(char *);
extern int root_mountflags;
extern char _end[];
+extern unsigned long node_end_pfn[MAX_NUMNODES];
+
unsigned long saved_videomode;
#define RAMDISK_IMAGE_START_MASK 0x07FF
@@ -611,7 +613,11 @@ unsigned long __init find_max_low_pfn(vo
{
unsigned long max_low_pfn;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ max_low_pfn = node_end_pfn[0];
+#else
max_low_pfn = max_pfn;
+#endif
if (max_low_pfn > MAXMEM_PFN) {
if (highmem_pages == -1)
highmem_pages = max_pfn - MAXMEM_PFN;
diff -duprb linux-2.6.0-test7/arch/i386/mm/discontig.c testdir/arch/i386/mm/discontig.c
--- linux-2.6.0-test7/arch/i386/mm/discontig.c Wed Oct 8 12:24:07 2003
+++ testdir/arch/i386/mm/discontig.c Tue Nov 25 19:34:03 2003
@@ -28,6 +28,12 @@
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
+#include <linux/proc_fs.h>
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+#include <linux/sched.h>
+#endif
+
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
@@ -80,6 +86,10 @@ unsigned long node_remap_offset[MAX_NUMN
void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+#ifdef CONFIG_MEMHOTPLUGTEST
+void set_pmd_pfn_withpgd(unsigned long vaddr, unsigned long pfn,pgd_t *pgd, pgprot_t flags);
+#endif
+
/*
* FLAT - support for basic PC memory model with discontig enabled, essentially
* a single node with all available processors in it with a flat
@@ -111,6 +121,44 @@ int __init get_memcfg_numa_flat(void)
return 1;
}
+int __init get_memcfg_numa_blks(void)
+{
+ int i, pfn;
+
+ printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ find_max_pfn();
+ max_pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ pfn = PFN_DOWN(256 << 20) * i;
+ node_start_pfn[i] = pfn;
+ pfn += PFN_DOWN(256 << 20);
+ if (pfn < max_pfn)
+ node_end_pfn[i] = pfn;
+ else {
+ node_end_pfn[i] = max_pfn;
+ i++;
+ printk("total %d blocks, max %d\n", i, (int)max_pfn);
+ break;
+ }
+ }
+
+ /* Fill in the physnode_map: each 256MB element is assigned
+ * to the node whose block covers it.
+ */
+ for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+ pfn += PAGES_PER_ELEMENT) {
+ physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(256 << 20);
+ }
+
+ /* Only node 0 is online at boot; the others are enabled later. */
+ node_set_online(0);
+ numnodes = i;
+
+ return 1;
+}
+
/*
* Find the highest page frame number we have available for the node
*/
@@ -134,6 +182,12 @@ static void __init find_max_pfn_node(int
*/
static void __init allocate_pgdat(int nid)
{
+#ifdef CONFIG_MEMHOTPLUGTEST
+ /* every node's pg_data_t is allocated statically from node 0's lowmem */
+ NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
+ min_low_pfn += PFN_UP(sizeof(pg_data_t));
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+#else
if (nid)
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
else {
@@ -141,6 +195,7 @@ static void __init allocate_pgdat(int ni
min_low_pfn += PFN_UP(sizeof(pg_data_t));
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
+#endif
}
/*
@@ -183,6 +238,7 @@ static void __init register_bootmem_low_
}
}
+/*static struct kcore_list numa_kc;*/
void __init remap_numa_kva(void)
{
void *vaddr;
@@ -196,9 +252,36 @@ void __init remap_numa_kva(void)
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
+ /* memset(node_remap_start_vaddr[node], 0,node_remap_size[node] * PAGE_SIZE); */
+ }
+/* kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+ node_remap_offset[numnodes - 1] << PAGE_SHIFT);*/
+}
+
+void remap_add_node_kva(int node)
+{
+ void *vaddr;
+ unsigned long pfn;
+ struct task_struct *p;
+ pgd_t *pg_dir;
+
+ read_lock(&tasklist_lock);
+ for_each_process(p){
+ if (p->mm == NULL) /* skip kernel threads, which have no mm */
+ continue;
+ pg_dir = p->mm->pgd;
+ for(pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE){
+ vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+ set_pmd_pfn_withpgd((ulong) vaddr,
+ node_remap_start_pfn[node] + pfn,
+ pg_dir + pgd_index( (ulong)vaddr ) ,
+ PAGE_KERNEL_LARGE);
+ }
}
+ read_unlock(&tasklist_lock);
}
+
static unsigned long calculate_numa_remap_pages(void)
{
int nid;
@@ -206,8 +289,13 @@ static unsigned long calculate_numa_rema
for (nid = 1; nid < numnodes; nid++) {
/* calculate the size of the mem_map needed in bytes */
+#ifdef CONFIG_MEMHOTPLUGTEST
+ size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
+ * sizeof(struct page);
+#else
size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
* sizeof(struct page) + sizeof(pg_data_t);
+#endif
/* convert size to large (pmd size) pages, rounding up */
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
/* now the roundup is correct, convert to PAGE_SIZE pages */
@@ -248,7 +336,9 @@ unsigned long __init setup_memory(void)
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
#endif
+#ifndef CONFIG_MEMHOTPLUGTEST
system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages;
+#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(system_max_low_pfn));
printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
@@ -426,7 +516,11 @@ void __init set_highmem_pages_init(int b
void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_MEMHOTPLUGTEST
+ highmem_start_page = phys_to_virt(max_low_pfn << PAGE_SHIFT);
+#else /* !CONFIG_MEMHOTPLUGTEST */
highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map;
+#endif /* !CONFIG_MEMHOTPLUGTEST */
num_physpages = highend_pfn;
#else
num_physpages = max_low_pfn;
diff -duprb linux-2.6.0-test7/arch/i386/mm/pgtable.c testdir/arch/i386/mm/pgtable.c
--- linux-2.6.0-test7/arch/i386/mm/pgtable.c Wed Oct 8 12:24:53 2003
+++ testdir/arch/i386/mm/pgtable.c Tue Nov 25 19:23:46 2003
@@ -118,6 +118,30 @@ void set_pmd_pfn(unsigned long vaddr, un
*/
__flush_tlb_one(vaddr);
}
+void set_pmd_pfn_withpgd(unsigned long vaddr, unsigned long pfn, pgd_t *pgd, pgprot_t flags)
+{
+ pmd_t *pmd;
+
+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
+ printk ("set_pmd_pfn: vaddr misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
+ printk ("set_pmd_pfn: pfn misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pgd_none(*pgd)) {
+ printk ("set_pmd_pfn: pgd_none\n");
+ return; /* BUG(); */
+ }
+ pmd = pmd_offset(pgd, vaddr);
+ set_pmd(pmd, pfn_pmd(pfn, flags));
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
diff -duprb linux-2.6.0-test7/drivers/char/mem.c testdir/drivers/char/mem.c
--- linux-2.6.0-test7/drivers/char/mem.c Wed Oct 8 12:24:06 2003
+++ testdir/drivers/char/mem.c Sat Nov 22 17:53:41 2003
@@ -24,6 +24,9 @@
#include <linux/smp_lock.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/highmem.h>
+#endif
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -104,6 +107,36 @@ static ssize_t do_write_mem(struct file
return written;
}
+#ifdef CONFIG_HIGHMEM
+static ssize_t read_highmem(struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos;
+ ssize_t read = 0;
+ int off, pfn = p >> PAGE_SHIFT;
+ char *pp;
+ struct page *page;
+
+ if (! pfn_valid(pfn))
+ return 0;
+ page = pfn_to_page(pfn);
+ pp = kmap(page);
+
+ off = p & (PAGE_SIZE - 1);
+ if (count > PAGE_SIZE - off)
+ count = PAGE_SIZE - off;
+
+ if (copy_to_user(buf, pp + off, count)) {
+ kunmap(page);
+ return -EFAULT;
+ }
+ read += count;
+ *ppos += read;
+ kunmap(page);
+ return read;
+}
+
+#endif
/*
* This funcion reads the *physical* memory. The f_pos points directly to the
@@ -118,7 +151,11 @@ static ssize_t read_mem(struct file * fi
end_mem = __pa(high_memory);
if (p >= end_mem)
+#ifdef CONFIG_HIGHMEM
+ return read_highmem(file, buf, count, ppos);
+#else
return 0;
+#endif
if (count > end_mem - p)
count = end_mem - p;
read = 0;
diff -duprb linux-2.6.0-test7/fs/proc/kcore.c testdir/fs/proc/kcore.c
--- linux-2.6.0-test7/fs/proc/kcore.c Wed Oct 8 12:24:07 2003
+++ testdir/fs/proc/kcore.c Sat Nov 22 17:54:58 2003
@@ -387,7 +387,7 @@ read_kcore(struct file *file, char __use
}
kfree(elf_buf);
} else {
- if (kern_addr_valid(start)) {
+ if (1 /*kern_addr_valid(start)*/) {
unsigned long n;
n = copy_to_user(buffer, (char *)start, tsz);
diff -duprb linux-2.6.0-test7/include/asm-i386/mmzone.h testdir/include/asm-i386/mmzone.h
--- linux-2.6.0-test7/include/asm-i386/mmzone.h Wed Oct 8 12:24:06 2003
+++ testdir/include/asm-i386/mmzone.h Sat Nov 22 17:54:41 2003
@@ -128,6 +128,7 @@ static inline struct pglist_data *pfn_to
#endif /* CONFIG_X86_NUMAQ */
extern int get_memcfg_numa_flat(void );
+extern int get_memcfg_numa_blks(void );
/*
* This allows any one NUMA architecture to be compiled
* for, and still fall back to the flat function if it
@@ -140,6 +141,9 @@ static inline void get_memcfg_numa(void)
return;
#elif CONFIG_ACPI_SRAT
if (get_memcfg_from_srat())
+ return;
+#elif CONFIG_MEMHOTPLUGTEST
+ if (get_memcfg_numa_blks())
return;
#endif
diff -duprb linux-2.6.0-test7/include/asm-i386/numnodes.h testdir/include/asm-i386/numnodes.h
--- linux-2.6.0-test7/include/asm-i386/numnodes.h Wed Oct 8 12:24:02 2003
+++ testdir/include/asm-i386/numnodes.h Sat Nov 22 17:54:41 2003
@@ -13,6 +13,10 @@
/* Max 8 Nodes */
#define NODES_SHIFT 3
+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT 3
+
#endif /* CONFIG_X86_NUMAQ */
#endif /* _ASM_MAX_NUMNODES_H */
diff -duprb linux-2.6.0-test7/include/linux/mm.h testdir/include/linux/mm.h
--- linux-2.6.0-test7/include/linux/mm.h Wed Oct 8 12:24:01 2003
+++ testdir/include/linux/mm.h Sat Nov 22 17:54:21 2003
@@ -219,7 +219,14 @@ struct page {
*/
#define put_page_testzero(p) \
({ \
- BUG_ON(page_count(p) == 0); \
+ if (page_count(p) == 0) { \
+ int i; \
+ printk("Page: %lx ", (long)p); \
+ for(i = 0; i < sizeof(struct page); i++) \
+ printk(" %02x", ((unsigned char *)p)[i]); \
+ printk("\n"); \
+ BUG(); \
+ } \
atomic_dec_and_test(&(p)->count); \
})
@@ -622,5 +629,11 @@ kernel_map_pages(struct page *page, int
}
#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+#define page_trace(p) page_trace_func(p, __FUNCTION__, __LINE__)
+extern void page_trace_func(const struct page *, const char *, int);
+#else
+#define page_trace(p) do { } while(0)
+#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff -duprb linux-2.6.0-test7/include/linux/mmzone.h testdir/include/linux/mmzone.h
--- linux-2.6.0-test7/include/linux/mmzone.h Wed Oct 8 12:24:08 2003
+++ testdir/include/linux/mmzone.h Sat Nov 22 17:54:23 2003
@@ -174,6 +174,7 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
+ rwlock_t zonelist_lock;
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
@@ -235,10 +236,29 @@ void wakeup_kswapd(struct zone *zone);
* next_zone - helper magic for for_each_zone()
* Thanks to William Lee Irwin III for this piece of ingenuity.
*/
+extern char zone_active[];
+
static inline struct zone *next_zone(struct zone *zone)
{
pg_data_t *pgdat = zone->zone_pgdat;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ unsigned int zone_idx = zone - pgdat->node_zones;
+ do {
+ if (zone_idx < MAX_NR_ZONES - 1) {
+ zone++;
+ zone_idx++;
+ } else if (pgdat->pgdat_next) {
+ pgdat = pgdat->pgdat_next;
+ zone = pgdat->node_zones;
+ zone_idx = 0;
+ } else
+ return NULL;
+ } while (!zone_active[pgdat->node_id * MAX_NR_ZONES + zone_idx]);
+
+ return zone;
+
+#else
if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
zone++;
else if (pgdat->pgdat_next) {
@@ -248,6 +268,7 @@ static inline struct zone *next_zone(str
zone = NULL;
return zone;
+#endif
}
/**
@@ -359,6 +380,10 @@ static inline unsigned int num_online_me
}
return num;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+#endif
#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */
diff -duprb linux-2.6.0-test7/mm/page_alloc.c testdir/mm/page_alloc.c
--- linux-2.6.0-test7/mm/page_alloc.c Wed Oct 8 12:24:01 2003
+++ testdir/mm/page_alloc.c Tue Nov 25 18:48:01 2003
@@ -31,6 +31,7 @@
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/proc_fs.h>
#include <asm/tlbflush.h>
@@ -52,6 +53,11 @@ EXPORT_SYMBOL(nr_swap_pages);
*/
struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+EXPORT_SYMBOL(zone_active);
+static const struct page *page_trace_list[10];
+#endif
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -512,9 +518,28 @@ static struct page *buffered_rmqueue(str
mod_page_state(pgalloc, 1 << order);
prep_new_page(page, order);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT])
+ BUG();
+#endif
return page;
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+ int i;
+
+ for(i = 0; ; i++) {
+ if (zone_table[i] == z)
+ return zone_active[i];
+ if (zone_table[i] == NULL)
+ BUG();
+ }
+}
+#endif
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -544,6 +569,7 @@ __alloc_pages(unsigned int gfp_mask, uns
int i;
int cold;
int do_retry;
+ unsigned long flag;
might_sleep_if(wait);
@@ -551,10 +577,13 @@ __alloc_pages(unsigned int gfp_mask, uns
if (gfp_mask & __GFP_COLD)
cold = 1;
+ read_lock_irqsave(&zonelist->zonelist_lock,flag);
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
classzone = zones[0];
- if (classzone == NULL) /* no zones in the zonelist */
+ if (classzone == NULL){ /* no zones in the zonelist */
+ read_unlock_irqrestore(&zonelist->zonelist_lock,flag);
return NULL;
+ }
/* Go through the zonelist once, looking for a zone with enough free */
min = 1UL << order;
@@ -562,6 +591,10 @@ __alloc_pages(unsigned int gfp_mask, uns
struct zone *z = zones[i];
unsigned long local_low;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
@@ -589,6 +622,10 @@ __alloc_pages(unsigned int gfp_mask, uns
for (i = 0; zones[i] != NULL; i++) {
unsigned long local_min;
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
@@ -612,6 +649,10 @@ rebalance:
/* go through the zonelist yet again, ignoring mins */
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
page = buffered_rmqueue(z, order, cold);
if (page)
@@ -668,6 +709,7 @@ rebalance:
}
nopage:
+ read_unlock_irqrestore(&zonelist->zonelist_lock,flag);
if (!(gfp_mask & __GFP_NOWARN)) {
printk("%s: page allocation failure."
" order:%d, mode:0x%x\n",
@@ -676,6 +718,24 @@ nopage:
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
+#if 1 // debug
+ /* Validate page */
+ {
+ struct zone *z = page_zone(page);
+ int idx = page - z->zone_mem_map;
+ if (idx < 0 || idx >= z->spanned_pages) {
+ printk("0x%08x %d\n", (int)(page->flags >> ZONE_SHIFT), idx);
+ read_unlock_irqrestore(&zonelist->zonelist_lock, flag);
+ BUG();
+ }
+ }
+#endif
+ read_unlock_irqrestore(&zonelist->zonelist_lock,flag);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT]){
+ BUG();
+ }
+#endif
return page;
}
@@ -1046,7 +1106,11 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+#else
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+#endif
{
switch (k) {
struct zone *zone;
@@ -1076,6 +1140,9 @@ static int __init build_zonelists_node(p
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct zone *zone;
+#endif
local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
@@ -1092,6 +1159,7 @@ static void __init build_zonelists(pg_da
if (i & __GFP_DMA)
k = ZONE_DMA;
+#ifndef CONFIG_MEMHOTPLUGTEST
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1107,6 +1175,26 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j++] = NULL;
+#else
+ rwlock_init(&zonelist->zonelist_lock);
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ if (!zone_activep(zone))
+ continue;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ for (node = local_node + 1; node < numnodes; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone_activep(zone) && zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ for (node = 0; node < local_node; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone_activep(zone) && zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ }
+#endif
}
}
@@ -1162,8 +1250,14 @@ static inline unsigned long wait_table_b
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void calculate_zone_totalpages(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+#else
static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
+#endif
+
{
unsigned long realtotalpages, totalpages = 0;
int i;
@@ -1199,6 +1293,20 @@ static void __init calculate_zone_bitmap
}
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void calculate_addzone_bitmap(struct pglist_data *pgdat, unsigned long *zones_size)
+{
+ unsigned long size = zones_size[ZONE_HIGHMEM];
+
+ size = LONG_ALIGN((size + 7) >> 3);
+ if (size) {
+ pgdat->valid_addr_bitmap = (unsigned long *)kmalloc(size,GFP_KERNEL);
+ memset(pgdat->valid_addr_bitmap, 0, size);
+ }
+}
+
+#endif
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -1252,6 +1360,45 @@ static void __init free_area_init_core(s
unsigned long batch;
zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ /* only node 0 is active at boot */
+ if (nid) { /* nodes 1-... are not active yet */
+ /* XXX : This should be changed. */
+ zone_active[nid * MAX_NR_ZONES + j ] = 0;
+ zone->spanned_pages = 0;
+ zone->present_pages = 0;
+ zone->name = zone_names[j];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 0;
+ pcp->batch = 0;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 0;
+ pcp->batch = 0;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ atomic_set(&zone->refill_counter, 0);
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+
+ continue;
+ }
+ zone_active[nid * MAX_NR_ZONES + j] = 1; /* only node 0 is active */
+#endif
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1295,8 +1442,8 @@ static void __init free_area_init_core(s
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
}
- printk(" %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n",
+ zone_names[j], realsize, batch, zone_start_pfn);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
atomic_set(&zone->refill_counter, 0);
@@ -1381,14 +1528,22 @@ void __init free_area_init_node(int nid,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (!node_mem_map && !nid) {
+#else
if (!node_mem_map) {
+#endif
size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
node_mem_map = alloc_bootmem_node(pgdat, size);
}
pgdat->node_mem_map = node_mem_map;
free_area_init_core(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (!nid) memblk_set_online(node_to_memblk(nid)); /* only node 0 is online */
+#else
memblk_set_online(node_to_memblk(nid));
+#endif
calculate_zone_bitmap(pgdat, zones_size);
}
@@ -1644,3 +1799,387 @@ int min_free_kbytes_sysctl_handler(ctl_t
setup_per_zone_pages_min();
return 0;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void rebuild_all_zonelist(unsigned long nid)
+{
+ struct zonelist *zonelist;
+ unsigned long node, p_node, j;
+
+ /*
+ * Rebuild the ZONE_HIGHMEM zonelist of every node, including the
+ * newly enabled node nid, so that the new zones become visible as
+ * allocation fallbacks. Each list is rebuilt under its own write
+ * lock; __alloc_pages() takes the read side.
+ */
+ for (p_node = 0; p_node < numnodes; p_node++) {
+ zonelist = NODE_DATA(p_node)->node_zonelists + ZONE_HIGHMEM;
+ write_lock(&zonelist->zonelist_lock);
+ memset(zonelist->zones, 0, sizeof(zonelist->zones));
+
+ j = build_zonelists_node(NODE_DATA(p_node), zonelist, 0, ZONE_HIGHMEM);
+ for (node = p_node + 1; node < numnodes; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, ZONE_HIGHMEM);
+ for (node = 0; node < p_node; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, ZONE_HIGHMEM);
+ zonelist->zones[j] = NULL;
+ write_unlock(&zonelist->zonelist_lock);
+ }
+}
+
+
+static void free_area_add_core(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long i;
+ const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+ int cpu, nid = pgdat->node_id;
+ struct page *lmem_map = pgdat->node_mem_map;
+ unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+ pgdat->nr_zones = 0;
+ init_waitqueue_head(&pgdat->kswapd_wait);
+
+ {
+ struct zone *zone = pgdat->node_zones + ZONE_HIGHMEM;
+ unsigned long size, realsize;
+ unsigned long batch;
+
+ zone_table[nid * MAX_NR_ZONES + ZONE_HIGHMEM] = zone;
+
+ realsize = size = zones_size[ZONE_HIGHMEM];
+ if (zholes_size)
+ realsize -= zholes_size[ZONE_HIGHMEM];
+
+ zone->spanned_pages = size;
+ zone->present_pages = realsize;
+ zone->name = zone_names[ZONE_HIGHMEM];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
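+ /*
+ * Worked example with hypothetical numbers: a 256MB node has
+ * 65536 pages, so batch = 65536/1024 = 64; 64*4096 bytes meets
+ * the 256KB cap exactly (no reduction), then 64/4 = 16, giving
+ * the hot pcp low=32, high=96, batch=16 below.
+ */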
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n",
+ zone_names[ZONE_HIGHMEM], realsize, batch, zone_start_pfn);
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ atomic_set(&zone->refill_counter, 0);
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(size);
+ zone->wait_table_bits =
+ wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)kmalloc(zone->wait_table_size
+ * sizeof(wait_queue_head_t), GFP_KERNEL);
+ /* XXX: wait_table might have to be allocate own node. */
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+
+ pgdat->nr_zones = ZONE_HIGHMEM+1;
+
+ zone->zone_mem_map = lmem_map;
+ zone->zone_start_pfn = zone_start_pfn;
+
+ if ((zone_start_pfn) & (zone_required_alignment-1))
+ printk("BUG: wrong zone alignment, it will crash\n");
+
+ memmap_init_zone(lmem_map, size, nid, ZONE_HIGHMEM, zone_start_pfn);
+
+ for (i = 0; ; i++) {
+ unsigned long bitmap_size;
+
+ INIT_LIST_HEAD(&zone->free_area[i].free_list);
+ if (i == MAX_ORDER-1) {
+ zone->free_area[i].map = NULL;
+ break;
+ }
+
+ /*
+ * Page buddy system uses "index >> (i+1)",
+ * where "index" is at most "size-1".
+ *
+ * The extra "+3" is to round down to byte
+ * size (8 bits per byte assumption). Thus
+ * we get "(size-1) >> (i+4)" as the last byte
+ * we can access.
+ *
+ * The "+1" is because we want to round the
+ * byte allocation up rather than down. So
+ * we should have had a "+7" before we shifted
+ * down by three. Also, we have to add one as
+ * we actually _use_ the last bit (it's [0,n]
+ * inclusive, not [0,n[).
+ *
+ * So we actually had +7+1 before we shift
+ * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+ * (modulo overflows, which we do not have).
+ *
+ * Finally, we LONG_ALIGN because all bitmap
+ * operations are on longs.
+ */
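+ /*
+ * Worked example with hypothetical numbers: size = 65536, i = 0:
+ * (65535 >> 4) = 4095, then +1 and LONG_ALIGN -> 4096 bytes.
+ */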
+ bitmap_size = (size-1) >> (i+4);
+ bitmap_size = LONG_ALIGN(bitmap_size+1);
+ zone->free_area[i].map =
+ (unsigned long *) kmalloc(bitmap_size, GFP_KERNEL);
+ /* XXX: bitmap might have to be allocate own node too. */
+ }
+ }
+}
+
+extern void *node_remap_start_vaddr[];
+
+void free_area_add_node(int nid, struct pglist_data *pgdat,unsigned long *zones_size,
+ unsigned long node_start_pfn, unsigned long *zholes_size)
+{
+ unsigned long size;
+
+ calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+
+ size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+ remap_add_node_kva(nid);
+
+ free_area_add_core(pgdat, zones_size, zholes_size);
+ calculate_addzone_bitmap(pgdat, zones_size);
+
+}
+
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+
+static void node_enable(unsigned long nid)
+{
+ unsigned long idx = nid * MAX_NR_ZONES + ZONE_HIGHMEM;
+ unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+ unsigned long *zholes_size;
+
+ if (nid >= numnodes){ /* XXX : nids are assumed contiguous now, */
+ /* but this should be changed */
+ printk("nid=%lu is not possible to enable\n", nid);
+ return;
+ }
+
+ if (node_online(nid)){
+ printk("nid=%lu is already enabled\n", nid);
+ return;
+ }
+
+ zones_size[ZONE_HIGHMEM] = node_end_pfn[nid] - node_start_pfn[nid];
+ /* XXX: This information should be got from firmware.
+ However, this is emulation. */
+ if( !zones_size[ZONE_HIGHMEM] ){
+ printk("nid=%d is size 0\n",nid);
+ return;
+ }
+
+ zholes_size = get_zholes_size(nid);
+
+ free_area_add_node(nid, NODE_DATA(nid), zones_size, node_start_pfn[nid], zholes_size);
+
+ setup_per_zone_pages_min(); /* set up again */
+
+ rebuild_all_zonelist( nid);
+ memblk_set_online(node_to_memblk(nid));
+ node_set_online(nid);
+ zone_active[idx] = 1;
+
+}
+
+static int mhtest_read(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ char *p;
+ int i, len;
+ const struct zone *z;
+
+ p = page;
+ for(i = 0; ; i++) {
+ z = zone_table[i];
+ if (z == NULL)
+ break;
+ if (! z->present_pages)
+ /* skip empty zone */
+ continue;
+ len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+ zone_active[i] ? "en" : "dis", (int)z->free_pages, (int)z->nr_active,
+ (int)z->present_pages);
+ p += len;
+ }
+ len = p - page;
+
+ if (len <= off + count)
+ *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len < 0)
+ len = 0;
+ if (len > count)
+ len = count;
+
+ return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ unsigned long idx;
+ char buf[64], *p;
+ int i;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ p = strchr(buf, ' ');
+ if (p == NULL)
+ goto out;
+
+ *p++ = '\0';
+ idx = simple_strtoul(p, NULL, 0);
+
+ if (strcmp(buf, "trace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == NULL) {
+ page_trace_list[i] = (struct page *)idx;
+ printk("add trace %lx\n", (unsigned long)idx);
+ goto out;
+ }
+ printk("page_trace_list is full (not added)\n");
+ goto out;
+ } else if (strcmp(buf, "untrace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == (struct page *)idx)
+ break;
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0])) {
+ printk("not registered\n");
+ goto out;
+ }
+ for(; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]) - 1; i++)
+ page_trace_list[i] = page_trace_list[i + 1];
+ page_trace_list[i] = NULL;
+ goto out;
+ }
+ if (idx >= MAX_NUMNODES) {
+ printk("Argument out of range\n");
+ goto out;
+ }
+ if (strcmp(buf, "disable") == 0) {
+ printk("disable node = %d\n", (int)idx); /* XXX */
+ goto out;
+ } else if (strcmp(buf, "purge") == 0) {
+ /* XXX */
+ } else if (strcmp(buf, "enable") == 0) {
+ printk("enable node = %d\n", (int)idx);
+ node_enable(idx);
+ } else if (strcmp(buf, "active") == 0) {
+ /*
+ if (zone_table[idx] == NULL)
+ goto out;
+ spin_lock_irq(&zone_table[idx]->lru_lock);
+ i = 0;
+ list_for_each(l, &zone_table[idx]->active_list) {
+ printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+ i++;
+ if (i == 10)
+ break;
+ }
+ spin_unlock_irq(&zone_table[idx]->lru_lock);
+ printk("\n");
+ */
+ } else if (strcmp(buf, "inuse") == 0) {
+ /*
+ if (zone_table[idx] == NULL)
+ goto out;
+ for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+ if (page_count(&zone_table[idx]->zone_mem_map[i]))
+ printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+ printk("\n");
+ */
+ }
+out:
+ return count;
+}
+
+static int __init procmhtest_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("memhotplug", 0, NULL);
+ if (entry == NULL)
+ return -1;
+
+ entry->read_proc = &mhtest_read;
+ entry->write_proc = &mhtest_write;
+ return 0;
+}
+__initcall(procmhtest_init);
+
+void
+page_trace_func(const struct page *p, const char *func, int line) {
+ int i;
+
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++) {
+ if (page_trace_list[i] == NULL)
+ return;
+ if (page_trace_list[i] == p)
+ break;
+ }
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0]))
+ return;
+
+ printk("Page %lx, %s %d\n", (unsigned long)p, func, line);
+}
+#endif