linux-kernel.vger.kernel.org archive mirror
* memory hotremove prototype, take 3
@ 2003-12-01  3:41 IWAMOTO Toshihiro
  2003-12-01 19:56 ` Pavel Machek
  2003-12-03 19:41 ` Martin J. Bligh
  0 siblings, 2 replies; 17+ messages in thread
From: IWAMOTO Toshihiro @ 2003-12-01  3:41 UTC (permalink / raw)
  To: linux-kernel, linux-mm

Hi,

this is a new version of my memory hotplug prototype patch, against
linux-2.6.0-test11.

Freeing 100% of a specified memory zone is non-trivial but necessary
for memory hot removal.  This patch splits memory into 1GB zones and
implements complete freeing of a zone's memory using kswapd or
"remapping".

A more detailed explanation and some test scripts are available at:
	http://people.valinux.co.jp/~iwamoto/mh.html

Main changes against previous versions are:
- Stability is greatly improved.  Kernel crashes (probably related
  to kswapd) still happen, but they are now rare enough that I have
  difficulty reproducing them.
  Page remapping works under a simultaneous tar + rm -rf workload.
- Implemented a fix for a deadlock caused by ext2_rename, which
  increments the refcount of a directory page twice.

Questions and comments are welcome.
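
For reference, the /proc/memhotplug interface added below can be driven
from a trivial userspace program like the following sketch.  The zone
index (2) and the command ordering are only an example; the accepted
commands are the ones parsed by mhtest_write() in the patch.

/* Sketch: drive the /proc/memhotplug interface from userspace.
 * Zone index 2 is illustrative only. */
#include <stdio.h>

static int mhtest_cmd(const char *cmd, int zone)
{
	FILE *f = fopen("/proc/memhotplug", "w");

	if (f == NULL)
		return -1;
	fprintf(f, "%s %d\n", cmd, zone);
	return fclose(f);
}

int main(void)
{
	mhtest_cmd("disable", 2);	/* stop allocations from the zone */
	mhtest_cmd("purge", 2);		/* wake kswapd to free its pages */
	mhtest_cmd("remap", 2);		/* migrate whatever is left */
	return 0;
}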

$Id: memoryhotplug.patch,v 1.26 2003/11/28 09:12:12 iwamoto Exp $

diff -dpur linux-2.6.0-test11/arch/i386/Kconfig linux-2.6.0-test11-mh/arch/i386/Kconfig
--- linux-2.6.0-test11/arch/i386/Kconfig	Thu Nov 27 05:43:07 2003
+++ linux-2.6.0-test11-mh/arch/i386/Kconfig	Fri Nov 28 17:45:42 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
 comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
 	depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
 
+config MEMHOTPLUGTEST
+       bool "Memory hotplug test"
+       default n
+
 config DISCONTIGMEM
 	bool
-	depends on NUMA
+	depends on NUMA || MEMHOTPLUGTEST
 	default y
 
 config HAVE_ARCH_BOOTMEM_NODE
 	bool
-	depends on NUMA
+	depends on NUMA || MEMHOTPLUGTEST
 	default y
 
 config HIGHPTE
diff -dpur linux-2.6.0-test11/arch/i386/mm/discontig.c linux-2.6.0-test11-mh/arch/i386/mm/discontig.c
--- linux-2.6.0-test11/arch/i386/mm/discontig.c	Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/arch/i386/mm/discontig.c	Fri Nov 28 17:45:42 2003
@@ -28,6 +28,7 @@
 #include <linux/mmzone.h>
 #include <linux/highmem.h>
 #include <linux/initrd.h>
+#include <linux/proc_fs.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 #include <asm/mmzone.h>
@@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void)
 	return 1;
 }
 
+int __init get_memcfg_numa_blks(void)
+{
+	int i, pfn;
+
+	printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+	/* Run the memory configuration and find the top of memory. */
+	find_max_pfn();
+	if (max_pfn & (PTRS_PER_PTE - 1)) {
+		pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+		printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn);
+		max_pfn = pfn;
+	}
+	for(i = 0; i < MAX_NUMNODES; i++) {
+		pfn = PFN_DOWN(1 << 30) * i;
+		node_start_pfn[i]  = pfn;
+		pfn += PFN_DOWN(1 << 30);
+		if (pfn < max_pfn)
+			node_end_pfn[i]	  = pfn;
+		else {
+			node_end_pfn[i]	  = max_pfn;
+			i++;
+			printk("total %d blocks, max %d\n", i, max_pfn);
+			break;
+		}
+	}
+
+	/* Fill in the physnode_map with our simplistic memory model,
+	* all memory is in node 0.
+	*/
+	for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+	       pfn += PAGES_PER_ELEMENT)
+	{
+		physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+	}
+
+         /* Indicate there is one node available. */
+	node_set_online(0);
+	numnodes = i;
+
+	return 1;
+}
+
 /*
  * Find the highest page frame number we have available for the node
  */
@@ -183,6 +227,8 @@ static void __init register_bootmem_low_
 	}
 }
 
+static struct kcore_list numa_kc;
+
 void __init remap_numa_kva(void)
 {
 	void *vaddr;
@@ -196,7 +242,11 @@ void __init remap_numa_kva(void)
 				node_remap_start_pfn[node] + pfn, 
 				PAGE_KERNEL_LARGE);
 		}
+		memset(node_remap_start_vaddr[node], 0,
+		    node_remap_size[node] * PAGE_SIZE);
 	}
+	kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+	    node_remap_offset[numnodes - 1] << PAGE_SHIFT);
 }
 
 static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0-test11/include/asm-i386/kmap_types.h linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h
--- linux-2.6.0-test11/include/asm-i386/kmap_types.h	Thu Nov 27 05:44:56 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h	Fri Nov 28 17:52:08 2003
@@ -24,7 +24,13 @@ D(10)	KM_IRQ0,
 D(11)	KM_IRQ1,
 D(12)	KM_SOFTIRQ0,
 D(13)	KM_SOFTIRQ1,
+#ifdef CONFIG_MEMHOTPLUGTEST
+D(14)	KM_REMAP0,
+D(15)	KM_REMAP1,
+D(16)	KM_TYPE_NR,
+#else
 D(14)	KM_TYPE_NR
+#endif
 };
 
 #undef D
diff -dpur linux-2.6.0-test11/include/asm-i386/mmzone.h linux-2.6.0-test11-mh/include/asm-i386/mmzone.h
--- linux-2.6.0-test11/include/asm-i386/mmzone.h	Thu Nov 27 05:44:10 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/mmzone.h	Fri Nov 28 17:45:42 2003
@@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to
 #endif /* CONFIG_X86_NUMAQ */
 
 extern int get_memcfg_numa_flat(void );
+#ifdef CONFIG_MEMHOTPLUGTEST
+extern int get_memcfg_numa_blks(void);
+#endif
+
 /*
  * This allows any one NUMA architecture to be compiled
  * for, and still fall back to the flat function if it
@@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void)
 		return;
 #endif
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+	get_memcfg_numa_blks();
+	return;
+#endif
 	get_memcfg_numa_flat();
 }
 
diff -dpur linux-2.6.0-test11/include/asm-i386/numnodes.h linux-2.6.0-test11-mh/include/asm-i386/numnodes.h
--- linux-2.6.0-test11/include/asm-i386/numnodes.h	Thu Nov 27 05:43:09 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/numnodes.h	Fri Nov 28 17:45:42 2003
@@ -13,6 +13,10 @@
 /* Max 8 Nodes */
 #define NODES_SHIFT	3
 
+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT	3
+
 #endif /* CONFIG_X86_NUMAQ */
 
 #endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0-test11/include/linux/mm.h linux-2.6.0-test11-mh/include/linux/mm.h
--- linux-2.6.0-test11/include/linux/mm.h	Thu Nov 27 05:42:55 2003
+++ linux-2.6.0-test11-mh/include/linux/mm.h	Fri Nov 28 17:45:42 2003
@@ -219,7 +219,14 @@ struct page {
  */
 #define put_page_testzero(p)				\
 	({						\
-		BUG_ON(page_count(p) == 0);		\
+		if (page_count(p) == 0) {		\
+			int i;						\
+			printk("Page: %lx ", (long)p);			\
+			for(i = 0; i < sizeof(struct page); i++)	\
+				printk(" %02x", ((unsigned char *)p)[i]); \
+			printk("\n");					\
+			BUG();				\
+		}					\
 		atomic_dec_and_test(&(p)->count);	\
 	})
 
diff -dpur linux-2.6.0-test11/include/linux/mmzone.h linux-2.6.0-test11-mh/include/linux/mmzone.h
--- linux-2.6.0-test11/include/linux/mmzone.h	Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/include/linux/mmzone.h	Fri Nov 28 17:45:42 2003
@@ -360,6 +360,10 @@ static inline unsigned int num_online_me
 	return num;
 }
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
 #else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */
 
 #define node_online(node) \
diff -dpur linux-2.6.0-test11/include/linux/page-flags.h linux-2.6.0-test11-mh/include/linux/page-flags.h
--- linux-2.6.0-test11/include/linux/page-flags.h	Thu Nov 27 05:44:52 2003
+++ linux-2.6.0-test11-mh/include/linux/page-flags.h	Fri Nov 28 17:45:42 2003
@@ -76,6 +76,8 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
 
+#define	PG_again		20
+
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -268,6 +270,10 @@ extern void get_full_page_state(struct p
 #define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
+
+#define PageAgain(page)	test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page)	set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page)	clear_bit(PG_again, &(page)->flags)
 
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
diff -dpur linux-2.6.0-test11/mm/filemap.c linux-2.6.0-test11-mh/mm/filemap.c
--- linux-2.6.0-test11/mm/filemap.c	Thu Nov 27 05:43:33 2003
+++ linux-2.6.0-test11-mh/mm/filemap.c	Fri Nov 28 17:45:42 2003
@@ -448,7 +448,8 @@ repeat:
 			spin_lock(&mapping->page_lock);
 
 			/* Has the page been truncated while we slept? */
-			if (page->mapping != mapping || page->index != offset) {
+			if (page->mapping != mapping || page->index != offset ||
+			    PageAgain(page)) {
 				unlock_page(page);
 				page_cache_release(page);
 				goto repeat;
@@ -677,6 +678,12 @@ page_not_up_to_date:
 			goto page_ok;
 		}
 
+		if (PageAgain(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto find_page;
+		}
+
 readpage:
 		/* ... and start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
@@ -1120,6 +1127,12 @@ page_not_uptodate:
 		goto success;
 	}
 
+	if (PageAgain(page)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry_find;
+	}
+
 	if (!mapping->a_ops->readpage(file, page)) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
@@ -1228,6 +1241,12 @@ page_not_uptodate:
 		goto success;
 	}
 
+	if (PageAgain(page)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry_find;
+	}
+
 	if (!mapping->a_ops->readpage(file, page)) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
@@ -1436,6 +1455,11 @@ retry:
 	if (PageUptodate(page)) {
 		unlock_page(page);
 		goto out;
+	}
+	if (PageAgain(page)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry;
 	}
 	err = filler(data, page);
 	if (err < 0) {
diff -dpur linux-2.6.0-test11/mm/page_alloc.c linux-2.6.0-test11-mh/mm/page_alloc.c
--- linux-2.6.0-test11/mm/page_alloc.c	Thu Nov 27 05:42:56 2003
+++ linux-2.6.0-test11-mh/mm/page_alloc.c	Fri Nov 28 17:45:42 2003
@@ -31,6 +31,7 @@
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/proc_fs.h>
 
 #include <asm/tlbflush.h>
 
@@ -52,6 +53,9 @@ EXPORT_SYMBOL(nr_swap_pages);
  */
 struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
 EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+#endif
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
@@ -411,7 +415,9 @@ int is_head_of_free_region(struct page *
 	spin_unlock_irqrestore(&zone->lock, flags);
         return 0;
 }
+#endif
 
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
  */
@@ -512,9 +518,28 @@ static struct page *buffered_rmqueue(str
 		mod_page_state(pgalloc, 1 << order);
 		prep_new_page(page, order);
 	}
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (page != NULL && ! zone_active[page->flags >> ZONE_SHIFT])
+		printk("alloc_page from disabled zone: %p\n", page);
+#endif
 	return page;
 }
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+	int i;
+
+	for(i = 0; ; i++) {
+		if (zone_table[i] == z)
+			return zone_active[i];
+		if (zone_table[i] == NULL)
+			BUG();
+	}
+}
+#endif
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  *
@@ -562,6 +587,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		struct zone *z = zones[i];
 		unsigned long local_low;
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+		if (! zone_activep(z))
+			continue;
+#endif
 		/*
 		 * This is the fabled 'incremental min'. We let real-time tasks
 		 * dip their real-time paws a little deeper into reserves.
@@ -590,6 +619,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		unsigned long local_min;
 		struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+		if (! zone_activep(z))
+			continue;
+#endif
 		local_min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
 			local_min >>= 2;
@@ -613,6 +646,10 @@ rebalance:
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+			if (! zone_activep(z))
+				continue;
+#endif
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
@@ -638,6 +675,10 @@ rebalance:
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+		if (! zone_activep(z))
+			continue;
+#endif
 		min += z->pages_min;
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
@@ -1076,6 +1117,9 @@ static int __init build_zonelists_node(p
 static void __init build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+	struct zone *zone;
+#endif
 
 	local_node = pgdat->node_id;
 	printk("Building zonelist for node : %d\n", local_node);
@@ -1091,7 +1135,7 @@ static void __init build_zonelists(pg_da
 			k = ZONE_HIGHMEM;
 		if (i & __GFP_DMA)
 			k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
  		j = build_zonelists_node(pgdat, zonelist, j, k);
  		/*
  		 * Now we build the zonelist so that it contains the zones
@@ -1107,6 +1151,23 @@ static void __init build_zonelists(pg_da
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  
 		zonelist->zones[j++] = NULL;
+#else
+		for(; k >= 0; k--) {
+			zone = pgdat->node_zones + k;
+			if (zone->present_pages)
+				zonelist->zones[j++] = zone;
+			for (node = local_node + 1; node < numnodes; node++) {
+				zone = NODE_DATA(node)->node_zones + k;
+				if (zone->present_pages)
+					zonelist->zones[j++] = zone;
+			}
+			for (node = 0; node < local_node; node++) {
+				zone = NODE_DATA(node)->node_zones + k;
+				if (zone->present_pages)
+					zonelist->zones[j++] = zone;
+			}
+		}
+#endif
 	} 
 }
 
@@ -1252,6 +1313,9 @@ static void __init free_area_init_core(s
 		unsigned long batch;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+		zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
@@ -1644,3 +1708,145 @@ int min_free_kbytes_sysctl_handler(ctl_t
 	setup_per_zone_pages_min();
 	return 0;
 }
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int mhtest_read(char *page, char **start, off_t off, int count,
+    int *eof, void *data)
+{
+	char *p;
+	int i, len;
+	const struct zone *z;
+
+	p = page;
+	for(i = 0; ; i++) {
+		z = zone_table[i];
+		if (z == NULL)
+			break;
+		if (! z->present_pages)
+			/* skip empty zone */
+			continue;
+		len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+		    zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+		    z->present_pages);
+		p += len;
+	}
+	len = p - page;
+
+	if (len <= off + count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len < 0)
+		len = 0;
+	if (len > count)
+		len = count;
+
+	return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+    unsigned long count, void *data)
+{
+	unsigned long idx;
+	char buf[64], *p;
+	int i;
+	struct list_head *l;
+
+	if (count > sizeof(buf) - 1)
+		count = sizeof(buf) - 1;
+	if (copy_from_user(buf, buffer, count))
+		return -EFAULT;
+
+	buf[count] = 0;
+
+	p = strchr(buf, ' ');
+	if (p == NULL)
+		goto out;
+
+	*p++ = '\0';
+	idx = simple_strtoul(p, NULL, 0);
+
+	if (idx > MAX_NR_ZONES*MAX_NUMNODES) {
+		printk("Argument out of range\n");
+		goto out;
+	}
+	if (strcmp(buf, "disable") == 0) {
+		printk("disable %d\n", idx);
+		/* XXX */
+		for (i = 0; i < NR_CPUS; i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[0];	/* hot */
+			pcp->low = pcp->high = 0;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[1];	/* cold */
+			pcp->low = pcp->high = 0;
+		}
+		zone_active[idx] = 0;
+		zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+	} else if (strcmp(buf, "purge") == 0) {
+		if (zone_active[idx])
+			printk("Zone %d still active (proceeding anyway)\n",
+			    idx);
+		printk("purge %d\n", idx);
+		wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+		/* XXX overkill, but who cares? */
+		on_each_cpu(drain_local_pages, NULL, 1, 1);
+	} else if (strcmp(buf, "enable") == 0) {
+		printk("enable %d\n", idx);
+		zone_active[idx] = 1;
+		zone_table[idx]->pages_high = 
+		    zone_table[idx]->pages_min * 3;
+		/* XXX */
+		for (i = 0; i < NR_CPUS; i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[0];	/* hot */
+			pcp->low = 2 * pcp->batch;
+			pcp->high = 6 * pcp->batch;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[1];	/* cold */
+			pcp->high = 2 * pcp->batch;
+		}
+	} else if (strcmp(buf, "remap") == 0) {
+		on_each_cpu(drain_local_pages, NULL, 1, 1);
+		kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+	} else if (strcmp(buf, "active") == 0) {
+		if (zone_table[idx] == NULL)
+			goto out;
+		spin_lock_irq(&zone_table[idx]->lru_lock);
+		i = 0;
+		list_for_each(l, &zone_table[idx]->active_list) {
+			printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+			i++;
+			if (i == 10)
+				break;
+		}
+		spin_unlock_irq(&zone_table[idx]->lru_lock);
+		printk("\n");
+	} else if (strcmp(buf, "inuse") == 0) {
+		if (zone_table[idx] == NULL)
+			goto out;
+		for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+			if (page_count(&zone_table[idx]->zone_mem_map[i]))
+				printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+		printk("\n");
+	}
+out:
+	return count;
+}
+
+static int __init procmhtest_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("memhotplug", 0, NULL);
+	if (entry == NULL)
+		return -1;
+
+	entry->read_proc = &mhtest_read;
+	entry->write_proc = &mhtest_write;
+	return 0;
+}
+__initcall(procmhtest_init);
+#endif
diff -dpur linux-2.6.0-test11/mm/shmem.c linux-2.6.0-test11-mh/mm/shmem.c
--- linux-2.6.0-test11/mm/shmem.c	Thu Nov 27 05:43:41 2003
+++ linux-2.6.0-test11-mh/mm/shmem.c	Fri Nov 28 17:45:42 2003
@@ -80,7 +80,15 @@ static inline struct page *shmem_dir_all
 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
 	 */
-	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#ifdef CONFIG_MEMHOTPLUGTEST
+	struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+	    PAGE_CACHE_SHIFT-PAGE_SHIFT);
+	printk("shmem_dir_alloc: %lx\n", (unsigned long)p);
+	return p;
+#else
+	return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+	    PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
 }
 
 static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0-test11/mm/truncate.c linux-2.6.0-test11-mh/mm/truncate.c
--- linux-2.6.0-test11/mm/truncate.c	Thu Nov 27 05:45:39 2003
+++ linux-2.6.0-test11-mh/mm/truncate.c	Fri Nov 28 17:45:42 2003
@@ -132,6 +132,10 @@ void truncate_inode_pages(struct address
 			next++;
 			if (TestSetPageLocked(page))
 				continue;
+			if (PageAgain(page)) {
+				unlock_page(page);
+				continue;
+			}
 			if (PageWriteback(page)) {
 				unlock_page(page);
 				continue;
@@ -165,6 +169,14 @@ void truncate_inode_pages(struct address
 			struct page *page = pvec.pages[i];
 
 			lock_page(page);
+			if (PageAgain(page)) {
+				unsigned long index = page->index;
+
+				unlock_page(page);
+				put_page(page);
+				page = find_lock_page(mapping, index);
+				pvec.pages[i] = page;
+			}
 			wait_on_page_writeback(page);
 			if (page->index > next)
 				next = page->index;
@@ -255,6 +267,14 @@ void invalidate_inode_pages2(struct addr
 			struct page *page = pvec.pages[i];
 
 			lock_page(page);
+			if (PageAgain(page)) {
+				unsigned long index = page->index;
+
+				unlock_page(page);
+				put_page(page);
+				page = find_lock_page(mapping, index);
+				pvec.pages[i] = page;
+			}
 			if (page->mapping == mapping) {	/* truncate race? */
 				wait_on_page_writeback(page);
 				next = page->index + 1;
diff -dpur linux-2.6.0-test11/mm/vmalloc.c linux-2.6.0-test11-mh/mm/vmalloc.c
--- linux-2.6.0-test11/mm/vmalloc.c	Thu Nov 27 05:44:23 2003
+++ linux-2.6.0-test11-mh/mm/vmalloc.c	Fri Nov 28 17:45:42 2003
@@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
+#ifdef CONFIG_MEMHOTPLUGTEST
+       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
 }
 
 EXPORT_SYMBOL(vmalloc);
diff -dpur linux-2.6.0-test11/mm/vmscan.c linux-2.6.0-test11-mh/mm/vmscan.c
--- linux-2.6.0-test11/mm/vmscan.c	Thu Nov 27 05:43:06 2003
+++ linux-2.6.0-test11-mh/mm/vmscan.c	Fri Nov 28 17:55:35 2003
@@ -36,6 +36,9 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -285,6 +288,8 @@ shrink_list(struct list_head *page_list,
 			goto keep_locked;
 
 		pte_chain_lock(page);
+		if ((! zone_activep(page_zone(page))) && page_mapped(page))
+			page_referenced(page);
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
@@ -589,7 +594,7 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+refill_inactive_zone(struct zone *zone, int nr_pages_in,
 			struct page_state *ps, int priority)
 {
 	int pgmoved;
@@ -607,6 +612,12 @@ refill_inactive_zone(struct zone *zone, 
 
 	lru_add_drain();
 	pgmoved = 0;
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (! zone_activep(zone)) {
+		nr_pages = nr_pages_in = zone->present_pages - zone->free_pages;
+		printk("Purging active list of disabled zone\n");
+	}
+#endif
 	spin_lock_irq(&zone->lru_lock);
 	while (nr_pages && !list_empty(&zone->active_list)) {
 		page = list_entry(zone->active_list.prev, struct page, lru);
@@ -658,12 +669,20 @@ refill_inactive_zone(struct zone *zone, 
 	 */
 	if (swap_tendency >= 100)
 		reclaim_mapped = 1;
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (! zone_activep(zone))
+		reclaim_mapped = 1;
+#endif
 
 	while (!list_empty(&l_hold)) {
 		page = list_entry(l_hold.prev, struct page, lru);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
 			pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+			if (! zone_activep(zone))
+				page_referenced(page);	/* XXX */
+#endif
 			if (page_mapped(page) && page_referenced(page)) {
 				pte_chain_unlock(page);
 				list_add(&page->lru, &l_active);
@@ -767,6 +786,11 @@ shrink_zone(struct zone *zone, int max_s
 	ratio = (unsigned long)nr_pages * zone->nr_active /
 				((zone->nr_inactive | 1) * 2);
 	atomic_add(ratio+1, &zone->refill_counter);
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (! zone_activep(zone))
+		/* XXX */
+		atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter);
+#endif
 	if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
 		int count;
 
@@ -1048,6 +1072,439 @@ int kswapd(void *p)
 		balance_pgdat(pgdat, 0, &ps);
 	}
 }
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void
+print_buffer(struct page* page)
+{
+	struct address_space* mapping = page->mapping;
+	struct buffer_head *bh, *head;
+
+	spin_lock(&mapping->private_lock);
+	bh = head = page_buffers(page);
+	printk("buffers:");
+	do {
+		printk(" %lx %d\n", bh->b_state, atomic_read(&bh->b_count));
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+	printk("\n");
+	spin_unlock(&mapping->private_lock);
+}
+/* try to remap a page. returns non-zero on failure */
+int remap_onepage(struct page *page)
+{
+	struct page *newpage;
+	struct zone *zone;
+	struct address_space *mapping = page->mapping;
+	char *np, *op;
+	void *p;
+	int waitcnt, error = -1;
+
+	newpage = alloc_page(GFP_HIGHUSER);
+	if (newpage == NULL)
+		return -ENOMEM;
+	if (TestSetPageLocked(newpage))
+		BUG();
+	lock_page(page);
+
+	if (! PagePrivate(page) && PageWriteback(page))
+#ifdef CONFIG_KDB
+		KDB_ENTER();
+#else
+		BUG();
+#endif
+	if (PagePrivate(page)) {
+		waitcnt = 100;
+		while (PageWriteback(page)) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(10);
+			__set_current_state(TASK_RUNNING);
+			if (! --waitcnt)
+				goto radixfail;
+		}
+
+		/* XXX copied from shrink_list() */
+		if (PageDirty(page) &&
+		    is_page_cache_freeable(page) &&
+		    mapping != NULL &&
+		    mapping->a_ops->writepage != NULL) {
+			spin_lock(&mapping->page_lock);
+			if (test_clear_page_dirty(page)) {
+				int res;
+				struct writeback_control wbc = {
+					.sync_mode = WB_SYNC_NONE,
+					.nr_to_write = SWAP_CLUSTER_MAX,
+					.nonblocking = 1,
+					.for_reclaim = 1,
+				};
+
+				list_move(&page->list, &mapping->locked_pages);
+				spin_unlock(&mapping->page_lock);
+
+				SetPageReclaim(page);
+				res = mapping->a_ops->writepage(page, &wbc);
+
+				if (res == WRITEPAGE_ACTIVATE) {
+					ClearPageReclaim(page);
+					goto radixfail;
+				}
+				if (!PageWriteback(page)) {
+					/* synchronous write or broken a_ops? */
+					ClearPageReclaim(page);
+				}
+				lock_page(page);
+				if (! PagePrivate(page))
+					goto bufferdone;
+			} else
+				spin_unlock(&mapping->page_lock);
+		}
+
+		waitcnt = 100;
+		while (1) {
+			if (try_to_release_page(page, GFP_KERNEL))
+				break;
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(10);
+			__set_current_state(TASK_RUNNING);
+			if (! --waitcnt) {
+				print_buffer(page);
+				goto radixfail;
+			}
+		}
+	}
+bufferdone:
+	if (mapping == NULL) {
+		/* The page is an anon page. Allocate swap entry. */
+		if (!add_to_swap(page))
+			goto radixfail;
+		mapping = page->mapping;
+	}
+	error = radix_tree_preload(GFP_KERNEL);
+	if (error)
+		goto radixfail;
+	if (PagePrivate(page)) /* XXX */
+		BUG();
+
+	/* should {__add_to,__remove_from}_page_cache be used instead? */
+	spin_lock(&mapping->page_lock);
+	if (mapping != page->mapping)
+		printk("mapping changed %p -> %p, page %p\n",
+		    mapping, page->mapping, page);
+	if (radix_tree_delete(&mapping->page_tree, page->index) == NULL) {
+		/* Page truncated. */
+		spin_unlock(&mapping->page_lock);
+		radix_tree_preload_end();
+		goto radixfail;
+	}
+	/* don't __put_page(page) here. truncate may be in progress */
+	newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+	    ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+	    ~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);
+
+	/* list_del(&page->list); XXX */
+	radix_tree_insert(&mapping->page_tree, page->index, newpage);
+	page_cache_get(newpage);
+	newpage->mapping = mapping;
+	newpage->index = page->index;
+	if (PageDirty(page))
+		list_add(&newpage->list, &mapping->dirty_pages);
+	else
+		list_add(&newpage->list, &mapping->clean_pages);
+	spin_unlock(&mapping->page_lock);
+	radix_tree_preload_end();
+
+	pte_chain_lock(page);
+	if (page_mapped(page)) {
+		while ((error = try_to_unmap(page)) == SWAP_AGAIN) {
+			pte_chain_unlock(page);
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+			__set_current_state(TASK_RUNNING);
+			pte_chain_lock(page);
+		}
+		if (error == SWAP_FAIL) {
+			pte_chain_unlock(page); /* XXX */
+			/* either during mremap or mlocked */
+			goto unmapfail;
+		}
+	}
+	pte_chain_unlock(page);
+	if (PagePrivate(page))
+		printk("buffer reappeared\n");
+
+	unlock_page(page);	/* no lock needed while waiting page count */
+
+	waitcnt = 1;
+wait_again:
+	while (page_count(page) > 2) {
+		waitcnt++;
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout(1);
+		if ((waitcnt % 5000) == 0) {
+			printk("remap_onepage: still waiting on %p %d\n", page, waitcnt);
+			break;
+		}
+		if (PagePrivate(page))
+			break;		/* see below */
+	}
+
+	lock_page(page);
+	BUG_ON(page_count(page) == 0);
+	if (PagePrivate(page))
+		try_to_release_page(page, GFP_KERNEL);
+	if (page_count(page) > 2) {
+		if (waitcnt > 50000)
+			goto unmapfail;
+		unlock_page(page);
+		goto wait_again;
+	}
+	if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+		KDB_ENTER();
+#else
+		BUG();
+#endif
+	if (page_count(page) == 1) {
+		/* page has been truncated.  free both pages. */
+		spin_lock(&mapping->page_lock);
+		p = radix_tree_lookup(&mapping->page_tree, newpage->index);
+		if (p != NULL) {
+			/* new cache page appeared after truncation */
+			printk("page %p newpage %p radix %p\n",
+			    page, newpage, p);
+			BUG_ON(p == newpage);
+		}
+		list_del(&newpage->list);
+		put_page(newpage);
+		if (page_count(newpage) != 1) {
+			printk("newpage count %d != 1, %p\n",
+			    page_count(newpage), newpage);
+			BUG();
+		}
+		/* No need to do page->list. remove_from_page_cache did. */
+		newpage->mapping = page->mapping = NULL;
+		spin_unlock(&mapping->page_lock);
+		ClearPageActive(page);
+		ClearPageActive(newpage);
+		unlock_page(page);
+		unlock_page(newpage);
+		put_page(page);
+		put_page(newpage);
+		return 0;
+	}
+
+	spin_lock(&mapping->page_lock);
+	list_del(&page->list); /* XXX */
+	page->mapping = NULL;
+	spin_unlock(&mapping->page_lock);
+	unlock_page(page);
+
+	np = kmap_atomic(newpage, KM_REMAP0);
+	op = kmap_atomic(page, KM_REMAP1);
+	if (np == NULL || op == NULL) {	/* XXX */
+		printk("%p %p %p %p\n", np, op, newpage, page);
+		BUG();
+	}
+	memcpy(np, op, PAGE_SIZE);
+	kunmap_atomic(page, KM_REMAP1);
+	kunmap_atomic(newpage, KM_REMAP0);
+	ClearPageActive(page);
+	__put_page(page);
+	put_page(page);
+
+	/* We are done. Finish and let the waiters run. */
+	SetPageUptodate(newpage);
+	/* XXX locking order correct? */
+	zone = page_zone(newpage);
+	spin_lock_irq(&zone->lru_lock);
+	if (PageActive(newpage)) {
+		list_add(&newpage->lru, &zone->active_list);
+		zone->nr_active++;
+	} else {
+		list_add(&newpage->lru, &zone->inactive_list);
+		zone->nr_inactive++;
+	}
+	SetPageLRU(newpage);
+	spin_unlock_irq(&zone->lru_lock);
+	unlock_page(newpage);
+	page_cache_release(newpage);
+	return 0;
+
+unmapfail:
+	/*
+	 * Try to unwind by notifying waiters.  If someone misbehaves,
+	 * we die.
+	 */
+	error = radix_tree_preload(GFP_KERNEL);
+	if (error)
+		BUG();
+	/* should {__add_to,__remove_from}_page_cache be used instead? */
+	spin_lock(&mapping->page_lock);
+	/* list_del(&newpage->list); */
+	if (radix_tree_delete(&mapping->page_tree, page->index) == NULL)
+		/* Hold extra count to handle truncate */
+		page_cache_get(newpage);
+	radix_tree_insert(&mapping->page_tree, page->index, page);
+	/* no page_cache_get(page); needed */
+	radix_tree_preload_end();
+	spin_unlock(&mapping->page_lock);
+
+	SetPageAgain(newpage);
+	/* XXX unmap needed?  No, it shouldn't.  Handled by fault handlers. */
+	unlock_page(newpage);
+
+	waitcnt = 1;
+	for(; page_count(newpage) > 2; waitcnt++) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout(1);
+		if ((waitcnt % 10000) == 0) {
+			printk("You are hosed.\n");
+			printk("newpage %p\n", newpage);
+			BUG();
+		}
+	}
+	BUG_ON(PageUptodate(newpage));
+	ClearPageDirty(newpage);
+	ClearPageActive(newpage);
+	spin_lock(&mapping->page_lock);
+	newpage->mapping = NULL;
+	if (page_count(newpage) == 1) {
+		printk("newpage %p truncated. page %p\n", newpage, page);
+		BUG();
+	}
+	list_del(&newpage->list);
+	spin_unlock(&mapping->page_lock);
+	unlock_page(page);
+	__put_page(newpage);
+	__free_page(newpage);
+	return 1;
+	
+radixfail:
+	unlock_page(page);
+	unlock_page(newpage);
+	__free_page(newpage);
+	return 1;
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+	int cpu = get_cpu();
+
+	schedule_work(&lru_drain_wq[cpu]);
+	put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+	struct zone *zone = p;
+	struct page *page, *page1;
+	struct list_head *l;
+	int active, i, nr_failed = 0;
+	int fastmode = 100;
+	LIST_HEAD(failedp);
+
+	daemonize("remap%d", zone->zone_start_pfn);
+	if (atomic_read(&remapd_count) > 0) {
+		printk("remapd already running\n");
+		return 0;
+	}
+	atomic_inc(&remapd_count);
+	on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+	while(nr_failed < 100) {
+		spin_lock_irq(&zone->lru_lock);
+		for(active = 0; active < 2; active++) {
+			l = active ? &zone->active_list :
+			    &zone->inactive_list;
+			for(i = 0; ! list_empty(l) && i < 10; i++) {
+				page = list_entry(l->prev, struct page, lru);
+				if (fastmode && PageLocked(page)) {
+					page1 = page;
+					while (fastmode && PageLocked(page)) {
+						page =
+						    list_entry(page->lru.prev,
+						    struct page, lru);
+						fastmode--;
+						if (&page->lru == l) {
+							/* scanned the whole
+							   list */
+							page = page1;
+							break;
+						}
+						if (page == page1)
+							BUG();
+					}
+					if (! fastmode) {
+						printk("used up fastmode\n");
+						page = page1;
+					}
+				}
+				if (! TestClearPageLRU(page))
+					BUG();
+				list_del(&page->lru);
+				if (page_count(page) == 0) {
+					/* the page is in pagevec_release();
+					   shrink_cache says so. */
+					SetPageLRU(page);
+					list_add(&page->lru, l);
+					continue;
+				}
+				if (active)
+					zone->nr_active--;
+				else
+					zone->nr_inactive--;
+				page_cache_get(page);
+				spin_unlock_irq(&zone->lru_lock);
+				goto got_page;
+			}
+		}
+		spin_unlock_irq(&zone->lru_lock);
+		break;
+
+	got_page:
+		if (remap_onepage(page)) {
+			nr_failed++;
+			list_add(&page->lru, &failedp);
+		}
+	}
+	if (list_empty(&failedp))
+		goto out;
+
+	while (! list_empty(&failedp)) {
+		spin_lock_irq(&zone->lru_lock);
+		page = list_entry(failedp.prev, struct page, lru);
+		list_del(&page->lru);
+		if (PageActive(page)) {
+			list_add(&page->lru, &zone->active_list);
+			zone->nr_active++;
+		} else {
+			list_add(&page->lru, &zone->inactive_list);
+			zone->nr_inactive++;
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		spin_unlock_irq(&zone->lru_lock);
+		page_cache_release(page);
+	}
+out:
+	atomic_dec(&remapd_count);
+	return 0;
+}
+			
+static int __init remapd_init(void)
+{
+	int i;
+
+	for(i = 0; i < NR_CPUS; i++)
+		INIT_WORK(&lru_drain_wq[i], lru_add_drain, NULL);
+	return 0;
+}
+
+module_init(remapd_init);
+#endif
 
 /*
  * A zone is low on free memory, so wake its kswapd task to service it.

* Re: memory hotremove prototype, take 3
@ 2003-12-01 20:12 Luck, Tony
  2003-12-02  3:01 ` IWAMOTO Toshihiro
  2003-12-02 22:26 ` Yasunori Goto
  0 siblings, 2 replies; 17+ messages in thread
From: Luck, Tony @ 2003-12-01 20:12 UTC (permalink / raw)
  To: linux-kernel; +Cc: Pavel Machek

Pavel Machek wrote:

> hotunplug seems cool... How do you deal with kernel data structures in
> memory "to be removed"? Or you simply don't allow kmalloc() to
> allocate there?

You guessed right.  Hot removable memory can only be allocated
for uses that we can easily re-allocate.  So kmalloc() etc. have
to get memory from some area that we promise never to try to
remove.
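
The prototype at the start of the thread approximates this with a
per-zone active flag that the page allocator consults (the
zone_activep() checks added to __alloc_pages()).  A toy model of that
allocation-side filter, with made-up names, purely for illustration:

/* Toy model (not kernel code): never allocate from a zone whose
 * "active" flag has been cleared in preparation for removal. */
#include <stdio.h>
#include <stdbool.h>

struct zone { const char *name; bool active; long free_pages; };

static struct zone *alloc_from(struct zone *zones, int nzones)
{
	for (int i = 0; i < nzones; i++) {
		if (!zones[i].active)		/* skip zone being drained */
			continue;
		if (zones[i].free_pages > 0) {
			zones[i].free_pages--;
			return &zones[i];
		}
	}
	return NULL;	/* the real allocator would fall back to reclaim */
}

int main(void)
{
	struct zone zones[] = {
		{ "HotRemovable", false, 1024 },	/* disabled via /proc */
		{ "Normal",       true,  4 },
	};
	struct zone *z = alloc_from(zones, 2);

	printf("allocated from %s\n", z ? z->name : "nothing");
	return 0;
}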

> During hotunplug, you copy pages to a new location.  Would it simplify
> code if you forced them to be swapped out, instead? [Yep, it would be
> slower...]

There are some pages that will have to be copied (e.g. pages that
the user has mlock()'d must still be locked in their new location;
the same goes for hugetlbfs pages).

-Tony Luck  


* RE: memory hotremove prototype, take 3
@ 2003-12-03  5:19 Perez-Gonzalez, Inaky
  0 siblings, 0 replies; 17+ messages in thread
From: Perez-Gonzalez, Inaky @ 2003-12-03  5:19 UTC (permalink / raw)
  To: Yasunori Goto, Pavel Machek
  Cc: linux-kernel, Luck, Tony, IWAMOTO Toshihiro, Hirokazu Takahashi,
	Linux Hotplug Memory Support


> From: Yasunori Goto

> IMHO, to hot-remove memory, memory should be divided into
> hotpluggable and non-hotpluggable attributes, and memory of each
> attribute should be allocated from its own unit (e.g. a node).

Why? I still don't get that -- we should be able to use the virtual
addressing mechanism of any CPU to swap under the rug any virtual
address without needing to do anything more than allocate a page frame
for the new physical location (I am ignoring here devices that are 
directly accessing physical memory--a callback in the device model could
be added to require them to reallocate their buffers).

Or am I deadly and naively wrong?

Iñaky Pérez-González -- Not speaking for Intel -- all opinions are my own (and my fault)

* RE: memory hotremove prototype, take 3
@ 2003-12-03 17:57 Luck, Tony
  0 siblings, 0 replies; 17+ messages in thread
From: Luck, Tony @ 2003-12-03 17:57 UTC (permalink / raw)
  To: Perez-Gonzalez, Inaky, Yasunori Goto, Pavel Machek
  Cc: linux-kernel, IWAMOTO Toshihiro, Hirokazu Takahashi,
	Linux Hotplug Memory Support

> > IMHO, to hot-remove memory, memory should be divided into
> > hotpluggable and non-hotpluggable attributes, and memory of each
> > attribute should be allocated from its own unit (e.g. a node).
> 
> Why? I still don't get that -- we should be able to use the virtual
> addressing mechanism of any CPU to swap under the rug any virtual
> address without needing to do anything more than allocate a page frame
> for the new physical location (I am ignoring here devices that are 
> directly accessing physical memory--a callback in the device 
> model could
> be added to require them to reallocate their buffers).
> 
> Or am I deadly and naively wrong?

Most (all?) Linux implementations make use of a large area
of memory that is mapped 1:1 (with a constant offset) from
kernel virtual space to physical space.  Kernel memory is
allocated in this virtual area.  If the processor supports
some form of large pages in the TLB, this 1:1 area uses the
large pages ... so it would require some major surgery to
remap portions of this area, and would have a negative effect
on performance (since you'd take more TLB misses).  It might
even be a correctness issue if the structures in this area
were needed to handle small page TLB faults in the area itself.
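
For concreteness, that "constant offset" mapping looks roughly like the
sketch below on i386 (PAGE_OFFSET 0xC0000000 is the usual default; this
is just an illustration of the arithmetic, not the kernel's actual
macros):

/* Illustration of the 1:1 lowmem mapping: kernel virtual = physical +
 * PAGE_OFFSET, so conversion is pure arithmetic and the whole area is
 * normally covered by large TLB entries. */
#include <stdio.h>

#define PAGE_OFFSET 0xC0000000UL	/* default i386 user/kernel split */

static unsigned long pa_to_va(unsigned long pa) { return pa + PAGE_OFFSET; }
static unsigned long va_to_pa(unsigned long va) { return va - PAGE_OFFSET; }

int main(void)
{
	unsigned long pa = 0x01000000UL;	/* 16 MB physical */

	printf("phys 0x%08lx <-> virt 0x%08lx\n", pa, pa_to_va(pa));
	printf("virt 0x%08lx <-> phys 0x%08lx\n",
	       pa_to_va(pa), va_to_pa(pa_to_va(pa)));
	return 0;
}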

-Tony

* Re: memory hotremove prototype, take 3
@ 2003-12-10  0:45 Luck, Tony
  0 siblings, 0 replies; 17+ messages in thread
From: Luck, Tony @ 2003-12-10  0:45 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel

> If your target is NUMA, then you really, really need CONFIG_NONLINEAR.
> We don't support multiple pgdats per node, nor do I wish to, as it'll
> make an unholy mess ;-). With CONFIG_NONLINEAR, the discontiguities
> within a node are buried down further, so we have much less complexity
> to deal with from the main VM. The abstraction also keeps the poor
> VM engineers trying to read / write the code saner via simplicity ;-)
> 
> WRT generic discontigmem support (not NUMA), doing that via pgdats
> should really go away, as there's no real difference between the 
> chunks of physical memory as far as the page allocator is concerned.
> The plan is to use Daniel's nonlinear stuff to replace that, and keep
> the pgdats strictly for NUMA. Same would apply to hotpluggable zones - 
> I'd hate to end up with 512 pgdats of stuff that are really all the
> same memory types underneath.

I guess this all depends on whether you allow bits of memory on
nodes to be hot-plugged ... or insist on the whole node being
added/removed in one fell swoop.  I'd expect the latter to be
a more common model, and in that case the "pgdat-for-the-node" is
the same as the "pgdat-for-the-hot-plug-zone", so you don't have
a proliferation of pgdats to support hotplug.

> The real issue you have is the mapping of the struct pages - if we can
> achieve a non-contig mapping of the mem_map / lmem_map array, we should
> be able to take memory on and offline reasonably easy. If you're willing
> for a first implementation to pre-allocate the struct page array for 
> every possible virtual address, it makes life a lot easier.

On 64-bit systems with CONFIG_VIRTUAL_MEMMAP, this would be trivial,
and avoids the need for the extra level of indirection in the psection[]
and vsection[] arrays in CONFIG_NONLINEAR (ok ... it doesn't really get
rid of the indirection, as the page table lookup to access the virtual
mem_map effectively ends up doing the same thing).
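
Roughly, the indirection being compared is between indexing one
virtually contiguous mem_map (pfn_to_page is just vmem_map + pfn, with
holes handled by the page tables backing the array) and going through a
per-section table first.  A small standalone sketch of the table-based
lookup (the names and section size are illustrative, not the actual
CONFIG_NONLINEAR code):

/* Sketch of a section-indexed pfn_to_page(): a table maps each section
 * of the pfn space to the piece of mem_map backing it (NULL = hole). */
#include <stdio.h>
#include <stdlib.h>

struct page { unsigned long flags; };

#define SECTION_SHIFT	14			/* pages per section, illustrative */
#define SECTION_SIZE	(1UL << SECTION_SHIFT)
#define MAX_SECTIONS	8			/* tiny pfn space for the demo */

static struct page *section_mem_map[MAX_SECTIONS];

static struct page *pfn_to_page_section(unsigned long pfn)
{
	struct page *base = section_mem_map[pfn >> SECTION_SHIFT];

	return base ? base + (pfn & (SECTION_SIZE - 1)) : NULL;
}

int main(void)
{
	/* pretend only sections 0 and 5 have memory behind them */
	section_mem_map[0] = calloc(SECTION_SIZE, sizeof(struct page));
	section_mem_map[5] = calloc(SECTION_SIZE, sizeof(struct page));

	printf("pfn 10           -> %p\n", (void *)pfn_to_page_section(10));
	printf("pfn in a hole    -> %p\n", (void *)pfn_to_page_section(3 * SECTION_SIZE));
	printf("pfn in section 5 -> %p\n", (void *)pfn_to_page_section(5 * SECTION_SIZE + 1));
	return 0;
}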
 
> Adding the other layer of indirection for access the struct page array
> should fix up most of that, and is very easily abstracted out via the
> pfn_to_page macros and friends. I ripped out all the direct references
> to mem_map indexing already in 2.6, so it should all be nicely 
> abstracted out.

I did go back and look at the CONFIG_NONLINEAR patch again, and I
still can't see how to make it useful on 64-bit machines. Jack
Steiner asked a bunch of questions on how it would work for an
architecture like the SGI:
http://marc.theaimsgroup.com/?l=lse-tech&m=101828803506249&w=2
I don't remember seeing any answers on the list.  Assuming he
were to use a section size of 64MB (a convenient number for ia64)
he'd end up with psection[]/vsection[] tables with 8 million
entries each (@ 4 bytes/entry -> 64MB for the pair).

-Tony Luck  



Thread overview: 17+ messages
2003-12-01  3:41 memory hotremove prototype, take 3 IWAMOTO Toshihiro
2003-12-01 19:56 ` Pavel Machek
2003-12-03 19:41 ` Martin J. Bligh
2003-12-04  3:58   ` IWAMOTO Toshihiro
2003-12-04  5:38     ` Martin J. Bligh
2003-12-04 15:44       ` IWAMOTO Toshihiro
2003-12-04 17:12         ` Martin J. Bligh
2003-12-04 18:27         ` Jesse Barnes
2003-12-04 18:29           ` Martin J. Bligh
2003-12-04 18:59             ` Jesse Barnes
2003-12-01 20:12 Luck, Tony
2003-12-02  3:01 ` IWAMOTO Toshihiro
2003-12-02  6:43   ` Hirokazu Takahashi
2003-12-02 22:26 ` Yasunori Goto
2003-12-03  5:19 Perez-Gonzalez, Inaky
2003-12-03 17:57 Luck, Tony
2003-12-10  0:45 Luck, Tony
