Subject: memory hotremove prototype, take 2
From: IWAMOTO Toshihiro @ 2003-11-04 10:29 UTC
To: linux-mm, lhms-devel
Hi,
As you may know, I'm working on memory hotplug.
(See http://marc.theaimsgroup.com/?l=linux-kernel&m=106637967926960
for my original patch.)
I fixed several fatal bugs in the original patch and it works much
better. The updated version is included in this mail.
I confirmed a successful "make -j4" cross-build of NetBSD libc while
rotating active and inactive zones and remapping pages of inactive
zones.
However, I discovered that my page remapping approach has a fatal
flaw: ext2_rename can cause a deadlock.  Let me explain the situation.
To put it simply, what my patch does is:
for each page (called "oldpage" hereafter) do
1. allocate "newpage" as a replacement
2. increment oldpage's page count
3. rewrite oldpage's radix tree entry with that of newpage,
so that find_get_page and its friends return newpage
4. wait until page_count(oldpage) drops to 1
5. copy oldpage's content to newpage,
SetPageUptodate(newpage), unlock_page(newpage)
6. oldpage can be freed
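In C-like pseudocode the loop amounts to something like this (a
minimal sketch only; replace_radix_entry() and copy_page_content()
are made-up helper names, and the real implementation is
remap_onepage() in the patch below):

    /* Rough per-page sketch.  replace_radix_entry() and
     * copy_page_content() are hypothetical helpers; the rest are
     * ordinary 2.6 kernel calls. */
    int remap_one(struct page *oldpage)
    {
            struct page *newpage = alloc_page(GFP_HIGHUSER);   /* step 1 */
            if (newpage == NULL)
                    return -ENOMEM;
            if (TestSetPageLocked(newpage))    /* keep newpage locked     */
                    BUG();                     /* until the copy is done  */
            get_page(oldpage);                                 /* step 2 */
            replace_radix_entry(oldpage, newpage);             /* step 3 */
            while (page_count(oldpage) > 1) {                  /* step 4 */
                    current->state = TASK_INTERRUPTIBLE;
                    schedule_timeout(1);
            }
            copy_page_content(newpage, oldpage);               /* step 5 */
            SetPageUptodate(newpage);
            unlock_page(newpage);
            put_page(oldpage);                                 /* step 6 */
            return 0;
    }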
ext2_rename does:
old_de = ext2_find_entry (old_dir, old_dentry, &old_page);
:
:
err = ext2_add_link(new_dentry, old_inode);
ext2_find_entry increments old_page's page count, and ext2_add_link
finds and locks a directory page.  These two pages can be the same if
a file is renamed within a directory.  If the radix tree entry is
rewritten in the meantime (step 3 above), ext2_add_link gets newpage,
which stays locked until step 5 -- but step 5 waits on step 4, and
step 4 never finishes because ext2_rename still holds its reference
on old_page.  The result is a deadlock.
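Roughly, the two sides interleave like this (informal sketch):

    ext2_rename                          page remapping
    -----------                          --------------
    ext2_find_entry() takes a
    reference on old_page
                                         step 3: radix entry now points
                                                 to newpage (still locked)
                                         step 4: wait for old_page's
                                                 count to drop to 1
    ext2_add_link():
      find_get_page() returns newpage
      lock_page(newpage) sleeps until
      step 5 unlocks it -- but step 5
      cannot run while ext2_rename
      holds its old_page reference

Neither side can make progress.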
To solve this problem, I think I must implement one of the following.
1. Make step 4 time out and retry the page remapping.
   A timeout means the "newpage" returned by find_get_page
   is obsolete and the page should be looked up again.
   To achieve this, a page flag, say PG_again, could be used
   to notify callers (see the sketch after this list).
   Every portion of the kernel that calls find_get_page
   needs modification.
2. Record which processes incremented the page count and return
   oldpage when asked by such a process, to avoid the deadlock.
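For solution 1, every caller of find_get_page would need a retry loop
roughly like the following (a sketch under the assumption that
PageAgain() tests the proposed PG_again flag; nothing like this
exists yet):

    again:
            page = find_get_page(mapping, index);
            if (page == NULL)
                    return NULL;
            lock_page(page);
            if (PageAgain(page)) {
                    /* remapping timed out; this page is obsolete,
                       so drop it and look the page up again */
                    unlock_page(page);
                    page_cache_release(page);
                    goto again;
            }
            /* ... proceed as before ... */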
I'm going to try solution 1, but I hope there is a better way.
If you have any ideas or comments, please let me know.
$Id: memoryhotplug.patch,v 1.10 2003/10/31 09:54:35 iwamoto Exp $
diff -dpur linux-2.6.0-test9-kdb/arch/i386/Kconfig linux-2.6.0-test9-kdb-mh/arch/i386/Kconfig
--- linux-2.6.0-test9-kdb/arch/i386/Kconfig Thu Oct 30 11:14:47 2003
+++ linux-2.6.0-test9-kdb-mh/arch/i386/Kconfig Thu Oct 30 12:11:36 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
+config MEMHOTPLUGTEST
+ bool "Memory hotplug test"
+ default n
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y
config HAVE_ARCH_BOOTMEM_NODE
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y
config HIGHPTE
diff -dpur linux-2.6.0-test9-kdb/arch/i386/mm/discontig.c linux-2.6.0-test9-kdb-mh/arch/i386/mm/discontig.c
--- linux-2.6.0-test9-kdb/arch/i386/mm/discontig.c Sun Oct 26 03:43:49 2003
+++ linux-2.6.0-test9-kdb-mh/arch/i386/mm/discontig.c Thu Oct 30 14:58:10 2003
@@ -28,6 +28,7 @@
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
+#include <linux/proc_fs.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
@@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void)
return 1;
}
+int __init get_memcfg_numa_blks(void)
+{
+ int i, pfn;
+
+ printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ find_max_pfn();
+ if (max_pfn & (PTRS_PER_PTE - 1)) {
+ pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+ printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn);
+ max_pfn = pfn;
+ }
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ pfn = PFN_DOWN(1 << 30) * i;
+ node_start_pfn[i] = pfn;
+ pfn += PFN_DOWN(1 << 30);
+ if (pfn < max_pfn)
+ node_end_pfn[i] = pfn;
+ else {
+ node_end_pfn[i] = max_pfn;
+ i++;
+ printk("total %d blocks, max %d\n", i, max_pfn);
+ break;
+ }
+ }
+
+ /* Fill in the physnode_map with our simplistic memory model,
+ * all memory is in node 0.
+ */
+ for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+ pfn += PAGES_PER_ELEMENT)
+ {
+ physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+ }
+
+ /* Indicate there is one node available. */
+ node_set_online(0);
+ numnodes = i;
+
+ return 1;
+}
+
/*
* Find the highest page frame number we have available for the node
*/
@@ -183,6 +227,8 @@ static void __init register_bootmem_low_
}
}
+static struct kcore_list numa_kc;
+
void __init remap_numa_kva(void)
{
void *vaddr;
@@ -196,7 +242,11 @@ void __init remap_numa_kva(void)
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
+ memset(node_remap_start_vaddr[node], 0,
+ node_remap_size[node] * PAGE_SIZE);
}
+ kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+ node_remap_offset[numnodes - 1] << PAGE_SHIFT);
}
static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0-test9-kdb/include/asm-i386/kmap_types.h linux-2.6.0-test9-kdb-mh/include/asm-i386/kmap_types.h
--- linux-2.6.0-test9-kdb/include/asm-i386/kmap_types.h Thu Oct 30 11:14:47 2003
+++ linux-2.6.0-test9-kdb-mh/include/asm-i386/kmap_types.h Thu Oct 30 12:22:27 2003
@@ -25,7 +25,13 @@ D(11) KM_IRQ1,
D(12) KM_SOFTIRQ0,
D(13) KM_SOFTIRQ1,
D(14) KM_KDB,
+#ifdef CONFIG_MEMHOTPLUGTEST
+D(15) KM_REMAP0,
+D(16) KM_REMAP1,
+D(17) KM_TYPE_NR,
+#else
D(15) KM_TYPE_NR
+#endif
};
#undef D
diff -dpur linux-2.6.0-test9-kdb/include/asm-i386/mmzone.h linux-2.6.0-test9-kdb-mh/include/asm-i386/mmzone.h
--- linux-2.6.0-test9-kdb/include/asm-i386/mmzone.h Sun Oct 26 03:43:39 2003
+++ linux-2.6.0-test9-kdb-mh/include/asm-i386/mmzone.h Thu Oct 30 12:42:06 2003
@@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to
#endif /* CONFIG_X86_NUMAQ */
extern int get_memcfg_numa_flat(void );
+#ifdef CONFIG_MEMHOTPLUGTEST
+extern int get_memcfg_numa_blks(void);
+#endif
+
/*
* This allows any one NUMA architecture to be compiled
* for, and still fall back to the flat function if it
@@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void)
return;
#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+ get_memcfg_numa_blks();
+ return;
+#endif
get_memcfg_numa_flat();
}
diff -dpur linux-2.6.0-test9-kdb/include/asm-i386/numnodes.h linux-2.6.0-test9-kdb-mh/include/asm-i386/numnodes.h
--- linux-2.6.0-test9-kdb/include/asm-i386/numnodes.h Sun Oct 26 03:43:02 2003
+++ linux-2.6.0-test9-kdb-mh/include/asm-i386/numnodes.h Thu Oct 30 12:32:27 2003
@@ -13,6 +13,10 @@
/* Max 8 Nodes */
#define NODES_SHIFT 3
+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT 3
+
#endif /* CONFIG_X86_NUMAQ */
#endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0-test9-kdb/include/linux/mm.h linux-2.6.0-test9-kdb-mh/include/linux/mm.h
--- linux-2.6.0-test9-kdb/include/linux/mm.h Sun Oct 26 03:42:50 2003
+++ linux-2.6.0-test9-kdb-mh/include/linux/mm.h Thu Oct 30 12:11:37 2003
@@ -219,7 +219,14 @@ struct page {
*/
#define put_page_testzero(p) \
({ \
- BUG_ON(page_count(p) == 0); \
+ if (page_count(p) == 0) { \
+ int i; \
+ printk("Page: %lx ", (long)p); \
+ for(i = 0; i < sizeof(struct page); i++) \
+ printk(" %02x", ((unsigned char *)p)[i]); \
+ printk("\n"); \
+ BUG(); \
+ } \
atomic_dec_and_test(&(p)->count); \
})
@@ -620,6 +627,12 @@ static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
}
+#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+#define page_trace(p) page_trace_func(p, __FUNCTION__, __LINE__)
+extern void page_trace_func(const struct page *, const char *, int);
+#else
+#define page_trace(p) do { } while(0)
#endif
#endif /* __KERNEL__ */
diff -dpur linux-2.6.0-test9-kdb/include/linux/mmzone.h linux-2.6.0-test9-kdb-mh/include/linux/mmzone.h
--- linux-2.6.0-test9-kdb/include/linux/mmzone.h Sun Oct 26 03:43:49 2003
+++ linux-2.6.0-test9-kdb-mh/include/linux/mmzone.h Thu Oct 30 12:11:37 2003
@@ -360,6 +360,10 @@ static inline unsigned int num_online_me
return num;
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */
#define node_online(node) \
diff -dpur linux-2.6.0-test9-kdb/ipc/util.c linux-2.6.0-test9-kdb-mh/ipc/util.c
--- linux-2.6.0-test9-kdb/ipc/util.c Sun Oct 26 03:43:27 2003
+++ linux-2.6.0-test9-kdb-mh/ipc/util.c Thu Oct 30 12:11:37 2003
@@ -324,6 +324,9 @@ void* ipc_rcu_alloc(int size)
if (out) out += sizeof(struct ipc_rcu_kmalloc);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ printk("ipc_rcu_alloc: %lx\n", (unsigned long)out);
+#endif
return out;
}
diff -dpur linux-2.6.0-test9-kdb/mm/memory.c linux-2.6.0-test9-kdb-mh/mm/memory.c
--- linux-2.6.0-test9-kdb/mm/memory.c Thu Oct 30 11:27:52 2003
+++ linux-2.6.0-test9-kdb-mh/mm/memory.c Thu Oct 30 12:11:37 2003
@@ -420,6 +420,17 @@ zap_pte_range(struct mmu_gather *tlb, pm
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page, ptep);
+#if 1 // debug
+ /* Validate page */
+ {
+ struct zone *z = page_zone(page);
+ int idx = page - z->zone_mem_map;
+ if (idx < 0 || idx >= z->spanned_pages) {
+ printk("zap_pte_range: %d %d\n", page->flags >> ZONE_SHIFT, idx);
+ BUG();
+ }
+ }
+#endif
tlb_remove_page(tlb, page);
}
}
diff -dpur linux-2.6.0-test9-kdb/mm/page_alloc.c linux-2.6.0-test9-kdb-mh/mm/page_alloc.c
--- linux-2.6.0-test9-kdb/mm/page_alloc.c Sun Oct 26 03:42:53 2003
+++ linux-2.6.0-test9-kdb-mh/mm/page_alloc.c Thu Oct 30 12:45:21 2003
@@ -31,6 +31,7 @@
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/proc_fs.h>
#include <asm/tlbflush.h>
@@ -52,6 +53,10 @@ EXPORT_SYMBOL(nr_swap_pages);
*/
struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+static const struct page *page_trace_list[10];
+#endif
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -257,6 +262,7 @@ free_pages_bulk(struct zone *zone, int c
page = list_entry(list->prev, struct page, list);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->list);
+ page_trace(page);
__free_pages_bulk(page, base, zone, area, mask, order);
ret++;
}
@@ -411,7 +417,9 @@ int is_head_of_free_region(struct page *
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
+#endif
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
@@ -512,9 +520,28 @@ static struct page *buffered_rmqueue(str
mod_page_state(pgalloc, 1 << order);
prep_new_page(page, order);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT])
+ printk("alloc_page from disabled zone: %p\n", page);
+#endif
return page;
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+ int i;
+
+ for(i = 0; ; i++) {
+ if (zone_table[i] == z)
+ return zone_active[i];
+ if (zone_table[i] == NULL)
+ BUG();
+ }
+}
+#endif
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -562,6 +589,10 @@ __alloc_pages(unsigned int gfp_mask, uns
struct zone *z = zones[i];
unsigned long local_low;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
@@ -590,6 +621,10 @@ __alloc_pages(unsigned int gfp_mask, uns
unsigned long local_min;
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
@@ -613,6 +648,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -638,6 +677,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
min += z->pages_min;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
@@ -676,6 +719,21 @@ nopage:
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
+#if 1 // debug
+ /* Validate page */
+ {
+ struct zone *z = page_zone(page);
+ int idx = page - z->zone_mem_map;
+ if (idx < 0 || idx >= z->spanned_pages) {
+ printk("%d %d\n", page->flags >> ZONE_SHIFT, idx);
+ BUG();
+ }
+ }
+#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT])
+ BUG();
+#endif
return page;
}
@@ -1076,6 +1134,9 @@ static int __init build_zonelists_node(p
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct zone *zone;
+#endif
local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
@@ -1091,7 +1152,7 @@ static void __init build_zonelists(pg_da
k = ZONE_HIGHMEM;
if (i & __GFP_DMA)
k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1107,6 +1168,23 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j++] = NULL;
+#else
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ for (node = local_node + 1; node < numnodes; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ for (node = 0; node < local_node; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ }
+#endif
}
}
@@ -1252,6 +1330,9 @@ static void __init free_area_init_core(s
unsigned long batch;
zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1295,8 +1376,8 @@ static void __init free_area_init_core(s
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
}
- printk(" %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n",
+ zone_names[j], realsize, batch, zone_start_pfn);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
atomic_set(&zone->refill_counter, 0);
@@ -1644,3 +1725,187 @@ int min_free_kbytes_sysctl_handler(ctl_t
setup_per_zone_pages_min();
return 0;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int mhtest_read(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ char *p;
+ int i, len;
+ const struct zone *z;
+
+ p = page;
+ for(i = 0; ; i++) {
+ z = zone_table[i];
+ if (z == NULL)
+ break;
+ if (! z->present_pages)
+ /* skip empty zone */
+ continue;
+ len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+ zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+ z->present_pages);
+ p += len;
+ }
+ len = p - page;
+
+ if (len <= off + count)
+ *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len < 0)
+ len = 0;
+ if (len > count)
+ len = count;
+
+ return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ unsigned long idx;
+ char buf[64], *p;
+ int i;
+ struct list_head *l;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ p = strchr(buf, ' ');
+ if (p == NULL)
+ goto out;
+
+ *p++ = '\0';
+ idx = simple_strtoul(p, NULL, 0);
+
+ if (strcmp(buf, "trace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == NULL) {
+ page_trace_list[i] = (struct page *)idx;
+ printk("add trace %lx\n", (unsigned long)idx);
+ goto out;
+ }
+ printk("page_trace_list is full (not added)\n");
+ goto out;
+ } else if (strcmp(buf, "untrace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == (struct page *)idx)
+ break;
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0])) {
+ printk("not registered\n");
+ goto out;
+ }
+ for(; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]) - 1; i++)
+ page_trace_list[i] = page_trace_list[i + 1];
+ page_trace_list[i] = NULL;
+ goto out;
+ }
+ if (idx > MAX_NR_ZONES*MAX_NUMNODES) {
+ printk("Argument out of range\n");
+ goto out;
+ }
+ if (strcmp(buf, "disable") == 0) {
+ printk("disable %d\n", idx);
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = pcp->high = 0;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->low = pcp->high = 0;
+ }
+ zone_active[idx] = 0;
+ zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+ } else if (strcmp(buf, "purge") == 0) {
+ if (zone_active[idx])
+ printk("Zone %d still active (proceeding anyway)\n",
+ idx);
+ printk("purge %d\n", idx);
+ wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+ /* XXX overkill, but who cares? */
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ } else if (strcmp(buf, "enable") == 0) {
+ printk("enable %d\n", idx);
+ zone_active[idx] = 1;
+ zone_table[idx]->pages_high =
+ zone_table[idx]->pages_min * 3;
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = 2 * pcp->batch;
+ pcp->high = 6 * pcp->batch;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->high = 2 * pcp->batch;
+ }
+ } else if (strcmp(buf, "remap") == 0) {
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+ } else if (strcmp(buf, "active") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ spin_lock_irq(&zone_table[idx]->lru_lock);
+ i = 0;
+ list_for_each(l, &zone_table[idx]->active_list) {
+ printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+ i++;
+ if (i == 10)
+ break;
+ }
+ spin_unlock_irq(&zone_table[idx]->lru_lock);
+ printk("\n");
+ } else if (strcmp(buf, "inuse") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+ if (page_count(&zone_table[idx]->zone_mem_map[i]))
+ printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+ printk("\n");
+ }
+out:
+ return count;
+}
+
+static int __init procmhtest_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("memhotplug", 0, NULL);
+ if (entry == NULL)
+ return -1;
+
+ entry->read_proc = &mhtest_read;
+ entry->write_proc = &mhtest_write;
+ return 0;
+}
+__initcall(procmhtest_init);
+
+void
+page_trace_func(const struct page *p, const char *func, int line) {
+ int i;
+
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++) {
+ if (page_trace_list[i] == NULL)
+ return;
+ if (page_trace_list[i] == p)
+ break;
+ }
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0]))
+ return;
+
+ printk("Page %lx, %s %d\n", (unsigned long)p, func, line);
+}
+#endif
diff -dpur linux-2.6.0-test9-kdb/mm/shmem.c linux-2.6.0-test9-kdb-mh/mm/shmem.c
--- linux-2.6.0-test9-kdb/mm/shmem.c Sun Oct 26 03:43:30 2003
+++ linux-2.6.0-test9-kdb-mh/mm/shmem.c Thu Oct 30 12:11:37 2003
@@ -80,7 +80,15 @@ static inline struct page *shmem_dir_all
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
*/
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+ printk("shmem_dir_alloc: %lx\n", (unsigned long)p);
+ return p;
+#else
+ return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
}
static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0-test9-kdb/mm/swap.c linux-2.6.0-test9-kdb-mh/mm/swap.c
--- linux-2.6.0-test9-kdb/mm/swap.c Sun Oct 26 03:43:26 2003
+++ linux-2.6.0-test9-kdb-mh/mm/swap.c Thu Oct 30 12:11:37 2003
@@ -77,6 +77,7 @@ void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
+ page_trace(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(zone, page);
@@ -189,6 +190,19 @@ void release_pages(struct page **pages,
struct page *page = pages[i];
struct zone *pagezone;
+ if (page_count(page) == 0) {
+ struct zone **z = zone_table;
+ int idx;
+ while (*z) {
+ idx = page - (*z)->zone_mem_map;
+ if (idx >= 0 && idx < (*z)->spanned_pages)
+ break;
+ z++;
+ }
+ if (*z != NULL)
+ printk("Zone: %lx %d, index: %d\n",
+ (unsigned long)*z, z - zone_table, idx);
+ }
if (PageReserved(page) || !put_page_testzero(page))
continue;
@@ -251,6 +265,10 @@ void __pagevec_release_nonlru(struct pag
BUG_ON(PageLRU(page));
if (put_page_testzero(page))
pagevec_add(&pages_to_free, page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ else
+ printk("Page %lx disappearing\n", page);
+#endif
}
pagevec_free(&pages_to_free);
pagevec_reinit(pvec);
diff -dpur linux-2.6.0-test9-kdb/mm/swap_state.c linux-2.6.0-test9-kdb-mh/mm/swap_state.c
--- linux-2.6.0-test9-kdb/mm/swap_state.c Sun Oct 26 03:43:31 2003
+++ linux-2.6.0-test9-kdb-mh/mm/swap_state.c Thu Oct 30 12:11:37 2003
@@ -152,6 +152,7 @@ int add_to_swap(struct page * page)
ClearPageDirty(page);
set_page_dirty(page);
INC_CACHE_INFO(add_total);
+ page_trace(page);
return 1;
case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
@@ -161,6 +162,7 @@ int add_to_swap(struct page * page)
default:
/* -ENOMEM radix-tree allocation failure */
swap_free(entry);
+ page_trace(page);
return 0;
}
}
diff -dpur linux-2.6.0-test9-kdb/mm/vmalloc.c linux-2.6.0-test9-kdb-mh/mm/vmalloc.c
--- linux-2.6.0-test9-kdb/mm/vmalloc.c Sun Oct 26 03:43:51 2003
+++ linux-2.6.0-test9-kdb-mh/mm/vmalloc.c Thu Oct 30 12:11:37 2003
@@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
+#ifdef CONFIG_MEMHOTPLUGTEST
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
}
EXPORT_SYMBOL(vmalloc);
diff -dpur linux-2.6.0-test9-kdb/mm/vmscan.c linux-2.6.0-test9-kdb-mh/mm/vmscan.c
--- linux-2.6.0-test9-kdb/mm/vmscan.c Sun Oct 26 03:42:59 2003
+++ linux-2.6.0-test9-kdb-mh/mm/vmscan.c Fri Oct 31 15:34:58 2003
@@ -285,6 +285,8 @@ shrink_list(struct list_head *page_list,
goto keep_locked;
pte_chain_lock(page);
+ if ((! zone_activep(page_zone(page))) && page_mapped(page))
+ page_referenced(page);
referenced = page_referenced(page);
if (referenced && page_mapping_inuse(page)) {
/* In active use or really unfreeable. Activate it. */
@@ -310,6 +312,7 @@ shrink_list(struct list_head *page_list,
}
#endif /* CONFIG_SWAP */
+ page_trace(page);
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
@@ -318,9 +321,11 @@ shrink_list(struct list_head *page_list,
switch (try_to_unmap(page)) {
case SWAP_FAIL:
pte_chain_unlock(page);
+ page_trace(page);
goto activate_locked;
case SWAP_AGAIN:
pte_chain_unlock(page);
+ page_trace(page);
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
@@ -367,6 +372,7 @@ shrink_list(struct list_head *page_list,
.nonblocking = 1,
.for_reclaim = 1,
};
+ page_trace(page);
list_move(&page->list, &mapping->locked_pages);
spin_unlock(&mapping->page_lock);
@@ -410,12 +416,14 @@ shrink_list(struct list_head *page_list,
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (PagePrivate(page)) {
+ page_trace(page);
if (!try_to_release_page(page, gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1)
goto free_it;
}
+ page_trace(page);
if (!mapping)
goto keep_locked; /* truncate got there first */
@@ -431,6 +439,7 @@ shrink_list(struct list_head *page_list,
goto keep_locked;
}
+ page_trace(page);
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page->index };
@@ -589,7 +598,7 @@ done:
* But we had to alter page->flags anyway.
*/
static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+refill_inactive_zone(struct zone *zone, int nr_pages_in,
struct page_state *ps, int priority)
{
int pgmoved;
@@ -607,6 +616,12 @@ refill_inactive_zone(struct zone *zone,
lru_add_drain();
pgmoved = 0;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone)) {
+ nr_pages = nr_pages_in = zone->present_pages - zone->free_pages;
+ printk("Purging active list of disabled zone\n");
+ }
+#endif
spin_lock_irq(&zone->lru_lock);
while (nr_pages && !list_empty(&zone->active_list)) {
page = list_entry(zone->active_list.prev, struct page, lru);
@@ -614,6 +629,7 @@ refill_inactive_zone(struct zone *zone,
if (!TestClearPageLRU(page))
BUG();
list_del(&page->lru);
+ page_trace(page);
if (page_count(page) == 0) {
/* It is currently in pagevec_release() */
SetPageLRU(page);
@@ -658,20 +674,30 @@ refill_inactive_zone(struct zone *zone,
*/
if (swap_tendency >= 100)
reclaim_mapped = 1;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ reclaim_mapped = 1;
+#endif
while (!list_empty(&l_hold)) {
page = list_entry(l_hold.prev, struct page, lru);
list_del(&page->lru);
if (page_mapped(page)) {
pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ page_referenced(page); /* XXX */
+#endif
if (page_mapped(page) && page_referenced(page)) {
pte_chain_unlock(page);
+ page_trace(page);
list_add(&page->lru, &l_active);
continue;
}
pte_chain_unlock(page);
if (!reclaim_mapped) {
list_add(&page->lru, &l_active);
+ page_trace(page);
continue;
}
}
@@ -682,9 +708,11 @@ refill_inactive_zone(struct zone *zone,
if (total_swap_pages == 0 && !page->mapping &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
+ page_trace(page);
continue;
}
list_add(&page->lru, &l_inactive);
+ page_trace(page);
}
pagevec_init(&pvec, 1);
@@ -767,6 +795,11 @@ shrink_zone(struct zone *zone, int max_s
ratio = (unsigned long)nr_pages * zone->nr_active /
((zone->nr_inactive | 1) * 2);
atomic_add(ratio+1, &zone->refill_counter);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ /* XXX */
+ atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter);
+#endif
if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
int count;
@@ -1048,6 +1081,326 @@ int kswapd(void *p)
balance_pgdat(pgdat, 0, &ps);
}
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void
+print_buffer(struct page* page)
+{
+ struct address_space* mapping = page->mapping;
+ struct buffer_head *bh, *head;
+
+ spin_lock(&mapping->private_lock);
+ bh = head = page_buffers(page);
+ printk("buffers:");
+ do {
+ printk(" %x %d\n", bh->b_state, atomic_read(&bh->b_count));
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ printk("\n");
+ spin_unlock(&mapping->private_lock);
+}
+/* try to remap a page. returns non-zero on failure */
+int remap_onepage(struct page *page)
+{
+ struct page *newpage;
+ struct zone *zone;
+ struct address_space *mapping = page->mapping;
+ char *np, *op;
+ int waitcnt, error = -1;
+
+ newpage = alloc_page(GFP_HIGHUSER);
+ if (newpage == NULL)
+ return -ENOMEM;
+ if (TestSetPageLocked(newpage))
+ BUG();
+ lock_page(page);
+
+ if (PagePrivate(page)) {
+ waitcnt = 100;
+ while (PageWriteback(page)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(10);
+ if (! --waitcnt)
+ goto radixfail;
+ }
+
+ /* XXX copied from shrink_list() */
+ if (PageDirty(page) &&
+ is_page_cache_freeable(page) &&
+ mapping != NULL &&
+ mapping->a_ops->writepage != NULL) {
+ spin_lock(&mapping->page_lock);
+ if (test_clear_page_dirty(page)) {
+ int res;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1,
+ };
+
+ list_move(&page->list, &mapping->locked_pages);
+ spin_unlock(&mapping->page_lock);
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+
+ if (res == WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ goto radixfail;
+ }
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ lock_page(page);
+ goto waitbuffer;
+ }
+ spin_unlock(&mapping->page_lock);
+ }
+
+ waitbuffer:
+ waitcnt = 100;
+ while (1) {
+ if (try_to_release_page(page, GFP_KERNEL))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(10);
+ if (! --waitcnt) {
+ print_buffer(page);
+ goto radixfail;
+ }
+ }
+ }
+ if (mapping == NULL) {
+ /* The page is an anon page. Allocate swap entry. */
+ /* ...but just bail for now */
+ if (!add_to_swap(page))
+ goto radixfail;
+ mapping = page->mapping;
+ }
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ goto radixfail;
+ if (PagePrivate(page)) /* XXX */
+ BUG();
+
+ newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+ ~(1 << PG_highmem) & ~(~0UL << ZONE_SHIFT);
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock(&mapping->page_lock);
+ radix_tree_delete(&mapping->page_tree, page->index);
+ __put_page(page);
+ /* list_del(&page->list); XXX */
+ radix_tree_insert(&mapping->page_tree, page->index, newpage);
+ page_cache_get(newpage);
+ radix_tree_preload_end();
+ newpage->mapping = mapping;
+ newpage->index = page->index;
+ spin_unlock(&mapping->page_lock);
+ if (PageDirty(page))
+ list_add(&newpage->list, &mapping->dirty_pages);
+ else
+ list_add(&newpage->list, &mapping->clean_pages);
+
+ pte_chain_lock(page);
+ if (page_mapped(page)) {
+ while ((error = try_to_unmap(page)) == SWAP_AGAIN) {
+ pte_chain_unlock(page);
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ pte_chain_lock(page);
+ }
+ if (error == SWAP_FAIL)
+ /* either during mremap or mlocked */
+ goto unmapfail;
+ }
+ pte_chain_unlock(page);
+ if (PagePrivate(page))
+ printk("buffer reappeared\n");
+
+ unlock_page(page); /* no lock needed while waiting page count */
+
+ waitcnt = 0;
+wait_again:
+ while (page_count(page) != 1) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if (PagePrivate(page))
+ break; /* see below */
+ if (waitcnt == 10000) {
+ printk("remap_onepage: still waiting on %p\n", page);
+ waitcnt++;
+ }
+ if (waitcnt < 10000)
+ waitcnt++;
+ }
+
+ lock_page(page);
+ if (PagePrivate(page))
+ try_to_release_page(page, GFP_KERNEL);
+ if (page_count(page) != 1) {
+ unlock_page(page);
+ goto wait_again;
+ }
+ spin_lock(&mapping->page_lock);
+ list_del(&page->list); /* XXX */
+ page->mapping = NULL;
+ spin_unlock(&mapping->page_lock);
+ unlock_page(page);
+
+ np = kmap_atomic(newpage, KM_REMAP0);
+ op = kmap_atomic(page, KM_REMAP1);
+ if (np == NULL || op == NULL) { /* XXX */
+ printk("%p %p %p %p\n", np, op, newpage, page);
+ BUG();
+ }
+ memcpy(np, op, PAGE_SIZE);
+ kunmap_atomic(page, KM_REMAP1);
+ kunmap_atomic(newpage, KM_REMAP0);
+ ClearPageActive(page);
+ put_page(page);
+
+ /* We are done. Finish and let the waiters run. */
+ SetPageUptodate(newpage);
+ /* XXX locking order correct? */
+ zone = page_zone(newpage);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(newpage)) {
+ list_add(&newpage->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&newpage->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(newpage);
+ spin_unlock_irq(&zone->lru_lock);
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ return 0;
+
+unmapfail:
+ /* unwind is impossible if some process is waiting on the newpage */
+ printk("You are hosed.\n");
+ BUG();
+
+radixfail:
+ unlock_page(page);
+ unlock_page(newpage);
+ __free_page(newpage);
+ return 1;
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+ int cpu = get_cpu();
+
+ schedule_work(&lru_drain_wq[cpu]);
+ put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+ struct zone *zone = p;
+ struct page *page;
+ int i, nr_failed = 0;
+ LIST_HEAD(failedp);
+
+ daemonize("remap%d", zone->zone_start_pfn);
+ if (atomic_read(&remapd_count) > 0) {
+ printk("remapd already running\n");
+ return 0;
+ }
+ atomic_inc(&remapd_count);
+ on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+ while(nr_failed < 100) {
+ spin_lock_irq(&zone->lru_lock);
+ for(i = 0; ! list_empty(&zone->inactive_list) &&
+ i < 10; i++) {
+ page = list_entry(zone->inactive_list.prev,
+ struct page, lru);
+ if (! TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ zone->nr_inactive--;
+ if (page_count(page) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(page);
+ list_add(&page->lru, &zone->inactive_list);
+ continue;
+ }
+ page_cache_get(page);
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+
+ for(i = 0; ! list_empty(&zone->active_list) &&
+ i < 10; i++) {
+ page = list_entry(zone->active_list.prev,
+ struct page, lru);
+ if (! TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ zone->nr_active--;
+ if (page_count(page) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(page);
+ list_add(&page->lru, &zone->active_list);
+ continue;
+ }
+ page_cache_get(page);
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ break;
+
+ got_page:
+ if (remap_onepage(page)) {
+ nr_failed++;
+ list_add(&page->lru, &failedp);
+ }
+ }
+ if (list_empty(&failedp))
+ goto out;
+
+ while (! list_empty(&failedp)) {
+ spin_lock_irq(&zone->lru_lock);
+ page = list_entry(failedp.prev, struct page, lru);
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(page);
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(page);
+ }
+out:
+ atomic_dec(&remapd_count);
+ return 0;
+}
+
+static int __init remapd_init(void)
+{
+ int i;
+
+ for(i = 0; i < NR_CPUS; i++)
+ INIT_WORK(&lru_drain_wq[i], lru_add_drain, NULL);
+ return 0;
+}
+
+module_init(remapd_init);
+#endif
/*
* A zone is low on free memory, so wake its kswapd task to service it.
--
IWAMOTO Toshihiro @ VA Linux Systems Japan