From: Yasunori Goto <ygoto@fsw.fujitsu.com>
To: "Pavel Machek" <pavel@suse.cz>
Cc: <linux-kernel@vger.kernel.org>,
"Luck, Tony" <tony.luck@intel.com>,
IWAMOTO Toshihiro <iwamoto@valinux.co.jp>,
Hirokazu Takahashi <taka@valinux.co.jp>,
Linux Hotplug Memory Support <lhms-devel@lists.sourceforge.net>
Subject: Re: memory hotremove prototype, take 3
Date: Tue, 02 Dec 2003 14:26:10 -0800
Message-ID: <20031202111944.57B2.YGOTO@fsw.fujitsu.com>
In-Reply-To: <B8E391BBE9FE384DAA4C5C003888BE6F4FAED7@scsmsx401.sc.intel.com>
[-- Attachment #1: Type: text/plain, Size: 919 bytes --]
Hello.
> Pavel Machek wrote:
>
> > hotunplug seems cool... How do you deal with kernel data structures in
> > memory "to be removed"? Or you simply don't allow kmalloc() to
> > allocate there?
>
> You guessed right. Hot removeable memory can only be allocated
> for uses that we can easily re-allocate. So kmalloc() etc. have
> to get memory from some area that we promise not to ever try to
> remove.
IMHO, to hot-remove memory, memory should be divided into two
attributes, hotpluggable and non-hotpluggable, and memory of each
attribute should be allocated in its own unit (e.g. a node).
(I posted the following mail two months ago.)
http://marc.theaimsgroup.com/?l=linux-kernel&m=106506389406876&w=2
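
To sketch the idea in code (purely illustrative -- these names are
hypothetical and do not appear in the attached patch):

    /* Each node carries an attribute, e.g. reported by firmware. */
    enum mem_attr { MEM_FIXED, MEM_HOTPLUGGABLE };
    static enum mem_attr node_attr[8 /* MAX_NUMNODES */];

    /*
     * Allocations the kernel cannot relocate (kmalloc(), page tables,
     * mem_map, ...) must come from a MEM_FIXED node; only user and
     * page cache pages would live on MEM_HOTPLUGGABLE nodes.
     */
    static int node_ok_for_kernel_alloc(int nid)
    {
            return node_attr[nid] == MEM_FIXED;
    }

This way a hotpluggable unit never holds data which cannot be moved off.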
Now I'm working on a memory hot-ADD trial patch, but it doesn't work yet.
(The kernel panics when the memory enable command is executed.)
Once the patch works, I will post it again.
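
For reference, once it works, the /proc interface in the attached patch
can be driven like this (a hypothetical user-space example; mhtest_write()
parses "<command> <index>", and node 1 is just an arbitrary choice):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/memhotplug", "w");

            if (f == NULL)
                    return 1;
            fputs("enable 1", f);   /* ends up in node_enable(1) */
            return fclose(f) ? 1 : 0;
    }

Reading the file back (e.g. "cat /proc/memhotplug") prints each zone's
enabled/free/active/present counters via mhtest_read().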
Thanks.
--
Yasunori Goto <ygoto at fsw.fujitsu.com>
[-- Attachment #2: 20031125.patch --]
[-- Type: application/octet-stream, Size: 34776 bytes --]
diff -duprb linux-2.6.0-test7/Makefile testdir/Makefile
--- linux-2.6.0-test7/Makefile Wed Oct 8 12:24:17 2003
+++ testdir/Makefile Sat Nov 22 17:55:21 2003
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 0
-EXTRAVERSION = -test7
+EXTRAVERSION = -test7-mem-hotplug
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"
diff -duprb linux-2.6.0-test7/arch/i386/Kconfig testdir/arch/i386/Kconfig
--- linux-2.6.0-test7/arch/i386/Kconfig Wed Oct 8 12:24:02 2003
+++ testdir/arch/i386/Kconfig Sat Nov 22 17:52:36 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
+config MEMHOTPLUGTEST
+ bool "Memory hotplug test"
+ default n
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y
config HAVE_ARCH_BOOTMEM_NODE
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y
config HIGHPTE
diff -duprb linux-2.6.0-test7/arch/i386/kernel/setup.c testdir/arch/i386/kernel/setup.c
--- linux-2.6.0-test7/arch/i386/kernel/setup.c Wed Oct 8 12:24:05 2003
+++ testdir/arch/i386/kernel/setup.c Sat Nov 22 17:52:36 2003
@@ -114,6 +114,8 @@ extern void generic_apic_probe(char *);
extern int root_mountflags;
extern char _end[];
+extern unsigned long node_end_pfn[MAX_NUMNODES];
+
unsigned long saved_videomode;
#define RAMDISK_IMAGE_START_MASK 0x07FF
@@ -611,7 +613,11 @@ unsigned long __init find_max_low_pfn(vo
{
unsigned long max_low_pfn;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ max_low_pfn = node_end_pfn[0];
+#else
max_low_pfn = max_pfn;
+#endif
if (max_low_pfn > MAXMEM_PFN) {
if (highmem_pages == -1)
highmem_pages = max_pfn - MAXMEM_PFN;
diff -duprb linux-2.6.0-test7/arch/i386/mm/discontig.c testdir/arch/i386/mm/discontig.c
--- linux-2.6.0-test7/arch/i386/mm/discontig.c Wed Oct 8 12:24:07 2003
+++ testdir/arch/i386/mm/discontig.c Tue Nov 25 19:34:03 2003
@@ -28,6 +28,12 @@
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
+#include <linux/proc_fs.h>
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+#include <linux/sched.h>
+#endif
+
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
@@ -80,6 +86,10 @@ unsigned long node_remap_offset[MAX_NUMN
void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+#ifdef CONFIG_MEMHOTPLUGTEST
+void set_pmd_pfn_withpgd(unsigned long vaddr, unsigned long pfn,pgd_t *pgd, pgprot_t flags);
+#endif
+
/*
* FLAT - support for basic PC memory model with discontig enabled, essentially
* a single node with all available processors in it with a flat
@@ -111,6 +121,44 @@ int __init get_memcfg_numa_flat(void)
return 1;
}
+int __init get_memcfg_numa_blks(void)
+{
+ int i, pfn;
+
+ printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ find_max_pfn();
+ max_pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ pfn = PFN_DOWN(256 << 20) * i;
+ node_start_pfn[i] = pfn;
+ pfn += PFN_DOWN(256 << 20);
+ if (pfn < max_pfn)
+ node_end_pfn[i] = pfn;
+ else {
+ node_end_pfn[i] = max_pfn;
+ i++;
+ printk("total %d blocks, max %d\n", i, (int)max_pfn);
+ break;
+ }
+ }
+
+ /* Fill in the physnode_map: each 256MB element is assigned
+ * to the node whose block covers it.
+ */
+ for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+ pfn += PAGES_PER_ELEMENT) {
+ physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(256 << 20);
+ }
+
+ /* Only node 0 is online at boot; the others are enabled later. */
+ node_set_online(0);
+ numnodes = i;
+
+ return 1;
+}
+
/*
* Find the highest page frame number we have available for the node
*/
@@ -134,6 +182,12 @@ static void __init find_max_pfn_node(int
*/
static void __init allocate_pgdat(int nid)
{
+#ifdef CONFIG_MEMHOTPLUGTEST
+ /* every node's pg_data_t is allocated statically from node 0's lowmem */
+ NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
+ min_low_pfn += PFN_UP(sizeof(pg_data_t));
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+#else
if (nid)
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
else {
@@ -141,6 +195,7 @@ static void __init allocate_pgdat(int ni
min_low_pfn += PFN_UP(sizeof(pg_data_t));
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
+#endif
}
/*
@@ -183,6 +238,7 @@ static void __init register_bootmem_low_
}
}
+/*static struct kcore_list numa_kc;*/
void __init remap_numa_kva(void)
{
void *vaddr;
@@ -196,9 +252,36 @@ void __init remap_numa_kva(void)
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
+ /* memset(node_remap_start_vaddr[node], 0,node_remap_size[node] * PAGE_SIZE); */
+ }
+/* kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+ node_remap_offset[numnodes - 1] << PAGE_SHIFT);*/
+}
+
+void remap_add_node_kva(int node)
+{
+ void *vaddr;
+ unsigned long pfn;
+ struct task_struct *p;
+ pgd_t *pg_dir;
+
+ read_lock(&tasklist_lock);
+ for_each_process(p){
+ if (p->mm == NULL) /* skip kernel threads, which have no mm */
+ continue;
+ pg_dir = p->mm->pgd;
+ for(pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE){
+ vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+ set_pmd_pfn_withpgd((ulong) vaddr,
+ node_remap_start_pfn[node] + pfn,
+ pg_dir + pgd_index( (ulong)vaddr ) ,
+ PAGE_KERNEL_LARGE);
+ }
}
+ read_unlock(&tasklist_lock);
}
+
static unsigned long calculate_numa_remap_pages(void)
{
int nid;
@@ -206,8 +289,13 @@ static unsigned long calculate_numa_rema
for (nid = 1; nid < numnodes; nid++) {
/* calculate the size of the mem_map needed in bytes */
+#ifdef CONFIG_MEMHOTPLUGTEST
+ size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
+ * sizeof(struct page);
+#else
size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
* sizeof(struct page) + sizeof(pg_data_t);
+#endif
/* convert size to large (pmd size) pages, rounding up */
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
/* now the roundup is correct, convert to PAGE_SIZE pages */
@@ -248,7 +336,9 @@ unsigned long __init setup_memory(void)
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
#endif
+#ifndef CONFIG_MEMHOTPLUGTEST
system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages;
+#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(system_max_low_pfn));
printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
@@ -426,7 +516,11 @@ void __init set_highmem_pages_init(int b
void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_MEMHOTPLUGTEST
+ highmem_start_page = phys_to_virt(max_low_pfn << PAGE_SHIFT);
+#else /* !CONFIG_MEMHOTPLUGTEST */
highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map;
+#endif /* !CONFIG_MEMHOTPLUGTEST */
num_physpages = highend_pfn;
#else
num_physpages = max_low_pfn;
diff -duprb linux-2.6.0-test7/arch/i386/mm/pgtable.c testdir/arch/i386/mm/pgtable.c
--- linux-2.6.0-test7/arch/i386/mm/pgtable.c Wed Oct 8 12:24:53 2003
+++ testdir/arch/i386/mm/pgtable.c Tue Nov 25 19:23:46 2003
@@ -118,6 +118,30 @@ void set_pmd_pfn(unsigned long vaddr, un
*/
__flush_tlb_one(vaddr);
}
+void set_pmd_pfn_withpgd(unsigned long vaddr, unsigned long pfn, pgd_t *pgd, pgprot_t flags)
+{
+ pmd_t *pmd;
+
+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
+ printk ("set_pmd_pfn: vaddr misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
+ printk ("set_pmd_pfn: pfn misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pgd_none(*pgd)) {
+ printk ("set_pmd_pfn: pgd_none\n");
+ return; /* BUG(); */
+ }
+ pmd = pmd_offset(pgd, vaddr);
+ set_pmd(pmd, pfn_pmd(pfn, flags));
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
diff -duprb linux-2.6.0-test7/drivers/char/mem.c testdir/drivers/char/mem.c
--- linux-2.6.0-test7/drivers/char/mem.c Wed Oct 8 12:24:06 2003
+++ testdir/drivers/char/mem.c Sat Nov 22 17:53:41 2003
@@ -24,6 +24,9 @@
#include <linux/smp_lock.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/highmem.h>
+#endif
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -104,6 +107,36 @@ static ssize_t do_write_mem(struct file
return written;
}
+#ifdef CONFIG_HIGHMEM
+static ssize_t read_highmem(struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos;
+ ssize_t read = 0;
+ int off, pfn = p >> PAGE_SHIFT;
+ char *pp;
+ struct page *page;
+
+ if (! pfn_valid(pfn))
+ return 0;
+ page = pfn_to_page(pfn);
+ pp = kmap(page);
+
+ off = p & (PAGE_SIZE - 1);
+ if (count > PAGE_SIZE - off)
+ count = PAGE_SIZE - off;
+
+ if (copy_to_user(buf, pp + off, count)) {
+ kunmap(page);
+ return -EFAULT;
+ }
+ read += count;
+ *ppos += read;
+ kunmap(page);
+ return read;
+}
+
+#endif
/*
* This funcion reads the *physical* memory. The f_pos points directly to the
@@ -118,7 +151,11 @@ static ssize_t read_mem(struct file * fi
end_mem = __pa(high_memory);
if (p >= end_mem)
+#ifdef CONFIG_HIGHMEM
+ return read_highmem(file, buf, count, ppos);
+#else
return 0;
+#endif
if (count > end_mem - p)
count = end_mem - p;
read = 0;
diff -duprb linux-2.6.0-test7/fs/proc/kcore.c testdir/fs/proc/kcore.c
--- linux-2.6.0-test7/fs/proc/kcore.c Wed Oct 8 12:24:07 2003
+++ testdir/fs/proc/kcore.c Sat Nov 22 17:54:58 2003
@@ -387,7 +387,7 @@ read_kcore(struct file *file, char __use
}
kfree(elf_buf);
} else {
- if (kern_addr_valid(start)) {
+ if (1 /*kern_addr_valid(start)*/) {
unsigned long n;
n = copy_to_user(buffer, (char *)start, tsz);
diff -duprb linux-2.6.0-test7/include/asm-i386/mmzone.h testdir/include/asm-i386/mmzone.h
--- linux-2.6.0-test7/include/asm-i386/mmzone.h Wed Oct 8 12:24:06 2003
+++ testdir/include/asm-i386/mmzone.h Sat Nov 22 17:54:41 2003
@@ -128,6 +128,7 @@ static inline struct pglist_data *pfn_to
#endif /* CONFIG_X86_NUMAQ */
extern int get_memcfg_numa_flat(void );
+extern int get_memcfg_numa_blks(void );
/*
* This allows any one NUMA architecture to be compiled
* for, and still fall back to the flat function if it
@@ -140,6 +141,9 @@ static inline void get_memcfg_numa(void)
return;
#elif CONFIG_ACPI_SRAT
if (get_memcfg_from_srat())
+ return;
+#elif CONFIG_MEMHOTPLUGTEST
+ if (get_memcfg_numa_blks())
return;
#endif
diff -duprb linux-2.6.0-test7/include/asm-i386/numnodes.h testdir/include/asm-i386/numnodes.h
--- linux-2.6.0-test7/include/asm-i386/numnodes.h Wed Oct 8 12:24:02 2003
+++ testdir/include/asm-i386/numnodes.h Sat Nov 22 17:54:41 2003
@@ -13,6 +13,10 @@
/* Max 8 Nodes */
#define NODES_SHIFT 3
+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT 3
+
#endif /* CONFIG_X86_NUMAQ */
#endif /* _ASM_MAX_NUMNODES_H */
diff -duprb linux-2.6.0-test7/include/linux/mm.h testdir/include/linux/mm.h
--- linux-2.6.0-test7/include/linux/mm.h Wed Oct 8 12:24:01 2003
+++ testdir/include/linux/mm.h Sat Nov 22 17:54:21 2003
@@ -219,7 +219,14 @@ struct page {
*/
#define put_page_testzero(p) \
({ \
- BUG_ON(page_count(p) == 0); \
+ if (page_count(p) == 0) { \
+ int i; \
+ printk("Page: %lx ", (long)p); \
+ for(i = 0; i < sizeof(struct page); i++) \
+ printk(" %02x", ((unsigned char *)p)[i]); \
+ printk("\n"); \
+ BUG(); \
+ } \
atomic_dec_and_test(&(p)->count); \
})
@@ -622,5 +629,11 @@ kernel_map_pages(struct page *page, int
}
#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+#define page_trace(p) page_trace_func(p, __FUNCTION__, __LINE__)
+extern void page_trace_func(const struct page *, const char *, int);
+#else
+#define page_trace(p) do { } while(0)
+#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff -duprb linux-2.6.0-test7/include/linux/mmzone.h testdir/include/linux/mmzone.h
--- linux-2.6.0-test7/include/linux/mmzone.h Wed Oct 8 12:24:08 2003
+++ testdir/include/linux/mmzone.h Sat Nov 22 17:54:23 2003
@@ -174,6 +174,7 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
+ rwlock_t zonelist_lock;
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
@@ -235,10 +236,29 @@ void wakeup_kswapd(struct zone *zone);
* next_zone - helper magic for for_each_zone()
* Thanks to William Lee Irwin III for this piece of ingenuity.
*/
+extern char zone_active[];
+
static inline struct zone *next_zone(struct zone *zone)
{
pg_data_t *pgdat = zone->zone_pgdat;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ unsigned int zone_idx = zone - pgdat->node_zones;
+ do {
+ if (zone_idx < MAX_NR_ZONES - 1) {
+ zone++;
+ zone_idx++;
+ } else if (pgdat->pgdat_next) {
+ pgdat = pgdat->pgdat_next;
+ zone = pgdat->node_zones;
+ zone_idx = 0;
+ } else
+ return NULL;
+ } while (!zone_active[pgdat->node_id * MAX_NR_ZONES + zone_idx]);
+
+ return zone;
+
+#else
if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
zone++;
else if (pgdat->pgdat_next) {
@@ -248,6 +268,7 @@ static inline struct zone *next_zone(str
zone = NULL;
return zone;
+#endif
}
/**
@@ -359,6 +380,10 @@ static inline unsigned int num_online_me
}
return num;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+#endif
#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */
diff -duprb linux-2.6.0-test7/mm/page_alloc.c testdir/mm/page_alloc.c
--- linux-2.6.0-test7/mm/page_alloc.c Wed Oct 8 12:24:01 2003
+++ testdir/mm/page_alloc.c Tue Nov 25 18:48:01 2003
@@ -31,6 +31,7 @@
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/proc_fs.h>
#include <asm/tlbflush.h>
@@ -52,6 +53,11 @@ EXPORT_SYMBOL(nr_swap_pages);
*/
struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+EXPORT_SYMBOL(zone_active);
+static const struct page *page_trace_list[10];
+#endif
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -512,9 +518,28 @@ static struct page *buffered_rmqueue(str
mod_page_state(pgalloc, 1 << order);
prep_new_page(page, order);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT])
+ BUG();
+#endif
return page;
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+ int i;
+
+ for(i = 0; ; i++) {
+ if (zone_table[i] == z)
+ return zone_active[i];
+ if (zone_table[i] == NULL)
+ BUG();
+ }
+}
+#endif
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -544,6 +569,7 @@ __alloc_pages(unsigned int gfp_mask, uns
int i;
int cold;
int do_retry;
+ unsigned long flag;
might_sleep_if(wait);
@@ -551,10 +577,13 @@ __alloc_pages(unsigned int gfp_mask, uns
if (gfp_mask & __GFP_COLD)
cold = 1;
+ read_lock_irqsave(&zonelist->zonelist_lock,flag);
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
classzone = zones[0];
- if (classzone == NULL) /* no zones in the zonelist */
+ if (classzone == NULL){ /* no zones in the zonelist */
+ read_unlock_irqrestore(&zonelist->zonelist_lock,flag);
return NULL;
+ }
/* Go through the zonelist once, looking for a zone with enough free */
min = 1UL << order;
@@ -562,6 +591,10 @@ __alloc_pages(unsigned int gfp_mask, uns
struct zone *z = zones[i];
unsigned long local_low;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
@@ -589,6 +622,10 @@ __alloc_pages(unsigned int gfp_mask, uns
for (i = 0; zones[i] != NULL; i++) {
unsigned long local_min;
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
@@ -612,6 +649,10 @@ rebalance:
/* go through the zonelist yet again, ignoring mins */
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
page = buffered_rmqueue(z, order, cold);
if (page)
@@ -668,6 +709,7 @@ rebalance:
}
nopage:
+ read_unlock_irqrestore(&zonelist->zonelist_lock,flag);
if (!(gfp_mask & __GFP_NOWARN)) {
printk("%s: page allocation failure."
" order:%d, mode:0x%x\n",
@@ -676,6 +718,24 @@ nopage:
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
+#if 1 // debug
+ /* Validate page */
+ {
+ struct zone *z = page_zone(page);
+ int idx = page - z->zone_mem_map;
+ if (idx < 0 || idx >= z->spanned_pages) {
+ printk("0x%08x %d\n", (int)(page->flags >> ZONE_SHIFT), idx);
+ read_unlock_irqrestore(&zonelist->zonelist_lock, flag);
+ BUG();
+ }
+ }
+#endif
+ read_unlock_irqrestore(&zonelist->zonelist_lock,flag);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT]){
+ BUG();
+ }
+#endif
return page;
}
@@ -1046,7 +1106,11 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+#else
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+#endif
{
switch (k) {
struct zone *zone;
@@ -1076,6 +1140,9 @@ static int __init build_zonelists_node(p
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct zone *zone;
+#endif
local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
@@ -1092,6 +1159,7 @@ static void __init build_zonelists(pg_da
if (i & __GFP_DMA)
k = ZONE_DMA;
+#ifndef CONFIG_MEMHOTPLUGTEST
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1107,6 +1175,26 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j++] = NULL;
+#else
+ rwlock_init(&zonelist->zonelist_lock);
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ if (!zone_activep(zone))
+ continue;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ for (node = local_node + 1; node < numnodes; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone_activep(zone) && zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ for (node = 0; node < local_node; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone_activep(zone) && zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ }
+#endif
}
}
@@ -1162,8 +1250,14 @@ static inline unsigned long wait_table_b
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void calculate_zone_totalpages(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+#else
static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
+#endif
+
{
unsigned long realtotalpages, totalpages = 0;
int i;
@@ -1199,6 +1293,20 @@ static void __init calculate_zone_bitmap
}
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void calculate_addzone_bitmap(struct pglist_data *pgdat, unsigned long *zones_size)
+{
+ unsigned long size = zones_size[ZONE_HIGHMEM];
+
+ size = LONG_ALIGN((size + 7) >> 3);
+ if (size) {
+ pgdat->valid_addr_bitmap = (unsigned long *)kmalloc(size,GFP_KERNEL);
+ memset(pgdat->valid_addr_bitmap, 0, size);
+ }
+}
+
+#endif
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -1252,6 +1360,45 @@ static void __init free_area_init_core(s
unsigned long batch;
zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ /* only node 0 is active at boot */
+ if (nid) { /* nodes 1-... are not active yet */
+ /* XXX : This should be changed. */
+ zone_active[nid * MAX_NR_ZONES + j ] = 0;
+ zone->spanned_pages = 0;
+ zone->present_pages = 0;
+ zone->name = zone_names[j];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 0;
+ pcp->batch = 0;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 0;
+ pcp->batch = 0;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ atomic_set(&zone->refill_counter, 0);
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+
+ continue;
+ }
+ zone_active[nid * MAX_NR_ZONES + j] = 1; /* only node 0 is active */
+#endif
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1295,8 +1442,8 @@ static void __init free_area_init_core(s
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
}
- printk(" %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n",
+ zone_names[j], realsize, batch, zone_start_pfn);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
atomic_set(&zone->refill_counter, 0);
@@ -1381,14 +1528,22 @@ void __init free_area_init_node(int nid,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (!node_mem_map && !nid) {
+#else
if (!node_mem_map) {
+#endif
size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
node_mem_map = alloc_bootmem_node(pgdat, size);
}
pgdat->node_mem_map = node_mem_map;
free_area_init_core(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (!nid) memblk_set_online(node_to_memblk(nid)); /* only node 0 is online */
+#else
memblk_set_online(node_to_memblk(nid));
+#endif
calculate_zone_bitmap(pgdat, zones_size);
}
@@ -1644,3 +1799,387 @@ int min_free_kbytes_sysctl_handler(ctl_t
setup_per_zone_pages_min();
return 0;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void rebuild_all_zonelist(unsigned long nid)
+{
+ struct zonelist *zonelist;
+ unsigned long node, p_node, j;
+
+ /*
+ * Rebuild the ZONE_HIGHMEM zonelist of every node, including the
+ * newly enabled node nid, so that the new zones become visible as
+ * allocation fallbacks. Each list is rebuilt under its own write
+ * lock; __alloc_pages() takes the read side.
+ */
+ for (p_node = 0; p_node < numnodes; p_node++) {
+ zonelist = NODE_DATA(p_node)->node_zonelists + ZONE_HIGHMEM;
+ write_lock(&zonelist->zonelist_lock);
+ memset(zonelist->zones, 0, sizeof(zonelist->zones));
+
+ j = build_zonelists_node(NODE_DATA(p_node), zonelist, 0, ZONE_HIGHMEM);
+ for (node = p_node + 1; node < numnodes; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, ZONE_HIGHMEM);
+ for (node = 0; node < p_node; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, ZONE_HIGHMEM);
+ zonelist->zones[j] = NULL;
+ write_unlock(&zonelist->zonelist_lock);
+ }
+}
+
+
+static void free_area_add_core(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long i;
+ const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+ int cpu, nid = pgdat->node_id;
+ struct page *lmem_map = pgdat->node_mem_map;
+ unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+ pgdat->nr_zones = 0;
+ init_waitqueue_head(&pgdat->kswapd_wait);
+
+ {
+ struct zone *zone = pgdat->node_zones + ZONE_HIGHMEM;
+ unsigned long size, realsize;
+ unsigned long batch;
+
+ zone_table[nid * MAX_NR_ZONES + ZONE_HIGHMEM] = zone;
+
+ realsize = size = zones_size[ZONE_HIGHMEM];
+ if (zholes_size)
+ realsize -= zholes_size[ZONE_HIGHMEM];
+
+ zone->spanned_pages = size;
+ zone->present_pages = realsize;
+ zone->name = zone_names[ZONE_HIGHMEM];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
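+ /*
+ * Worked example with hypothetical numbers: a 256MB node has
+ * 65536 pages, so batch = 65536/1024 = 64; 64*4096 bytes meets
+ * the 256KB cap exactly (no reduction), then 64/4 = 16, giving
+ * the hot pcp low=32, high=96, batch=16 below.
+ */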
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n",
+ zone_names[ZONE_HIGHMEM], realsize, batch, zone_start_pfn);
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ atomic_set(&zone->refill_counter, 0);
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(size);
+ zone->wait_table_bits =
+ wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)kmalloc(zone->wait_table_size
+ * sizeof(wait_queue_head_t), GFP_KERNEL);
+ /* XXX: wait_table might have to be allocate own node. */
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+
+ pgdat->nr_zones = ZONE_HIGHMEM+1;
+
+ zone->zone_mem_map = lmem_map;
+ zone->zone_start_pfn = zone_start_pfn;
+
+ if ((zone_start_pfn) & (zone_required_alignment-1))
+ printk("BUG: wrong zone alignment, it will crash\n");
+
+ memmap_init_zone(lmem_map, size, nid, ZONE_HIGHMEM, zone_start_pfn);
+
+ for (i = 0; ; i++) {
+ unsigned long bitmap_size;
+
+ INIT_LIST_HEAD(&zone->free_area[i].free_list);
+ if (i == MAX_ORDER-1) {
+ zone->free_area[i].map = NULL;
+ break;
+ }
+
+ /*
+ * Page buddy system uses "index >> (i+1)",
+ * where "index" is at most "size-1".
+ *
+ * The extra "+3" is to round down to byte
+ * size (8 bits per byte assumption). Thus
+ * we get "(size-1) >> (i+4)" as the last byte
+ * we can access.
+ *
+ * The "+1" is because we want to round the
+ * byte allocation up rather than down. So
+ * we should have had a "+7" before we shifted
+ * down by three. Also, we have to add one as
+ * we actually _use_ the last bit (it's [0,n]
+ * inclusive, not [0,n[).
+ *
+ * So we actually had +7+1 before we shift
+ * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+ * (modulo overflows, which we do not have).
+ *
+ * Finally, we LONG_ALIGN because all bitmap
+ * operations are on longs.
+ */
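+ /*
+ * Worked example with hypothetical numbers: size = 65536, i = 0:
+ * (65535 >> 4) = 4095, then +1 and LONG_ALIGN -> 4096 bytes.
+ */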
+ bitmap_size = (size-1) >> (i+4);
+ bitmap_size = LONG_ALIGN(bitmap_size+1);
+ zone->free_area[i].map =
+ (unsigned long *) kmalloc(bitmap_size, GFP_KERNEL);
+ /* XXX: bitmap might have to be allocate own node too. */
+ }
+ }
+}
+
+extern void *node_remap_start_vaddr[];
+
+void free_area_add_node(int nid, struct pglist_data *pgdat,unsigned long *zones_size,
+ unsigned long node_start_pfn, unsigned long *zholes_size)
+{
+ unsigned long size;
+
+ calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+
+ size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+ remap_add_node_kva(nid);
+
+ free_area_add_core(pgdat, zones_size, zholes_size);
+ calculate_addzone_bitmap(pgdat, zones_size);
+
+}
+
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+
+static void node_enable(unsigned long nid)
+{
+ unsigned long idx = nid * MAX_NR_ZONES + ZONE_HIGHMEM;
+ unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+ unsigned long *zholes_size;
+
+ if (nid >= numnodes){ /* XXX : nids are assumed contiguous now, */
+ /* but this should be changed */
+ printk("nid=%lu is not possible to enable\n", nid);
+ return;
+ }
+
+ if (node_online(nid)){
+ printk("nid=%lu is already enabled\n", nid);
+ return;
+ }
+
+ zones_size[ZONE_HIGHMEM] = node_end_pfn[nid] - node_start_pfn[nid];
+ /* XXX: This information should be got from firmware.
+ However, this is emulation. */
+ if( !zones_size[ZONE_HIGHMEM] ){
+ printk("nid=%d is size 0\n",nid);
+ return;
+ }
+
+ zholes_size = get_zholes_size(nid);
+
+ free_area_add_node(nid, NODE_DATA(nid), zones_size, node_start_pfn[nid], zholes_size);
+
+ setup_per_zone_pages_min(); /* set up again */
+
+ rebuild_all_zonelist( nid);
+ memblk_set_online(node_to_memblk(nid));
+ node_set_online(nid);
+ zone_active[idx] = 1;
+
+}
+
+static int mhtest_read(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ char *p;
+ int i, len;
+ const struct zone *z;
+
+ p = page;
+ for(i = 0; ; i++) {
+ z = zone_table[i];
+ if (z == NULL)
+ break;
+ if (! z->present_pages)
+ /* skip empty zone */
+ continue;
+ len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+ zone_active[i] ? "en" : "dis", (int)z->free_pages, (int)z->nr_active,
+ (int)z->present_pages);
+ p += len;
+ }
+ len = p - page;
+
+ if (len <= off + count)
+ *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len < 0)
+ len = 0;
+ if (len > count)
+ len = count;
+
+ return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ unsigned long idx;
+ char buf[64], *p;
+ int i;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ p = strchr(buf, ' ');
+ if (p == NULL)
+ goto out;
+
+ *p++ = '\0';
+ idx = simple_strtoul(p, NULL, 0);
+
+ if (strcmp(buf, "trace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == NULL) {
+ page_trace_list[i] = (struct page *)idx;
+ printk("add trace %lx\n", (unsigned long)idx);
+ goto out;
+ }
+ printk("page_trace_list is full (not added)\n");
+ goto out;
+ } else if (strcmp(buf, "untrace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == (struct page *)idx)
+ break;
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0])) {
+ printk("not registered\n");
+ goto out;
+ }
+ for(; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]) - 1; i++)
+ page_trace_list[i] = page_trace_list[i + 1];
+ page_trace_list[i] = NULL;
+ goto out;
+ }
+ if (idx >= MAX_NUMNODES) {
+ printk("Argument out of range\n");
+ goto out;
+ }
+ if (strcmp(buf, "disable") == 0) {
+ printk("disable node = %d\n", (int)idx); /* XXX */
+ goto out;
+ } else if (strcmp(buf, "purge") == 0) {
+ /* XXX */
+ } else if (strcmp(buf, "enable") == 0) {
+ printk("enable node = %d\n", (int)idx);
+ node_enable(idx);
+ } else if (strcmp(buf, "active") == 0) {
+ /*
+ if (zone_table[idx] == NULL)
+ goto out;
+ spin_lock_irq(&zone_table[idx]->lru_lock);
+ i = 0;
+ list_for_each(l, &zone_table[idx]->active_list) {
+ printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+ i++;
+ if (i == 10)
+ break;
+ }
+ spin_unlock_irq(&zone_table[idx]->lru_lock);
+ printk("\n");
+ */
+ } else if (strcmp(buf, "inuse") == 0) {
+ /*
+ if (zone_table[idx] == NULL)
+ goto out;
+ for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+ if (page_count(&zone_table[idx]->zone_mem_map[i]))
+ printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+ printk("\n");
+ */
+ }
+out:
+ return count;
+}
+
+static int __init procmhtest_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("memhotplug", 0, NULL);
+ if (entry == NULL)
+ return -1;
+
+ entry->read_proc = &mhtest_read;
+ entry->write_proc = &mhtest_write;
+ return 0;
+}
+__initcall(procmhtest_init);
+
+void
+page_trace_func(const struct page *p, const char *func, int line) {
+ int i;
+
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++) {
+ if (page_trace_list[i] == NULL)
+ return;
+ if (page_trace_list[i] == p)
+ break;
+ }
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0]))
+ return;
+
+ printk("Page %lx, %s %d\n", (unsigned long)p, func, line);
+}
+#endif