* Migrate pages from a ccNUMA node to another - patch
@ 2004-03-26  9:18 Zoltan Menyhart
  2004-03-26 17:20 ` Dave Hansen
  0 siblings, 1 reply; 17+ messages in thread
From: Zoltan Menyhart @ 2004-03-26  9:18 UTC (permalink / raw)
  To: linux-ia64, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]

 

[-- Attachment #2: mig-2.6.4-bk4-2004-march-25 --]
[-- Type: text/plain, Size: 65740 bytes --]

diff -Nru 2.6.4.ref/arch/ia64/kernel/acpi.c 2.6.4.mig4/arch/ia64/kernel/acpi.c
--- 2.6.4.ref/arch/ia64/kernel/acpi.c	Tue Mar 16 10:18:04 2004
+++ 2.6.4.mig4/arch/ia64/kernel/acpi.c	Thu Mar 25 08:58:09 2004
@@ -457,6 +457,7 @@
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (pxm_bit_test(i)) {
 			pxm_to_nid_map[i] = numnodes;
+			node_set_online(numnodes);
 			nid_to_pxm_map[numnodes++] = i;
 		}
 	}
diff -Nru 2.6.4.ref/arch/ia64/kernel/entry.S 2.6.4.mig4/arch/ia64/kernel/entry.S
--- 2.6.4.ref/arch/ia64/kernel/entry.S	Tue Mar 16 10:18:04 2004
+++ 2.6.4.mig4/arch/ia64/kernel/entry.S	Thu Mar 25 08:58:09 2004
@@ -1518,7 +1518,11 @@
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall			// 1275
+#if defined(CONFIG_NUMA)
+	data8 sys_page_migrate			// 1276: Migrate pages to another NUMA node
+#else
 	data8 sys_ni_syscall
+#endif
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
diff -Nru 2.6.4.ref/arch/ia64/mm/Makefile 2.6.4.mig4/arch/ia64/mm/Makefile
--- 2.6.4.ref/arch/ia64/mm/Makefile	Tue Mar 16 10:18:04 2004
+++ 2.6.4.mig4/arch/ia64/mm/Makefile	Thu Mar 25 08:58:15 2004
@@ -5,7 +5,7 @@
 obj-y := init.o fault.o tlb.o extable.o
 
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA)	   += numa.o
+obj-$(CONFIG_NUMA)	   += numa.o migrate.o
 obj-$(CONFIG_DISCONTIGMEM) += discontig.o
 ifndef CONFIG_DISCONTIGMEM
 obj-y += contig.o
diff -Nru 2.6.4.ref/arch/ia64/mm/migrate.c 2.6.4.mig4/arch/ia64/mm/migrate.c
--- 2.6.4.ref/arch/ia64/mm/migrate.c	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/arch/ia64/mm/migrate.c	Thu Mar 25 08:58:15 2004
@@ -0,0 +1,1274 @@
+/*
+ * Migrate pages from a ccNUMA node to another.
+ * ============================================
+ *
+ * Version 0.1, 31st of March 2004
+ * By Zoltan Menyhart, Bull S.A. <Zoltan.Menyhart_AT_bull.net@no.spam.org>
+ * The usual GPL applies.
+ *
+ * This is Linux, and explanatory comments / error messages are seen
+ * as a sign of weakness :-)))
+ *
+ * O.K. check out "migrate.txt" and "page_migrate.h".
+ */
+
+
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/rmap-locking.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <asm/rmap.h>
+#include <asm/tlbflush.h>
+#include <asm/page_migrate.h>
+#if defined(_TEST_)
+#include <linux/delay.h>		// For "ia64_get_itc()"
+#endif
+
+#if !defined(CONFIG_DISCONTIGMEM) || !defined(CONFIG_NUMA)
+#error	"That's NUMA-only stuff"
+#endif
+
+
+/*
+ * Type of a virtual address.
+ */
+typedef	unsigned long	vaddr_t;	// Pointers converted to this type
+
+
+#if defined(_TEST_)
+
+// Set the bits - as defined below - for some kernel messages.
+unsigned int _pr_flag_;
+
+#define	PRINT_page	1		// Dump "struct page"-s
+#define	PRINT_mm	2		// Dump "struct mm_struct"-s
+#define	PRINT_vma	4		// Dump "struct vm_area_struct"-s
+#define	PRINT_pte	8		// Show PTE-s, r-maps
+#define	PRINT_errors	0x10
+#define	PRINT_etc	0x20
+#define	PRINT_pgd	0x40		// Show PGD scan
+
+#define PRINT(args...)	do { if (_pr_flag_) printk(args); } while (0)
+#define PRINT_ERR(args...)								\
+			do { if (_pr_flag_ & PRINT_errors) printk(args); } while (0)
+#define PRINT_ETC(args...)								\
+			do { if (_pr_flag_ & PRINT_etc) printk(args); } while (0)
+#define PRINT_PGD(args...)								\
+			do { if (_pr_flag_ & PRINT_pgd) printk(args); } while (0)
+
+static const char	dest_not_online[] =	"Destination node not online\n";
+static const char	no_vma[] =		"Cannot find VMA for address 0x%lx\n";
+static const char	illegal_pid[] =		"Illegal PID\n";
+static const char	inv_n_addresses[] = 	"Invalid number of addresses";
+static const char	ill_va_alias[] =	"v-addr alias in range: 0x%p...0x%p\n";
+static const char	no_momory[] =		"No more memory\n";
+static const char	ill_user_buff[] =	"Illegal user buffer address\n";
+
+void		dump_mm(const struct mm_struct * const);
+void		dump_vma(const struct vm_area_struct * const);
+void		dump_page(const char * const, const struct page * const);
+void		dump_pte_stuff(const pte_t * const);
+phaddr_t	gimme_an_address(const caddr_t);
+
+#define	STATIC
+#define	INLINE
+
+#else	// #if defined(_TEST_)
+
+#define PRINT(args...)		do { } while (0)
+#define PRINT_ERR(args...)	do { } while (0)
+#define PRINT_ETC(args...)	do { } while (0)
+#define PRINT_PGD(args...)	do { } while (0)
+
+#define	dump_mm(vma)		do { } while (0)
+#define	dump_vma(vma)		do { } while (0)
+#define	dump_page(text, page)	do { } while (0)
+#define	dump_pte_stuff(ptep)	do { } while (0)
+
+#define	STATIC			static
+#define	INLINE			inline
+
+#endif	// #if defined(_TEST_)
+
+
+STATIC INLINE long long
+migr_virt_addr_range(const caddr_t, const size_t, const int, const pid_t);
+
+STATIC INLINE long long
+migr_vaddr_range_2(vaddr_t, const vaddr_t, const int, struct mm_struct * const);
+
+STATIC INLINE int
+migr_1_page_by_pte(pte_t * const, const int, struct mm_struct * const);
+
+STATIC INLINE long long
+batch_migrate(const caddr_t, size_t, const int, const pid_t);
+
+STATIC struct mm_struct	*
+look_up_mm(const pid_t);
+
+int
+check_migr_1_page_part_2(struct page * const, struct page * const,
+						struct mm_struct * const, pte_t * const);
+
+int
+get_pages_if_valid(phaddr_t * const, unsigned int);
+
+STATIC INLINE int
+check_pages_if_in_pgd(phaddr_t * const, const size_t, const struct mm_struct * const);
+
+STATIC INLINE int
+check_migrate_1_page(const phaddr_t, const int, struct mm_struct * const);
+
+
+// These are the flags which are copied for the new page:
+#define	FLAG_MASK	(PG_referenced | PG_uptodate | PG_dirty | PG_active |		\
+				PG_highmem | PG_arch_1 | PG_private | PG_writeback |	\
+				PG_nosave | PG_mappedtodisk | PG_reclaim | PG_compound)
+
+
+#if defined(_NEED_STATISTICS_)
+
+/*
+ * Statistics are accessed in a non atomic way. Who cares? Just some statistics :-)
+ */
+STATIC struct _statistics_	_statistics;
+STATIC struct _statistics_size_	_statistics_sizes = {sizeof _statistics, MAX_NUMNODES};
+
+#define	DECLARE_ITC_VAR(var)		unsigned long var
+#define	SAVE_ITC(var)			var = ia64_get_itc()
+#define	STORE_DELAY(var, destination)	_statistics.t.destination += ia64_get_itc() - var
+#define	COUNT(what)			_statistics.c.what++
+#define	ERROR_CNT(what)			_statistics.e.what++
+#define	ERROR_CNT_ADD(var, delta)	_statistics.e.var += delta
+
+STATIC INLINE int	page_migrate_statistics(const caddr_t, const int);
+
+#else
+
+#define	DECLARE_ITC_VAR(var)
+#define	SAVE_ITC(var)			do { } while (0)
+#define	STORE_DELAY(var, destination)	do { } while (0)
+#define	COUNT(what)			do { } while (0)
+#define	ERROR_CNT(what)			do { } while (0)
+#define	ERROR_CNT_ADD(var, delta)	do { } while (0)
+
+#endif	// #if defined(_NEED_STATISTICS_)
+
+
+/*
+ * Migrate pages from a NUMA node to another (and some other minor services).
+ * (See "migrate.txt" and "page_migrate.h".)
+ *
+ * As usual, "-Exxx" returned on errors.
+ */
+asmlinkage long long
+sys_page_migrate(const int cmd, const caddr_t address, const size_t length,
+							const int node, const pid_t pid)
+{
+	long long	rc;
+	DECLARE_ITC_VAR(time);			// Total time for "sys_page_migrate()"
+
+	SAVE_ITC(/* out */ time);
+	PRINT("\nsys_page_migrate(%d, 0x%p, 0x%lx, %d, %d): pid = %d\n",
+					cmd, address, length, node, pid, current->pid);
+	switch (cmd){
+	//
+	// Migrate some pages from a NUMA node to another.
+	//
+	case _PHADDR_BATCH_MIGRATE_:
+		if (!node_online(node)){
+			PRINT_ERR(dest_not_online);
+			ERROR_CNT(bad_request);
+			rc = -ENODEV;
+			break;
+		}
+		if (length > PAGE_SIZE / sizeof(phaddr_t)){
+			PRINT_ERR(inv_n_addresses);
+			ERROR_CNT(bad_request);
+			rc = -EINVAL;
+			break;
+		}
+		rc = batch_migrate(/* user buffer */ address, /* buffer */ length,
+									node, pid);
+		break;
+	//
+	// Migrate virtual address range.
+	//
+	case _VA_RANGE_MIGRATE_:
+		if (!node_online(node)){
+			PRINT_ERR(dest_not_online);
+			ERROR_CNT(bad_request);
+			rc = -ENODEV;
+			break;
+		}
+		//
+		// Some architectures do not decode all the MSB-s of virtual addresses
+		// for the PGD, PMD and PTE indices, i.e. they have got holes or aliases
+		// in the virtual address space. Make sure that "length" does not span
+		// virtual address holes, nor does it create an illegal alias to an
+		// otherwise valid address.
+		//
+		if (__IS_VA_ALIAS((vaddr_t) address, length)){
+			PRINT_ERR(ill_va_alias, address, address + length);
+			ERROR_CNT(non_existent_addr);
+			rc = -EFAULT;
+			break;
+		}
+		rc = migr_virt_addr_range(/* user virtual */ address,
+						/* address range */ length, node, pid);
+		break;
+
+#if defined(_NEED_STATISTICS_)
+	case _STATISTICS_:
+		rc = page_migrate_statistics(/* user buffer */ address,
+						/* ? clear statistics ? */ length != 0);
+		break;
+	case _SIZEOF_STATISTICS_:
+		rc =  *(long long *) &_statistics_sizes;	// Yeh, I know...
+		break;
+#endif
+#if defined(_TEST_)
+	case _GIMME_AN_ADDRESS_:
+		rc = (long long) gimme_an_address(/* user virtual */ address);
+		break;
+#endif
+	default:
+		ERROR_CNT(bad_request);
+		rc = -EINVAL;
+		break;
+	}
+	STORE_DELAY(/* in */ time, /* out */ total);
+	return rc;
+}
+
+
+/*
+ * Migrate virtual address range of a process.
+ *
+ * Arguments:	address:	Starting virtual address in a process's address space
+ *		length:		Length of the address range to be migrated
+ *		node:		Destination NUMA node
+ *		pid:		ID of the victim process, "0" means myself
+ *
+ * Returns:	On (partial) success, the number of the pages actually migrated is
+ *		returned (in form of "struct _un_success_count_").
+ *		As usual, "-Exxx" returned on errors.
+ */
+STATIC INLINE long long
+migr_virt_addr_range(const caddr_t address, const size_t length, const int node,
+									const pid_t pid)
+{
+	const vaddr_t		ulimit = (vaddr_t) address + length;
+	struct mm_struct	*mm;
+	long long		rc;
+	struct vm_area_struct	*beg_vma;
+	DECLARE_ITC_VAR(vma_time);		// Time for "find_vma()"
+	DECLARE_ITC_VAR(mmap_sem);		// Time for "down_read(&mm->mmap_sem)"
+	DECLARE_ITC_VAR(pgd_lock);		// "spin_lock(&mm->page_table_lock)"
+	DECLARE_ITC_VAR(pgd_unlock);		// "spin_unlock(&mm->page_table_lock)"
+
+	if (pid != 0 && pid != current->pid){
+		//
+		//  Look up the "mm_struct" belonging to the process ID.
+		//
+		if ((mm = look_up_mm(pid)) == NULL){
+			PRINT_ERR(illegal_pid);
+			ERROR_CNT(bad_request);
+			return -ESRCH;
+		}
+		//
+		// On success, "mm->mm_users" got incremented to make sure that
+		// "mm_struct" does not go away.
+		//
+	} else {
+		mm = current->mm;
+		//
+		// Actually, there is no need to grab "mm": it is ours and won't go
+		// away in the meantime. But as we do not want to ask questions
+		// when releasing it, it is safe just to increment the counter:
+		// it is ours.
+		//
+		atomic_inc(&mm->mm_users);
+	}
+	SAVE_ITC(/* out */ mmap_sem);
+	down_read(&mm->mmap_sem);			// Protect the VMA list
+	STORE_DELAY(/* in */ mmap_sem, /* out */ mmap_sem);
+	dump_mm(mm);
+	//
+	// Check if the starting virtual "address" is valid.
+	//
+	SAVE_ITC(/* out */ vma_time);
+	beg_vma = find_vma(mm, (vaddr_t) address);	// Look up the first VMA for
+							// which "address < ->vm_end"
+	STORE_DELAY(/* in */ vma_time, /* out */ find_vma);
+	if (beg_vma == NULL || beg_vma->vm_start > (vaddr_t) address){
+		if (beg_vma != NULL)
+			dump_vma(beg_vma);
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+		PRINT_ERR(no_vma, (vaddr_t) address);
+		ERROR_CNT(non_existent_addr);
+		return -EFAULT;
+	}
+	//
+	// It is safe to start walking the PGD, the PMD and the PTE at "address".
+	//
+	dump_vma(beg_vma);
+	//
+	// We need the page table lock to synchronize with "kswapd"
+	// and the SMP-safe atomic PTE updates.
+	//
+	SAVE_ITC(/* out */ pgd_lock);
+	spin_lock(&mm->page_table_lock);
+	STORE_DELAY(/* in */ pgd_lock, /* out */ pgd_lock);
+	//
+	// Look up pages in the PGD and migrate them one by one.
+	//
+	rc = migr_vaddr_range_2((vaddr_t) address & PAGE_MASK,
+					/* round up */ PAGE_ALIGN(ulimit), node, mm);
+	//
+	// Let the others complete the page fault handler code. They will find the
+	// condition "someone has already installed the PTE" to be TRUE.
+	//
+	SAVE_ITC(/* out */ pgd_unlock);
+	spin_unlock(&mm->page_table_lock);
+	STORE_DELAY(/* in */ pgd_unlock, /* out */ pgd_unlock);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return rc;
+}
+
+
+/*
+ * Migrate virtual address range belonging to a PGD.
+ *
+ * Arguments:	address:	Starting virtual address in a process's address space
+ *		ulimit:		The address range is below this upper limit
+ *		node:		Destination NUMA node
+ *		mm:		-> victim "mm_struct"
+ *
+ * Returns:	On (partial) success, the number of the pages actually migrated is
+ *		returned in "struct _un_success_count_".
+ *		As usual, "-Exxx" returned on errors.
+ *
+ * Notes:	- We've already checked that "[address...ulimit)" is inside the allowed
+ *		  user virtual address range.
+ *		- Caller has to hold "mm->mmap_sem" for read and "mm->page_table_lock".
+ */
+STATIC INLINE long long
+migr_vaddr_range_2(vaddr_t address, const vaddr_t ulimit, const int node,
+							struct mm_struct * const mm)
+{
+	unsigned long			g, m, e;	// PGD, PMD and PTE indices
+	const pgd_t			*pgd;
+	const pmd_t			*pmd;
+	pte_t				*pte, *pte0;
+	int				rc;
+	struct _un_success_count_	count = {0, 0};
+	DECLARE_ITC_VAR(pgd_scan_t);			// PGD scan time
+
+	//
+	// We've already checked that it is safe to start walking the PGD, the PMD and
+	// the PTE at "address". We've also checked that "[address...ulimit)" does not
+	// span virtual address holes, nor does it create an illegal alias to an
+	// otherwise valid address.
+	//
+	g = pgd_index(address);			// PGD scan starts here
+	m = pmd_index(address);			// The 1st PMD scan starts here
+	e = pte_index(address);			// The 1st PTE scan starts here
+	//
+	// Check the user pages only, starting at the PTE corresponding to "address".
+	// Note: "mm->pgd" is an identity mapped virtual address.
+	//
+	SAVE_ITC(/* out */ pgd_scan_t);
+	for (pgd = mm->pgd + g; address < ulimit && g < USER_PTRS_PER_PGD;
+					// PMD scans other than the 1st start at index 0
+								m = 0, g++, pgd++){
+		PRINT_PGD("address: 0x%016lx pgd: 0x%p ", address, pgd);
+		PRINT_PGD("g: 0x%lx m: 0x%lx e: 0x%lx\n", g, m, e);
+		PRINT_PGD("__VA():\t 0x%016lx\n", __VA(g, m, e));
+		//
+		// "pgd" contains an identity mapped virtual address.
+		// Migration tolerates holes in the virtual address space.
+		//
+		if (pgd_none(*pgd) || pgd_bad(*pgd)){
+			address &= ~(PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE - 1);
+			address += PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE;
+			continue;
+		}
+		//
+		// "*pgd" is a physical address.
+		//
+		for (pmd = pmd_offset(pgd, 0) + m; m < PTRS_PER_PMD && address < ulimit;
+					// PTE scans other than the 1st start at index 0
+								e = 0, m++, pmd++){
+			//
+			// "pmd" contains an identity mapped virtual address.
+			// Migration tolerates holes in the virtual address space.
+			//
+			if (pmd_none(*pmd) || pmd_bad(*pmd)){
+				address &= ~(PTRS_PER_PTE * PAGE_SIZE - 1);
+				address += PTRS_PER_PTE * PAGE_SIZE;
+				continue;
+			}
+			//
+			// "*pmd" is a physical address.
+			//
+			pte0 = pte_offset_map(pmd, 0);
+			//
+			// "pte0" contains some kind of virtual address of the
+			// beginning of a PTE page.
+			//
+			for (pte = pte0 + e; e < PTRS_PER_PTE && address < ulimit;
+						address += PAGE_SIZE, e++, pte++){
+				if (!pte_present(*pte))
+					continue;
+				//
+				// We've found a page... Let's move it.
+				//
+				PRINT("\nVirtual addr:\t0x%016lx\n", __VA(g, m, e));
+				STORE_DELAY(/* in */ pgd_scan_t, /* out */ pgd_scan);
+				if ((rc = migr_1_page_by_pte(pte, node, mm)) < 0){
+					pte_unmap(pte0);
+					return rc;
+				}
+				SAVE_ITC(/* out */ pgd_scan_t);
+				if (rc > 0)
+					count.successful++;
+				else
+					count.failed++;
+			}
+			pte_unmap(pte0);
+		}
+	}
+	STORE_DELAY(/* in */ pgd_scan_t, /* out */ pgd_scan);
+	return *(long long *) &count;		// Yeh, I know...
+}
+
+
+/*
+ * Common part of checking & migrating the pages one by one.
+ *
+ * Arguments:	src_node:	Source NUMA node
+ *		old_p:		-> old page structure
+ *		node:		Destination NUMA node
+ *		mm:		-> victim "mm_struct"
+ *		pte:		-> PTE of the page to be moved
+ *
+ * Returns:	1:		Migration O. K.
+ *		0:		Minor error, no actual migration has been done
+ *		-Exxx:		Catastrophic error
+ *
+ * Notes:	- "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ *		- The old page is "get_page()"-ed on entry to meke sure it does not go
+ *		- The old page is "get_page()"-ed on entry to make sure it does not go
+ *		  away in the meantime - on return it gets "put_page()"-ed.
+STATIC int
+common_check_migrate_1_page(const int src_node, struct page * const old_p,
+			const int node, struct mm_struct * const mm, pte_t * const pte)
+{
+	struct page	*new_p;
+	int		rc;
+	DECLARE_ITC_VAR(alloc_time);	// Time for "vmalloc()"
+	DECLARE_ITC_VAR(lock_time);	// Time for "lock_page()"
+	DECLARE_ITC_VAR(unlock_time);	// Time for "unlock_page()"
+	DECLARE_ITC_VAR(free_time);	// "__free_pages()", "page_cache_release()"
+
+	//
+	// Allocate the new page in advance. It is less dangerous
+	// - to have a page "floating around" and then take locks
+	// than
+	// - to acquire some locks (e.g. to be able to check the conditions)
+	//   and then allocate the page
+	//
+	// Do not insist on allocating the page...
+	//
+	SAVE_ITC(/* out */ alloc_time);
+	new_p = alloc_pages_node(node, GFP_HIGHUSER | __GFP_NORETRY, 0);
+	STORE_DELAY(/* in */ alloc_time, /* out */ page_alloc);
+	if (new_p == NULL){
+		put_page(old_p);
+		PRINT_ERR("No more memory on node %d\n", node);
+		ERROR_CNT(no_memory);
+		return -ENOMEM;
+	}
+	SAVE_ITC(/* out */ lock_time);
+	lock_page(old_p);
+	STORE_DELAY(/* in */ lock_time, /* out */ page_lock);
+
+	//
+	// Would be too long to do everything here.
+	//
+	rc = check_migr_1_page_part_2(old_p, new_p, mm, pte);
+
+	SAVE_ITC(/* out */ unlock_time);
+	unlock_page(old_p);
+	STORE_DELAY(/* in */ unlock_time, /* out */ page_unlock);
+	PRINT("check_migr_1_page_part_2() returned: %d\n", rc);
+	if (rc == 0){
+		//
+		// The old page was "lru_cache_add_active()"-ed e.g. in
+		// "do_anonymous_page()". As on entry the old page was again
+		// "get_page()"-ed, its reference counter is at least 2 right now.
+		//
+		page_cache_release(old_p);
+		_statistics.count[src_node][node]++;
+	} else{
+		SAVE_ITC(/* out */ free_time);
+		__free_pages(new_p, 0);
+		STORE_DELAY(/* in */ free_time, /* out */ page_free);
+	}
+	//
+	// On success, "put_page()" sets free the old page
+	// (unless someone else got hold of it in the meantime).
+	//
+	SAVE_ITC(/* out */ free_time);
+	put_page(old_p);
+	STORE_DELAY(/* in */ free_time, /* out */ page_free);
+	return rc == 0 ? 1 : 0;
+}
+
+
+/*
+ * Migrate a page identified by its PTE.
+ *
+ * Arguments:	pte:		-> PTE of the page to be moved
+ *		node:		Destination NUMA node
+ *		mm:		-> victim "mm_struct"
+ *
+ * Returns:	1:		Success
+ *		0:		We cannot cope with this page (it is valid, though)
+ *		-Exxx:		Fatal errors
+ *
+ * Note:	"mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ */
+STATIC INLINE int
+migr_1_page_by_pte(pte_t * const pte, const int node, struct mm_struct * const mm)
+{
+	const phaddr_t	old_addr = pte_val(*pte) & _PFN_MASK;
+	const int	src_node = paddr_to_nid(old_addr);
+	struct page	* const old_p = pfn_to_page(old_addr >> PAGE_SHIFT);
+
+	dump_page("\nOld", old_p);
+	if (node == src_node){
+		PRINT_ETC("Old ph adr:\t0x%016llx old node: %d new node: %d\n",
+							old_addr, src_node, node);
+		return 1;			// Done :-)
+	}
+	//
+	// Actually, there is no need to grab the old page: it is certain that it
+	// has been "get_page()"-ed before and we still hold "->page_table_lock".
+	// We are going to invoke "common_check_migrate_1_page()", which is also used
+	// by the physical-address-driven migration. The latter - not knowing in
+	// advance which process a page belongs to and which "->page_table_lock" to
+	// take - needs to grab the old page.
+	// Invoking the common service requires us to do the same.
+	//
+	get_page(old_p);			// Should we call "page_cache_get()" ?
+	//
+	// The old page will be "put_page()"-ed.
+	//
+	return common_check_migrate_1_page(src_node, old_p, node, mm, pte);
+}
+
+
+/*
+ * Migrate some pages identified by their physical address from a NUMA node to another.
+ *
+ * Arguments:	table:		-> the user buffer containing the physical addresses of
+ *				the pages to be migrated.
+ *				Max. "PAGE_SIZE / sizeof(phaddr_t)" of them can be
+ *				migrated at once.
+ *		n:		Number of the physical page addresses
+ *		node:		Destination NUMA node
+ *		pid:		Pages are assumed to belong to this process
+ *
+ * Returns:	On (partial) success, the number of the pages actually migrated is
+ *		returned (in form of "struct _un_success_count_").
+ *		As usual, "-Exxx" returned on errors.
+ */
+STATIC INLINE long long
+batch_migrate(const caddr_t table, size_t n, const int node, const pid_t pid)
+{
+	int				rc;
+	phaddr_t			*p, *bp;
+	struct mm_struct		*mm;
+	struct _un_success_count_	count = { 0, 0};
+	DECLARE_ITC_VAR(alloc_time);		// Time for "vmalloc()"
+	DECLARE_ITC_VAR(mmap_sem);		// Time for "down_read(&mm->mmap_sem)"
+	DECLARE_ITC_VAR(pgd_lock);		// "spin_lock(&mm->page_table_lock)"
+	DECLARE_ITC_VAR(pgd_unlock);		// "spin_unlock(&mm->page_table_lock)"
+	DECLARE_ITC_VAR(pgd_scan_t);		// PGD scan time
+
+	if (pid != 0 && pid != current->pid){
+		//
+		//  Look up the "mm_struct" belonging to the process ID.
+		//
+		if ((mm = look_up_mm(pid)) == NULL){
+			PRINT_ERR(illegal_pid);
+			ERROR_CNT(bad_request);
+			return -ESRCH;
+		}
+		//
+		// On success, "mm->mm_users" got incremented to make sure that
+		// "mm_struct" does not go away.
+		//
+	} else {
+		mm = current->mm;
+		//
+		// Actually, there is no need to grab "mm": it is ours and won't go
+		// away in the meantime. But as we do not want to ask questions
+		// when releasing it, it is safe just to increment the counter:
+		// it is ours.
+		//
+		atomic_inc(&mm->mm_users);
+	}
+	//
+	// Fetch the table of the addresses.
+	//
+	SAVE_ITC(/* out */ alloc_time);
+	bp = vmalloc(PAGE_SIZE);
+	STORE_DELAY(/* in */ alloc_time, /* out */ page_alloc);
+	if (bp == NULL){
+		mmput(mm);
+		PRINT_ERR(no_momory);
+		ERROR_CNT(no_memory);
+		return -ENOMEM;
+	}
+	if (copy_from_user(bp, table, n * sizeof(phaddr_t)) != 0){
+		vfree(bp);
+		mmput(mm);
+		PRINT_ERR(ill_user_buff);
+		ERROR_CNT(bad_request);
+		return -EFAULT;
+	}
+	SAVE_ITC(/* out */ mmap_sem);
+	down_read(&mm->mmap_sem);		// Protect the VMA list
+	STORE_DELAY(/* in */ mmap_sem, /* out */ mmap_sem);
+	dump_mm(mm);
+	//
+	// We need the page table lock to synchronize with "kswapd"
+	// and the SMP-safe atomic PTE updates.
+	//
+	SAVE_ITC(/* out */ pgd_lock);
+	spin_lock(&mm->page_table_lock);
+	STORE_DELAY(/* in */ pgd_lock, /* out */ pgd_lock);
+	//
+	// Check to see if the pages are mapped by "mm->pgd" as user pages.
+	// For those which are, call "get_page()" to make sure they do not go away.
+	// "1" will be OR-ed to invalid addresses.
+	//
+	SAVE_ITC(/* out */ pgd_scan_t);
+	rc = check_pages_if_in_pgd(bp, n, mm);
+	STORE_DELAY(/* in */ pgd_scan_t, /* out */ pgd_scan);
+	if (rc >= 0){
+		//
+		// The number of the valid addresses is equal to "rc".
+		//
+		ERROR_CNT_ADD(non_existent_addr, n - rc);
+		for (n = rc, p = bp; n > 0; p++){
+#if defined(_TEST_)
+			if (p - bp >= PAGE_SIZE / sizeof(phaddr_t))
+				panic("\nAddress table overflow\n");
+#endif
+			if (*p & 1)		// Address has not been validated
+				continue;
+			//
+			// Check & migrate the next page.
+			// The old page gets "put_page()"-ed.
+			//
+			if ((rc = check_migrate_1_page(*p, node, mm)) < 0)
+				break;
+			else if (rc > 0){
+				count.successful++;
+				n--;		// Decrement for a good address only
+			} else
+				count.failed++;
+		}
+		if (rc >= 0)
+			rc = *(long long *) &count;	// Yeh, I know...
+	}
+	//
+	// Let the others complete the page fault handler code. They will find the
+	// condition "someone has already installed the PTE" to be TRUE.
+	//
+	SAVE_ITC(/* out */ pgd_unlock);
+	spin_unlock(&mm->page_table_lock);
+	STORE_DELAY(/* in */ pgd_unlock, /* out */ pgd_unlock);
+	up_read(&mm->mmap_sem);
+	vfree(bp);
+	mmput(mm);				// Decrement "mm->mm_users" and free
+						//	"*mm" if the counter becomes zero
+	return rc;
+}
+
+
+/*
+ * Check & migrate the pages one by one.
+ *
+ * Arguments:	address:	Physical address of the page to be migrated
+ *		node:		Destination NUMA node
+ *		mm:		-> victim "mm_struct"
+ *
+ * Returns:	1:		Migration O. K.
+ *		0:		Minor error, no actual migration has been done
+ *		-Exxx:		Catastrophic error
+ *
+ * Notes:	- "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ *		- The old page is "get_page()"-ed on entry to meke sure it does not go
+ *		  away in the mean time - on return it gets "put_page()"-ed.
+ */
+STATIC INLINE int
+check_migrate_1_page(const phaddr_t address, const int node, struct mm_struct * const mm)
+{
+	const int		src_node = paddr_to_nid(address);
+	const unsigned long	old_pfn = address >> PAGE_SHIFT;
+	struct page		*old_p;
+
+	//
+	// Should be revised for the node hot plug :-)
+	//
+#if defined(_TEST_)
+	if (src_node == -1)
+		panic("\nCannot map source address to node\n");
+	if (!node_online(src_node))
+		panic("\nSource node not online\n");
+	if (!pfn_valid(old_pfn))
+		panic("\nNot a valid source pfn\n");
+#endif
+	old_p = pfn_to_page(old_pfn);
+	if (node == src_node){
+		PRINT_ETC("Old ph adr:\t0x%016llx old node: %d new node: %d\n",
+								address, src_node, node);
+		put_page(old_p);
+		return 1;			// Done :-)
+	}
+	return common_check_migrate_1_page(src_node, old_p, node, mm, NULL);
+}
+
+
+/*
+ * The real page migration is done here.
+ *
+ * Arguments:	old:		-> old page structure
+ *		new:		-> new page structure
+ *		mm:		-> victim "mm_struct"
+ *		pte:		-> PTE of the page to be moved
+ *
+ * Returns:	Negative values (like -Exxx) indicate errors
+ *
+ * Notes:	- The old page has to be "get_page()"-ed and locked.
+ *		- Its "pte_chain" has to be locked.
+ *		- The new page and its "pte_chain" have to be locked.
+ *		- "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ */
+STATIC INLINE int
+page_migrate_2(struct page * const old, struct page * const new,
+					struct mm_struct * const mm, pte_t *pte_p)
+{
+	const struct page	*pte_page;
+	struct vm_area_struct	*vma;
+	pte_t			pte;
+	vaddr_t			vaddress;
+	DECLARE_ITC_VAR(vma_time);		// Time for "find_vma()"
+	DECLARE_ITC_VAR(flush_tlb_time);	// Time for "flush_tlb_page()"
+	DECLARE_ITC_VAR(add_lru_time);		// Time for "lru_cache_add_active()"
+	DECLARE_ITC_VAR(copy_time);		// Time for "copy_user_highpage()"
+	DECLARE_ITC_VAR(upd_mmu_cache);		// Time for "update_mmu_cache()"
+
+	if (!PageDirect(old)){
+		PRINT_ERR("Direct mapped pages only\n");
+		ERROR_CNT(page_type_not_supp);
+		return -EFAULT;
+	}
+	if (pte_p == NULL)			// Architecture independent code :-)
+		pte_p = rmap_ptep_map(old->pte.direct);
+	//
+	// "struct page" of the page that hosts the PTE.
+	//
+	pte_page = kmap_atomic_to_page(pte_p);	// Architecture independent code :-)
+	//
+	// "pte_page->mapping" points at the victim process's "mm_struct"
+	//
+#if defined(_TEST_)
+	if (mm != (struct mm_struct *) pte_page->mapping)
+		panic("\nBroken r-map ???\n");
+	dump_mm(mm);
+#endif
+	//
+	// "page->index" has the high bits of the address; the lower bits of the address
+	// are calculated from the offset of the PTE within the page table page.
+	//
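+	// (A worked example, assuming 16 KB pages and 8-byte PTEs: a PTE at byte
+	// offset 0x18 within its page table page is the 4th entry there, so it
+	// maps "pte_page->index + 3 * PAGE_SIZE", which is the same value as
+	// "pte_page->index + 0x18 * PTRS_PER_PTE" with PTRS_PER_PTE == 2048.)
+	//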
+	vaddress = pte_page->index + ((unsigned long) pte_p & ~PAGE_MASK) * PTRS_PER_PTE;
+	PRINT("Virtual addr:\t0x%lx\n", vaddress);
+	//
+	// Double check if the virtual address is still valid.
+	//
+	SAVE_ITC(/* out */ vma_time);
+	vma = find_vma(mm, vaddress);		// "vma" cache should help much
+	STORE_DELAY(/* in */ vma_time, /* out */ find_vma);
+	if (vma == NULL || vma->vm_start > vaddress){
+		PRINT_ERR("During mremap() ?\n");
+		ERROR_CNT(page_gone_away);
+		rmap_ptep_unmap(pte_p);
+		return -EFAULT;
+	}
+	dump_vma(vma);
+	//
+	// Nuke the page table entry.
+	//
+	flush_cache_page(vma, vaddress);	// Architecture independent code :-)
+	pte = ptep_get_and_clear(pte_p);
+	SAVE_ITC(/* out */ flush_tlb_time);
+	flush_tlb_page(vma, vaddress);		// Architecture independent code :-)
+	STORE_DELAY(/* in */ flush_tlb_time, /* out */ flush_tlb);
+	//
+	// From now on, the other CPUs cannot touch the content of the page. Should they
+	// try to, they would observe page faults. They get past "mmap_sem" easily because
+	// they take it for read, too. As we hold "page_table_lock", they queue up in the
+	// page fault handler.
+	//
+	PRINT("Old ph addr:\t0x%016lx\n", page_to_phys(old));
+	PRINT("Old PTE:\t0x%016lx\n", pte_val(pte));
+	PRINT("_PFN_MASK:\t0x%016lx\n", _PFN_MASK);
+	//
+	// Copy some of the page structure.
+	//
+	dump_page("Source", old);
+	new->flags = (new->flags & ~FLAG_MASK) | (old->flags & FLAG_MASK);
+	new->pte.direct = old->pte.direct;
+	SetPageDirect(new);			// Direct mapped pages only
+	old->pte.direct = NULL;
+	ClearPageDirect(old);
+	if (PagePrivate(new))
+		new->private = old->private;
+	SAVE_ITC(/* out */ add_lru_time);
+	lru_cache_add_active(new);
+	STORE_DELAY(/* in */ add_lru_time, /* out */ add_lru);
+	dump_page("New", new);
+	//
+	// Here is where the data is copied.
+	//
+	SAVE_ITC(/* out */ copy_time);
+	copy_user_highpage(new, old, vaddress);	// Architecture independent code :-)
+	STORE_DELAY(/* in */ copy_time, /* out */ copy);
+	//
+	// The new PTE keeps everything but the PFN.
+	//
+	pte = mk_pte(new, __pgprot((pte_val(pte) & ~_PFN_MASK)));
+	PRINT("New ph addr:\t0x%016lx\nNew PTE:\t0x%016lx\n\n",
+							page_to_phys(new), pte_val(pte));
+	set_pte(pte_p, pte);
+	SAVE_ITC(/* out */ upd_mmu_cache);
+	update_mmu_cache(vma, vaddress, pte);	// Architecture independent code :-)
+	STORE_DELAY(/* in */ upd_mmu_cache, /* out */ update_mmu_cache);
+	rmap_ptep_unmap(pte_p);
+	return 0;
+}
+
+
+/*
+ * Some more checks, then go on with the page migration.
+ *
+ * Arguments:	old:		-> old page structure
+ *		new:		-> new page structure
+ *		mm:		-> victim "mm_struct"
+ *		pte:		-> PTE of the page to be moved
+ *
+ * Returns:	Negative values indicate errors
+ *
+ * Notes:	- The old page has to be "get_page()"-ed and locked.
+ *		- "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ */
+STATIC INLINE int
+check_migr_1_page_part_2(struct page * const old, struct page * const new,
+					struct mm_struct * const mm, pte_t * const pte)
+{
+	int rc;
+	DECLARE_ITC_VAR(pte_chain_lock_time);	// Time for "pte_chain_lock()"
+	DECLARE_ITC_VAR(unlock_time);		// Time for "unlock_page()"
+
+	if (PageReserved(old)){
+		PRINT_ERR("What shall I do with a reserved page ?\n");
+		ERROR_CNT(page_type_not_supp);
+		return -ENXIO;
+	}
+	if (PageError(old)){
+		PRINT_ERR("Page has got error(s)\n");
+		ERROR_CNT(errors);
+		return -EIO;
+	}
+	if (!PageUptodate(old)){
+		PRINT_ERR("Page has no valid data ???\n");
+//		return -EIO;
+	}
+	if (PageCompound(old)){
+		PRINT_ERR("What shall I do with a compound page ?\n");
+		ERROR_CNT(page_type_not_supp);
+		return -ENXIO;
+	}
+	if (old->mapping != NULL){
+		PRINT_ERR("Anonymous pages only\n");
+		ERROR_CNT(page_type_not_supp);
+		return -ENXIO;
+	}
+	if (PageSwapCache(old)){
+		PRINT_ERR("What shall I do with a page in swap cache ?\n");
+		ERROR_CNT(page_type_not_supp);
+		return -ENXIO;
+	}
+	if (PageHighMem(old)){
+		PRINT_ERR("What shall I do with a HIGHMEM page ?\n");
+		ERROR_CNT(page_type_not_supp);
+		return -ENXIO;
+	}
+	SAVE_ITC(/* out */ pte_chain_lock_time);
+	pte_chain_lock(old);
+	STORE_DELAY(/* in */ pte_chain_lock_time, /* out */ pte_chain_lock);
+	if (!page_mapped(old)){			// Actually means "r-mapped"
+		PRINT_ERR("Page not in r-map\n");
+		pte_chain_unlock(old);
+		ERROR_CNT(page_type_not_supp);
+		return -EFAULT;
+	}
+	//
+	// As nobody else should know about this new page, taking these locks should not
+	// be in conflict with anything.
+	//
+	if (TestSetPageLocked(new))
+		panic("\nSomeone is stealing my new page\n");
+	if (!pte_chain_trylock(new))
+		panic("\nSomeone is stealing my new pte chain\n");
+
+	//
+	// The real page migration.
+	//
+	rc = page_migrate_2(old, new, mm, pte);
+
+	pte_chain_unlock(new);
+	SAVE_ITC(/* out */ unlock_time);
+	unlock_page(new);
+	STORE_DELAY(/* in */ unlock_time, /* out */ new_page_unlock);
+	pte_chain_unlock(old);
+	return rc;
+}
+
+
+/*
+ * Check to see if the pages are mapped by "mm->pgd" as user pages.
+ * For those which are, call "get_page()" to make sure they do not go away.
+ *
+ * Arguments:	phaddresses:	-> the user buffer containing the physical addresses of
+ *				the pages to be migrated
+ *		n:		Number of the physical page addresses
+ *		mm:		-> victim "mm_struct"
+ *
+ * Returns:	The number of the addresses validated
+ *
+ * Note:	Caller has to hold "mm->mmap_sem" for read and "mm->page_table_lock".
+ */
+STATIC INLINE int
+check_pages_if_in_pgd(phaddr_t * const phaddresses, const size_t n,
+						const struct mm_struct * const mm)
+{
+	const pgd_t	*pgd;
+	const pmd_t	*pmd;
+	const pte_t	*pte, *pte0;
+	unsigned long	g, m, e;
+	unsigned int	i, found = 0;
+	phaddr_t	*p;
+
+	//
+	// Mark the addresses as not already validated.
+	//
+	for (i = 0, p = phaddresses; i < n; i++, p++)
+		*p |= 1;			// "(1 & _PFN_MASK) == 0"
+	//
+	// Check the user pages only.
+	// Note: "mm->pgd" is an identity mapped virtual address.
+	//
+	for (g = FIRST_USER_PGD_NR, pgd = mm->pgd + g; g < USER_PTRS_PER_PGD;
+									g++, pgd++){
+		if (pgd_none(*pgd) || pgd_bad(*pgd))
+			continue;
+		//
+		// "*pgd" is a physical address.
+		//
+		for (m = 0, pmd = pmd_offset(pgd, 0); m < PTRS_PER_PMD; m++, pmd++){
+			//
+			// "pmd" contains an identity mapped virtual address.
+			//
+			if (pmd_none(*pmd) || pmd_bad(*pmd))
+				continue;
+			//
+			// "*pmd" is a physical address.
+			//
+			pte0 = pte_offset_map(pmd, 0);
+			//
+			// "pte0" contains some kind of virtual address of the
+			// beginning of a PTE page.
+			//
+			for (e = 0, pte = pte0; e < PTRS_PER_PTE; e++, pte++){
+				if (!pte_present(*pte))
+					continue;
+				//
+				// Check this PTE against the list of the addresses.
+				//
+				for (i = 0, p = phaddresses; i < n; i++, p++){
+					if ((pte_val(*pte) & _PFN_MASK) !=
+									(*p & _PFN_MASK))
+						continue;
+					*p &= _PFN_MASK;	// Validate the address
+					PRINT("Virtual addr:\t0x%016lx\n",
+									__VA(g, m, e));
+					//
+					// Make sure the page does not go away.
+					// Should we call "page_cache_get()" instead ?
+					//
+					get_page(pfn_to_page(*p >> PAGE_SHIFT));
+					if (++found == n){
+						pte_unmap(pte0);
+						return found;
+					}
+					//
+					// There should be no more than one address on
+					// the list that matches this PTE.
+					//
+					break;
+				}
+			}
+			pte_unmap(pte0);
+		}
+	}
+	return found;
+}
+
+
+/*
+ * Look up an "mm_struct" belonging to a process ID.
+ *
+ * "NULL" is returned on failure.
+ *
+ * Notes:	- On success, "->mm_users" gets incremented to make sure that "mm_struct"
+ *		  does not go away.
+ *		- "->mm" of a kernel thread is "NULL"; anyway, we don't dare to touch a
+ *		  kernel thread
+ */
+STATIC struct mm_struct *
+look_up_mm(const pid_t pid)
+{
+	struct task_struct	*p;
+	struct mm_struct	*mm;
+	DECLARE_ITC_VAR(time);			// "mm" look up time
+
+	SAVE_ITC(/* out */ time);
+	read_lock(&tasklist_lock);
+	if ((p = find_task_by_pid(pid)) == NULL){
+		read_unlock(&tasklist_lock);
+		STORE_DELAY(/* in */ time, /* out */ mm_lookup);
+		return NULL;
+	}
+	//
+	// "get_task_mm()" includes "task_lock()" that "nests both inside and outside of
+	// read_lock(&tasklist_lock)" - as a note in "sched.h" states.
+	//
+	mm = get_task_mm(p);			// Can be "NULL" for a kernel thread
+	//
+	// On success, "mm->mm_users" got incremented to make sure that "mm_struct" does
+	// not go away.
+	//
+	read_unlock(&tasklist_lock);
+	STORE_DELAY(/* in */ time, /* out */ mm_lookup);
+	return mm;
+}
+
+
+#if defined(_NEED_STATISTICS_)
+
+
+/*
+ * Fetch and clear the statistics.
+ *
+ * Accessed in a non atomic way. Who cares? Just some statistics :-)
+ */
+STATIC INLINE int
+page_migrate_statistics(const caddr_t vaddress, const int flag)
+{
+	//
+	// Assuming all the CPU-s are clocked at the same frequency.
+	//
+	_statistics.t.cyc_per_usec = local_cpu_data->cyc_per_usec;
+	if (copy_to_user(vaddress, &_statistics, sizeof _statistics) != 0)
+		return -EFAULT;
+	if (flag)
+		memset(&_statistics, 0, sizeof _statistics);
+	return 0;
+}
+
+
+#endif	// #if defined(_NEED_STATISTICS_)
+
+
+#if defined(_TEST_)
+
+
+void
+dump_mm(const struct mm_struct * const mm)
+{
+	if (_pr_flag_ & PRINT_mm){
+		PRINT("mm: 0x%p\n", mm);
+		PRINT("mmap: 0x%p mm_rb.rb_node: 0x%p\n", mm->mmap, mm->mm_rb.rb_node);
+		PRINT("mmap_cache: 0x%p free_area_cache: 0x%lx\n", mm->mmap_cache,
+								mm->free_area_cache);
+		PRINT("pgd: 0x%p mm_users: %d mm_count: %d map_count: %d\n",
+					mm->pgd, atomic_read(&mm->mm_users),
+					atomic_read(&mm->mm_count), mm->map_count);
+		PRINT("mmap_sem.count: %d mmap_sem.wait_lock: %d\n", mm->mmap_sem.count,
+							mm->mmap_sem.wait_lock.lock);
+		PRINT("&mmap_sem.wait_list: 0x%p next: 0x%p prev: 0x%p\n",
+				&mm->mmap_sem.wait_list, mm->mmap_sem.wait_list.next,
+							mm->mmap_sem.wait_list.prev);
+		PRINT("page_table_lock: %u\n", mm->page_table_lock.lock);
+		PRINT("&mmlist: 0x%p next: 0x%p prev: 0x%p\n", &mm->mmlist,
+						mm->mmlist.next, mm->mmlist.prev);
+		PRINT("start_code: 0x%lx end_code: 0x%lx\n", mm->start_code,
+									mm->end_code);
+		PRINT("start_data: 0x%lx end_data: 0x%lx\n", mm->start_data,
+									mm->end_data);
+		PRINT("start_brk: 0x%lx brk: 0x%lx start_stack: 0x%lx\n", mm->start_brk,
+							mm->brk, mm->start_stack);
+		PRINT("arg_start: 0x%lx arg_end: 0x%lx\n", mm->arg_start, mm->arg_end);
+		PRINT("env_start: 0x%lx env_end: 0x%lx\n", mm->env_start, mm->env_end);
+		PRINT("rss: 0x%lx total_vm: 0x%lx locked_vm: 0x%lx\n", mm->rss,
+							mm->total_vm, mm->locked_vm);
+		PRINT("def_flags: 0x%lu cpu_vm_mask: 0x%lx\n", mm->def_flags,
+								mm->cpu_vm_mask);
+//	unsigned long saved_auxv[40];
+		PRINT("dumpable: %u ", mm->dumpable);
+#ifdef CONFIG_HUGETLB_PAGE
+		PRINT("used_hugetlb: 0x%d ", mm->used_hugetlb);
+#endif
+		PRINT("context: 0x%lu core_waiters: %d\n", mm->context,
+								mm->core_waiters);
+		PRINT("core_startup_done: 0x%p\n", mm->core_startup_done);
+		PRINT("core_done.done: %d ", mm->core_done.done);
+		PRINT("core_done.wait.lock: %u\n", mm->core_done.wait.lock.lock);
+		PRINT("&core_done.wait.task_list: 0x%p next: 0x%p prev: 0x%p\n",
+		&mm->core_done.wait.task_list, mm->core_done.wait.task_list.next,
+						mm->core_done.wait.task_list.prev);
+		PRINT("ioctx_list_lock.read_counter: %d ",
+						mm->ioctx_list_lock.read_counter);
+		PRINT("ioctx_list_lock.write_lock: %d\n",
+						mm->ioctx_list_lock.write_lock);
+		PRINT("ioctx_list: 0x%p &default_kioctx: 0x%p\n\n", mm->ioctx_list,
+								&mm->default_kioctx);
+	}
+}
+
+
+void
+dump_vma(const struct vm_area_struct * const vma)
+{
+	if (_pr_flag_ & PRINT_vma){
+		PRINT("vm_area: 0x%p\n", vma);
+		PRINT("mm: 0x%p\n", vma->vm_mm);
+		PRINT("start: 0x%lx end: 0x%lx\n", vma->vm_start, vma->vm_end);
+		PRINT("next: 0x%p &rb: 0x%p\n", vma->vm_next, &vma->vm_rb);
+		PRINT("prot: 0x%lx flags: 0x%lx\n", pgprot_val(vma->vm_page_prot),
+									vma->vm_flags);
+		PRINT("&shared: 0x%p next: 0x%p prev: 0x%p\n", &vma->shared,
+						vma->shared.next, vma->shared.prev);
+		PRINT("ops: 0x%p private: 0x%p\n", vma->vm_ops, vma->vm_private_data);
+		PRINT("file: 0x%p pgoff: 0x%lx\n\n", vma->vm_file, vma->vm_pgoff);
+	}
+}
+
+
+void
+dump_page(const char * const text, const struct page * const p)
+{
+	if (_pr_flag_ & PRINT_page){
+		PRINT("%s page struct: 0x%p\n", text, p);
+		PRINT("flags: 0x%lx count: 0x%x\n", p->flags, atomic_read(&p->count));
+		PRINT("&list: 0x%p next: 0x%p prev: 0x%p\n", &p->list,
+							p->list.next, p->list.prev);
+		PRINT("mapping: 0x%p index: 0x%lx\n", p->mapping, p->index);
+		PRINT("&lru: 0x%p next: 0x%p prev: 0x%p\n", &p->lru,
+							p->lru.next, p->lru.prev);
+		if (PageDirect(p))
+			PRINT("pte.direct: 0x%p", p->pte.direct);
+		else
+			PRINT("pte.chain: 0x%p", p->pte.chain);
+		PRINT(" private: 0x%lx\n", p->private);
+#if defined(WANT_PAGE_VIRTUAL)
+		PRINT("virtual: 0x%p\n", p->virtual);
+#endif
+		PRINT("\n");
+	}
+}
+
+
+void
+dump_pte_stuff(const pte_t * const pte_addr)
+{
+	if (_pr_flag_ & PRINT_pte){
+		// "struct page" of the page that hosts the PTE.
+		const struct page * const pte_page = kmap_atomic_to_page(pte_addr);
+
+		PRINT("pte_paddr: 0x%p pte: 0x%016lx\n", pte_addr, pte_val(*pte_addr));
+		dump_page("\npte", pte_page);
+	}
+}
+
+
+#define	_DATA_		0x6000000000000000UL	// User data segment
+
+
+/*
+ * Give me a valid physical address (if "vaddress == -1"),
+ * otherwise translate a user mode virtual address to a physical one.
+ */
+phaddr_t
+gimme_an_address(caddr_t vaddress)
+{
+	const struct vm_area_struct	*vma;
+	const pgd_t			*pgd;
+	const pmd_t			*pmd;
+	const pte_t			*pte;
+	phaddr_t			phaddress = -EFAULT;
+
+	if (vaddress == (caddr_t) -1)
+		vaddress = (caddr_t) _DATA_;
+	PRINT("Virtual addr:\t0x%016lx\n", (vaddr_t) vaddress);
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, (vaddr_t) vaddress);
+	if (vma == NULL || vma->vm_start > (vaddr_t) vaddress){
+		up_read(&current->mm->mmap_sem);
+		return -EFAULT;
+	}
+	spin_lock(&current->mm->page_table_lock);
+	do {
+		pgd = pgd_offset(current->mm, (vaddr_t) vaddress);
+		if (pgd_none(*pgd) || pgd_bad(*pgd))
+			break;
+		pmd = pmd_offset(pgd, (vaddr_t) vaddress);
+		if (pmd_none(*pmd) || pmd_bad(*pmd))
+			break;
+		pte = pte_offset_map(pmd, (vaddr_t) vaddress);
+		if (!pte_present(*pte)){
+			pte_unmap(pte);
+			break;
+		}
+		phaddress = pte_pfn(*pte) << PAGE_SHIFT;
+		pte_unmap(pte);
+	} while (0);
+	spin_unlock(&current->mm->page_table_lock);
+	up_read(&current->mm->mmap_sem);
+	PRINT("Physical addr:\t0x%016llx\n", (long long) phaddress);
+	return phaddress;
+}
+
+
+#endif // #if defined(_TEST_)
diff -Nru 2.6.4.ref/include/asm-generic/rmap.h 2.6.4.mig4/include/asm-generic/rmap.h
--- 2.6.4.ref/include/asm-generic/rmap.h	Tue Mar 16 10:18:15 2004
+++ 2.6.4.mig4/include/asm-generic/rmap.h	Thu Mar 25 08:59:42 2004
@@ -87,4 +87,45 @@
 }
 #endif
 
+/*
+ * Shared pages have a chain of pte_chain structures, used to locate
+ * all the mappings to this page. We only need a pointer to the pte
+ * here, the page struct for the page table page contains the process
+ * it belongs to and the offset within that process.
+ *
+ * We use an array of pte pointers in this structure to minimise cache misses
+ * while traversing reverse maps.
+ */
+#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
+
+/*
+ * next_and_idx encodes both the address of the next pte_chain and the
+ * offset of the highest-index used pte in ptes[].
+ */
+struct pte_chain {
+	unsigned long next_and_idx;
+	pte_addr_t ptes[NRPTE];
+} ____cacheline_aligned;
+
+static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
+{
+	return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
+}
+
+static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
+{
+	return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
+}
+
+static inline int pte_chain_idx(struct pte_chain *pte_chain)
+{
+	return pte_chain->next_and_idx & NRPTE;
+}
+
+static inline unsigned long
+pte_chain_encode(struct pte_chain *pte_chain, int idx)
+{
+	return (unsigned long)pte_chain | idx;
+}
+
 #endif /* _GENERIC_RMAP_H */
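
A side note on the "next_and_idx" encoding implemented by the pte_chain helpers
added to "asm-generic/rmap.h" above: since "struct pte_chain" is
"____cacheline_aligned", the low bits of its address are always zero, so the index
of the highest used slot fits there. Below is a minimal stand-alone sketch of the
trick, assuming 128-byte cache lines and 8-byte "pte_addr_t" (i.e. NRPTE == 15);
the DEMO_* names are illustrative only:

	#include <assert.h>
	#include <malloc.h>

	#define DEMO_NRPTE	15UL	/* NRPTE for 128-byte lines, 8-byte pte_addr_t */

	int main(void)
	{
		void		*chain = memalign(128, 128);	/* mimic ____cacheline_aligned */
		unsigned long	encoded, idx = 9;

		assert(chain != NULL);
		encoded = (unsigned long) chain | idx;			/* pte_chain_encode() */
		assert((void *) (encoded & ~DEMO_NRPTE) == chain);	/* pte_chain_ptr()    */
		assert((encoded & DEMO_NRPTE) == idx);			/* pte_chain_idx()    */
		free(chain);
		return 0;
	}
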
diff -Nru 2.6.4.ref/include/asm-ia64/page_migrate.h 2.6.4.mig4/include/asm-ia64/page_migrate.h
--- 2.6.4.ref/include/asm-ia64/page_migrate.h	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/include/asm-ia64/page_migrate.h	Thu Mar 25 08:59:42 2004
@@ -0,0 +1,245 @@
+#define	_TEST_
+#define	_NEED_STATISTICS_
+
+
+/*
+ * Migrate pages from a NUMA node to another.
+ * ==========================================
+ *
+ * Version 0.1, 23rd of March 2004
+ * By Zoltan Menyhart, Bull S.A. <Zoltan.Menyhart_AT_bull.net@no.spam.org>
+ * The usual GPL applies.
+ *
+ * (See "migrate.txt".)
+ *
+ * System call syntax:
+ *
+ *	long long sys_page_migrate(int command, caddr_t address, size_t length,
+ *								int node, pid_t pid);
+ *
+ * On error "-1" is returned and "errno" holds the error code.
+ *
+ * The following commands are available:
+ */
+enum {
+/*
+ * - Return a physical address.
+ *   (testing only, the kernel has to be compiled with "#define	_TEST_")
+ */
+	_GIMME_AN_ADDRESS_,
+/*
+ *   On entry, if "address" is a valid virtual address in the address space of the
+ *   current task with an existing backing page, then its physical address is returned;
+ *   if it is equal to "-1L", then the system finds a valid physical address on its own.
+ *   The other arguments are don't care.
+ *
+ * - Fetch and clear the statistics.
+ */
+	_STATISTICS_,
+/*
+ *   "address" is a pointer to the user's buffer. If "length != 0", the statistics
+ *   get cleared after having been fetched. The other arguments are don't care.
+ *
+ * - Obtain the size of the statistics structure (see "struct _statistics_size_"):
+ */
+	_SIZEOF_STATISTICS_,
+/*
+ *   The arguments are don't care.
+ *
+ * - Batch migrate pages from a NUMA node to another.
+ */
+	_PHADDR_BATCH_MIGRATE_,
+/*
+ *   "address" points at the user table containing the physical addresses of the
+ *   pages to be migrated. "length" is the number of the physical addresses in the
+ *   buffer. Max. "PAGE_SIZE / sizeof(phaddr_t)" of them can be migrated at once.
+ *   "node" is the destination NUMA node.
+ *   Addresses are assumed to belong to the process indicated by "pid".
+ *   The number of the pages actually migrated is returned
+ *   (see "struct _un_success_count_").
+ *
+ * - Migrate virtual address range of a process:
+ */
+	_VA_RANGE_MIGRATE_,
+/*
+ *   "address" is the starting virtual address in a process's address space.
+ *   "length" is the length of the address range to be migrated.
+ *   Addresses are assumed to belong to the process indicated by "pid".
+ *   The number of the pages actually migrated is returned
+ *   (see "struct _un_success_count_").
+ */
+};
+
+
+/*
+ * Type of a physical address -- hopefully enough for all architectures.
+ * (We allow negative values, too, for indicating some errors.)
+ */
+typedef	long long	phaddr_t;
+
+
+struct _un_success_count_ {
+	unsigned int	successful;		// Pages successfully migrated
+	unsigned int	failed;			// Minor failures
+};
+
+
+struct _statistics_size_ {
+	unsigned int	sizeof_statistics;	// sizeof(struct _statistics_)
+	unsigned int	max_nodes;		// MAX_NUMNODES
+};
+
+
+/*
+ * Statistics are accessed in a non atomic way. Who cares? Just some statistics :-)
+ */
+struct _statistics_ {
+	struct {					// Error counters
+		unsigned long	non_existent_addr;
+		unsigned long	page_gone_away;
+		unsigned long	busy;
+		unsigned long	bad_request;
+		unsigned long	no_memory;		// On the target node
+		unsigned long	page_type_not_supp;
+		unsigned long	errors;			// "PageError(page)" is set
+	} e;
+	struct {					// Clock ticks
+		unsigned long	total;
+		unsigned long	page_alloc;
+		unsigned long	page_free;
+		unsigned long	page_lock;
+		unsigned long	new_page_unlock;
+		unsigned long	page_unlock;
+		unsigned long	validation;
+		unsigned long	pgd_scan;
+		unsigned long	pgd_lock;
+		unsigned long	pgd_unlock;
+		unsigned long	mm_list_lock;
+		unsigned long	mmap_sem;
+		unsigned long	pte_chain_lock;
+		unsigned long	find_vma;
+		unsigned long	flush_tlb;
+		unsigned long	add_lru;
+		unsigned long	copy;
+		unsigned long	update_mmu_cache;
+		unsigned long	mm_lookup;
+		unsigned long	cyc_per_usec;
+		unsigned long	perfbullctl;
+		unsigned long	pci_cfg_rd;
+		unsigned long	pci_cfg_wr;
+	} t;
+	struct {					// Event counters
+		unsigned long	mm_hit;
+		unsigned long	pgd_scan;
+		unsigned long	perfbullctl;
+		unsigned long	pci_cfg_rd;
+		unsigned long	pci_cfg_wr;
+	} c;
+#if defined(__KERNEL__)
+	unsigned long	count[MAX_NUMNODES][MAX_NUMNODES];
+#else
+	unsigned long	count[0][0];
+#endif
+};
+
+
+#if !defined(__KERNEL__)
+
+
+#include <unistd.h>
+#include <sys/types.h>
+
+#if !defined(__NR_page_migrate)
+#define __NR_page_migrate	1276
+#endif
+
+
+/*
+ * Migrate some pages of the process of PID.
+ */
+static inline int
+migrate_ph_pages(const phaddr_t * const table, const size_t length, const int node,
+				struct _un_success_count_ * const p, const pid_t pid)
+{
+	union {
+		long long			ll;
+		struct _un_success_count_	s;
+	} u;
+
+	u.ll = syscall(__NR_page_migrate, _PHADDR_BATCH_MIGRATE_,
+							table, length, node, pid);
+	if (u.ll == -1)
+		return -1;
+	if (p != NULL){
+		p->successful = u.s.successful;
+		p->failed = u.s.failed;
+	}
+	return 0; 
+}
+
+
+/*
+ * Migrate virtual address range of the process of PID.
+ */
+static inline int
+migrate_virt_addr_range(const caddr_t address, const size_t length, const int node,
+				struct _un_success_count_ * const p, const pid_t pid)
+{
+	union {
+		long long			ll;
+		struct _un_success_count_	s;
+	} u;
+
+	u.ll = syscall(__NR_page_migrate, _VA_RANGE_MIGRATE_,
+							address, length, node, pid);
+	if (u.ll == -1)
+		return -1;
+	if (p != NULL){
+		p->successful = u.s.successful;
+		p->failed = u.s.failed;
+	}
+	return 0; 
+}
+
+
+/*
+ * Obtain the size of the statistics structure.
+ */
+static inline int
+get_stat_sizes(struct _statistics_size_ * const p)
+{
+	union {
+		long long			ll;
+		struct _statistics_size_	s;
+	} u;
+
+	u.ll = syscall(__NR_page_migrate, _SIZEOF_STATISTICS_, 0, 0, 0, 0);
+	if (u.ll == -1)
+		return -1;
+	if (p != NULL)
+		*p = u.s;
+	return 0; 
+}
+
+
+/*
+ * Fetch and clear the statistics.
+ */
+static inline int
+get_staistics(struct _statistics_ * const p, const long slear_flag)
+{
+	return syscall(__NR_page_migrate, _STATISTICS_, p, slear_flag, 0, 0);
+}
+
+
+/*
+ * Return a physical address.
+ */
+static inline phaddr_t
+gimme_a_ph_address(const caddr_t p)
+{
+	return syscall(__NR_page_migrate, _GIMME_AN_ADDRESS_, p, 0, 0, 0);
+}
+
+
+#endif
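
A minimal user-space sketch of how the inline wrappers above might be used - here
migrating the caller's own heap pages to node 1. The buffer size and the
destination node are arbitrary, and the header is assumed to be reachable as
"page_migrate.h":

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include "page_migrate.h"

	int main(void)
	{
		struct _un_success_count_	counts;
		const size_t			len = 16 * 1024 * 1024;
		char				*buf = malloc(len);

		if (buf == NULL)
			return 1;
		memset(buf, 0, len);		/* Touch the pages so they get backing frames */
		if (migrate_virt_addr_range((caddr_t) buf, len, /* node */ 1,
						&counts, /* pid: myself */ 0) != 0){
			perror("migrate_virt_addr_range");
			return 1;
		}
		printf("migrated: %u  failed: %u\n", counts.successful, counts.failed);
		return 0;
	}
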
diff -Nru 2.6.4.ref/include/asm-ia64/pgtable.h 2.6.4.mig4/include/asm-ia64/pgtable.h
--- 2.6.4.ref/include/asm-ia64/pgtable.h	Tue Mar 16 10:18:15 2004
+++ 2.6.4.mig4/include/asm-ia64/pgtable.h	Thu Mar 25 08:59:42 2004
@@ -112,6 +112,28 @@
 #define PTRS_PER_PTE	(__IA64_UL(1) << (PAGE_SHIFT-3))
 
 /*
+ * The IA64 architecture does not decode all the MSB-s of virtual addresses for PGD, PMD
+ * and PTE indices, i.e. IA64 has got holes or aliases in the virtual address space.
+ * These def's are provided to check whether an "address" -- "length" pair spans
+ * virtual address holes or creates an illegal alias to an otherwise valid address.
+ * (User mode only.)
+ */
+#define	__VA_BITS_PER_REGION	(PAGE_SHIFT - 3 - 3 +	/* PGD low index */		\
+				2 * (PAGE_SHIFT - 3) +	/* PMD and PTE indices */	\
+				PAGE_SHIFT)		/* The page itself */
+#define	__VA_ALIAS_MASK		((1UL << __VA_BITS_PER_REGION) - 1)
+#define	__IS_VA_ALIAS(address, length)							\
+				((~__VA_ALIAS_MASK & (address)) !=			\
+					(~__VA_ALIAS_MASK & ((address) + (length) - 1)))
+
+/*
+ * Virtual address composed by use of PGD, PMD and PTE indices:
+ */
+#define	__VA(pgdi, pmdi, ptei)	(((pgdi) >> (PAGE_SHIFT - 6)) << 61 |			\
+				((pgdi) & ((PTRS_PER_PGD >> 3) - 1)) << PGDIR_SHIFT |	\
+				(pmdi) << PMD_SHIFT | (ptei) << PAGE_SHIFT)
+
+/*
  * All the normal masks have the "page accessed" bits on, as any time
  * they are used, the page is accessed. They are cleared only by the
  * page-out routines.
@@ -325,8 +347,10 @@
 	(init_mm.pgd + (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)))
 
 /* Find an entry in the second-level page table.. */
-#define pmd_offset(dir,addr) \
-	((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+#define pmd_index(addr) \
+	(((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pmd_offset(dir, addr) \
+	((pmd_t *) pgd_page(*(dir)) + pmd_index(addr))
 
 /*
  * Find an entry in the third-level page table.  This looks more complicated than it
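
A stand-alone illustration of the __IS_VA_ALIAS() check introduced above, assuming
the default 16 KB pages (PAGE_SHIFT = 14, so __VA_BITS_PER_REGION = (14-3-3) +
2*(14-3) + 14 = 44) and a 64-bit host: only a 2^44-byte window of each 2^61-byte
region is covered by the three-level page table, so a range whose first and last
bytes fall into different windows has to be rejected. The DEMO_* names are
illustrative only:

	#include <assert.h>

	#define DEMO_VA_BITS		44
	#define DEMO_ALIAS_MASK		((1UL << DEMO_VA_BITS) - 1)
	#define DEMO_IS_VA_ALIAS(a, l)	((~DEMO_ALIAS_MASK & (a)) !=			\
						(~DEMO_ALIAS_MASK & ((a) + (l) - 1)))

	int main(void)
	{
		/* Stays inside the implemented window of region 6: accepted. */
		assert(!DEMO_IS_VA_ALIAS(0x6000000000000000UL, 1UL << 30));
		/* Crosses the 2^44-byte window boundary at 0x6000100000000000: rejected. */
		assert(DEMO_IS_VA_ALIAS(0x60000FFFFFFFC000UL, 0x8000UL));
		return 0;
	}
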
diff -Nru 2.6.4.ref/include/asm-ia64/rmap-locking.h 2.6.4.mig4/include/asm-ia64/rmap-locking.h
--- 2.6.4.ref/include/asm-ia64/rmap-locking.h	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/include/asm-ia64/rmap-locking.h	Thu Mar 25 08:59:42 2004
@@ -0,0 +1,25 @@
+/*
+ * include/asm-ia64/rmap-locking.h
+ *
+ * Locking primitives for exclusive access to a page's reverse-mapping
+ * pte chain.
+ */
+
+#include <linux/slab.h>
+
+struct pte_chain;
+extern kmem_cache_t *pte_chain_cache;
+
+#define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_trylock(page)	bit_spin_trylock(PG_chainlock, &page->flags)
+#define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, &page->flags)
+
+struct pte_chain *pte_chain_alloc(int gfp_flags);
+void __pte_chain_free(struct pte_chain *pte_chain);
+
+static inline void pte_chain_free(struct pte_chain *pte_chain)
+{
+	if (pte_chain)
+		__pte_chain_free(pte_chain);
+}
+
diff -Nru 2.6.4.ref/include/linux/rmap-locking.h 2.6.4.mig4/include/linux/rmap-locking.h
--- 2.6.4.ref/include/linux/rmap-locking.h	Tue Mar 16 10:18:15 2004
+++ 2.6.4.mig4/include/linux/rmap-locking.h	Thu Mar 25 08:59:42 2004
@@ -11,6 +11,7 @@
 extern kmem_cache_t *pte_chain_cache;
 
 #define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_trylock(page)	bit_spin_trylock(PG_chainlock, &page->flags)
 #define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, &page->flags)
 
 struct pte_chain *pte_chain_alloc(int gfp_flags);
diff -Nru 2.6.4.ref/mm/rmap.c 2.6.4.mig4/mm/rmap.c
--- 2.6.4.ref/mm/rmap.c	Tue Mar 16 10:18:17 2004
+++ 2.6.4.mig4/mm/rmap.c	Thu Mar 25 09:00:13 2004
@@ -46,40 +46,9 @@
  * We use an array of pte pointers in this structure to minimise cache misses
  * while traversing reverse maps.
  */
-#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
-
-/*
- * next_and_idx encodes both the address of the next pte_chain and the
- * offset of the highest-index used pte in ptes[].
- */
-struct pte_chain {
-	unsigned long next_and_idx;
-	pte_addr_t ptes[NRPTE];
-} ____cacheline_aligned;
 
 kmem_cache_t	*pte_chain_cache;
 
-static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
-{
-	return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
-}
-
-static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
-{
-	return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
-}
-
-static inline int pte_chain_idx(struct pte_chain *pte_chain)
-{
-	return pte_chain->next_and_idx & NRPTE;
-}
-
-static inline unsigned long
-pte_chain_encode(struct pte_chain *pte_chain, int idx)
-{
-	return (unsigned long)pte_chain | idx;
-}
-
 /*
  * pte_chain list management policy:
  *
diff -Nru 2.6.4.ref/test/migstat.c 2.6.4.mig4/test/migstat.c
--- 2.6.4.ref/test/migstat.c	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/test/migstat.c	Thu Mar 25 09:02:00 2004
@@ -0,0 +1,130 @@
+/*
+ * Display and reset page migration statistics.
+ *
+ * Usage: migstat [-c]
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include "page_migrate.h"
+
+#define	CONV(x)		x, (x * mult + div / 2) / div, (x * mult + div / 2) / div / 1000
+
+extern int		errno;
+
+struct _statistics_		*sp;
+struct _statistics_size_	ss;
+
+int main(const int argc, const char * const argv[])
+{
+	int		from, to;
+	unsigned long	*p;
+	unsigned long	ok = 0;
+	unsigned long	mult = 1, div = 1;
+	unsigned long	time;
+	int		clear_flag = 0;
+
+	if (argc == 2 && strcmp(argv[1], "-c") == 0)
+		clear_flag = 1;
+	else if (argc != 1){
+		fprintf(stderr, "Usage: %s [-c]\n", argv[0]);
+		return 1;
+	}
+	if (get_stat_sizes(&ss) < 0){
+		perror("get_stat_sizes()");
+		return 1;
+	}
+	if ((sp = malloc(ss.sizeof_statistics)) == NULL){
+		fprintf(stderr, "malloc(%d) failed\n", ss.sizeof_statistics);
+		return 1;
+	}
+	if (get_staistics(sp, clear_flag) < 0){
+		perror("get_staistics()");
+		return 1;
+	}
+	printf("\nError counters:\n");
+	if (sp->e.non_existent_addr != 0)
+		printf("non_existent_addr:  %ld\n", sp->e.non_existent_addr);
+	if (sp->e.page_gone_away != 0)
+		printf("page_gone_away:     %ld\n", sp->e.page_gone_away);
+	if (sp->e.busy != 0)
+		printf("busy:               %ld\n", sp->e.busy);
+	if (sp->e.bad_request != 0)
+		printf("bad_request:        %ld\n", sp->e.bad_request);
+	if (sp->e.no_memory != 0)
+		printf("no_memory:          %ld\n", sp->e.no_memory);
+	if (sp->e.page_type_not_supp != 0)
+		printf("page_type_not_supp: %ld\n", sp->e.page_type_not_supp);
+	if (sp->e.errors != 0)
+		printf("page errors:        %ld\n", sp->e.errors);
+	printf("Total:              %ld\n", sp->e.non_existent_addr +
+			sp->e.page_gone_away + sp->e.busy + sp->e.bad_request +
+			sp->e.no_memory + sp->e.page_type_not_supp + sp->e.errors);
+
+	printf("\n\tMigrated to:\n");
+	printf("From:\t");
+	for (to = 0; to < ss.max_nodes; to++)
+		printf("%d:%c", to, to < ss.max_nodes - 1 ? '\t' : '\n');
+	p = &sp->count[0][0];
+	for (from = 0; from < ss.max_nodes; from++){
+		printf("%d:\t", from);
+		for (to = 0; to < ss.max_nodes; p++, to++){
+			ok += *p;
+			if (from == to && *p == 0)
+				printf("-");
+			else
+				printf("%lu", *p);
+			printf("%c", to < ss.max_nodes - 1 ? '\t' : '\n');
+		}
+	}
+	printf("Total: %ld\n\n", ok);
+
+	div = sp->t.cyc_per_usec;
+	printf("                 Clock ticks:   Microsec:  Millisec:\n");
+	printf("total:          %12ld  %10ld   %8ld\n", CONV(sp->t.total));
+	printf("page_alloc:     %12ld  %10ld   %8ld\n", CONV(sp->t.page_alloc));
+	printf("page_free:      %12ld  %10ld   %8ld\n", CONV(sp->t.page_free));
+	printf("page_lock:      %12ld  %10ld   %8ld\n", CONV(sp->t.page_lock));
+	printf("page_unlock:    %12ld  %10ld   %8ld\n", CONV(sp->t.page_unlock));
+	printf("new_pg_unlock:  %12ld  %10ld   %8ld\n", CONV(sp->t.new_page_unlock));
+	printf("validation:     %12ld  %10ld   %8ld\n", CONV(sp->t.validation));
+	printf("pgd_scan:       %12ld  %10ld   %8ld\n", CONV(sp->t.pgd_scan));
+	printf("pgd_lock:       %12ld  %10ld   %8ld\n", CONV(sp->t.pgd_lock));
+	printf("pgd_unlock:     %12ld  %10ld   %8ld\n", CONV(sp->t.pgd_unlock));
+	printf("mm_list_lock:   %12ld  %10ld   %8ld\n", CONV(sp->t.mm_list_lock));
+	printf("mmap_sem:       %12ld  %10ld   %8ld\n", CONV(sp->t.mmap_sem));
+	printf("pte_chain_lock: %12ld  %10ld   %8ld\n", CONV(sp->t.pte_chain_lock));
+	printf("find_vma:       %12ld  %10ld   %8ld\n", CONV(sp->t.find_vma));
+	printf("flush_tlb:      %12ld  %10ld   %8ld\n", CONV(sp->t.flush_tlb));
+	printf("add_lru:        %12ld  %10ld   %8ld\n", CONV(sp->t.add_lru));
+	printf("copy:           %12ld  %10ld   %8ld\n", CONV(sp->t.copy));
+	printf("upd_mmu_cache:  %12ld  %10ld   %8ld\n", CONV(sp->t.update_mmu_cache));
+	time = sp->t.total - sp->t.page_alloc - sp->t.page_free -
+		sp->t.page_lock - sp->t.page_unlock - sp->t.new_page_unlock -
+		sp->t.validation - // sp->t.pgd_unlock - sp->t.mmap_sem -
+		// sp->t.pgd_scan - sp->t.pgd_lock - sp->t.mmlist_lock -
+		sp->t.pte_chain_lock - sp->t.find_vma - sp->t.flush_tlb -
+		sp->t.add_lru - sp->t.copy - sp->t.update_mmu_cache;
+	printf("Where is        %12ld  %10ld   %8ld ?\n", CONV(time));
+
+	printf("cyc_per_usec:    %11ld\n", sp->t.cyc_per_usec);
+
+	if (sp->c.pgd_scan != 0){
+		printf("\npgd_scan:\t\t%11ld\n", sp->c.pgd_scan);
+		printf("mm_hit:\t\t\t%11ld\nmiss:\t\t\t%11ld\n", sp->c.mm_hit,
+					sp->e.non_existent_addr + ok - sp->c.mm_hit);
+	}
+
+	if (sp->c.perfbullctl != 0){
+		printf("\npci_cfg_rd:\t%11ld\t   %10ld\n", CONV(sp->t.pci_cfg_rd));
+		printf("pci_cfg_rd count:\t%11ld\n", sp->c.pci_cfg_rd);
+		printf("pci_cfg_wr:\t%11ld\t   %10ld\n", CONV(sp->t.pci_cfg_wr));
+		printf("pci_cfg_wr count:\t%11ld\n", sp->c.pci_cfg_wr);
+		printf("perfbullctl:\t%11ld\t   %10ld\n", CONV(sp->t.perfbullctl));
+		printf("perfbullctl count:\t%11ld\n", sp->c.perfbullctl);
+	}
+	return 0;
+}
+
diff -Nru 2.6.4.ref/test/ph.c 2.6.4.mig4/test/ph.c
--- 2.6.4.ref/test/ph.c	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/test/ph.c	Thu Mar 25 09:02:00 2004
@@ -0,0 +1,94 @@
+/*
+ * Demo: migrate some of our pages identified by their physical addresses.
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "page_migrate.h"
+
+#if !defined(PAGE_SIZE)
+#define	PAGE_SIZE	(16 * 1024)
+#endif
+
+#define	MMAPSIZE		(1024 * 1024 * 256)
+
+phaddr_t			address;
+extern int			errno;
+phaddr_t			table[PAGE_SIZE / sizeof(phaddr_t)];
+struct _un_success_count_	u_s;
+
+
+size_t
+fill(volatile void *p)
+{
+	size_t		count = 123;
+	size_t		i;
+
+	for (i = 0; i < count; i++, p += PAGE_SIZE){
+		* (unsigned long *) p = 0xdeadbeefL;
+		if ((address = gimme_a_ph_address((void *) p)) < 0)
+			break;
+		table[i] = address;
+	}
+	printf("# addresses: %d\n", i);
+	return i;
+}
+
+
+mig(volatile void *p, int node)
+{
+	int		rc;
+	size_t		count;
+
+	count = fill(p);
+	rc = migrate_ph_pages(table, count, node, &u_s, 0);
+	printf("\nmig(..., %d): rc = %ld errno = %d *p: 0x%lx\n", node, rc, errno,
+								* (unsigned long *) p);
+	printf("successful: %d failed: %d\n", u_s.successful, u_s.failed);
+	if (rc < 0){
+		perror("migrate_virt_addr_range()");
+		exit(-1);
+	}
+	address = gimme_a_ph_address((void *) p);
+	printf("\nmig(..., %d): ph address = 0x%016llx\n", node, address);
+	if (address < 0){
+		perror("gimme_a_ph_address()");
+		exit(-1);
+	}
+}
+
+
+main()
+{
+	volatile void	*p;
+
+	p = mmap(NULL, MMAPSIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (p == MAP_FAILED){
+		perror("\nmmap()");
+		return 1;
+	}
+	/*
+	 * No backing page => should fail.
+	 */
+	printf("\nmain(): ph address = 0x%llx\n", address);
+	* (unsigned long *) p = 0xdeadbeef03L;
+	/*
+	 * Now there should be a backing page.
+	 */
+	address = gimme_a_ph_address((void *) p);
+	printf("\nmain(): ph address = 0x%016llx\n", address);
+	if (address < 0){
+		perror("gimme_a_ph_address()");
+		return 1;
+	}
+	mig(p, 0);
+	mig(p, 1);
+	mig(p, 2);
+	mig(p, 3);
+	return 0;
+}
+
diff -Nru 2.6.4.ref/test/v.c 2.6.4.mig4/test/v.c
--- 2.6.4.ref/test/v.c	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/test/v.c	Thu Mar 25 09:02:00 2004
@@ -0,0 +1,78 @@
+/*
+ * Demo: migrate some of its own virtual address range.
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "page_migrate.h"
+
+#define	MMAPSIZE	(1024 * 1024 * 256)
+
+phaddr_t	address;
+extern int	errno;
+
+struct _un_success_count_ u_s;
+
+
+mig(volatile void *p, int node)
+{
+	int	rc;
+
+	rc = migrate_virt_addr_range((caddr_t) p, MMAPSIZE, node, &u_s, 0);
+	printf("\nmig(..., %d): rc = %ld errno = %d *p: 0x%lx\n", node, rc, errno,
+								* (unsigned long *) p);
+	printf("successful: %d failed: %d\n", u_s.successful, u_s.failed);
+	if (rc < 0){
+		perror("migrate_virt_addr_range()");
+		exit(-1);
+	}
+	address = gimme_a_ph_address((void *) p);
+	printf("\nmig(..., %d): ph address = 0x%016llx\n", node, address);
+	if (address < 0){
+		perror("gimme_a_ph_address()");
+		exit(-1);
+	}
+}
+
+
+main()
+{
+	volatile void	*p0, *p;
+
+	p0 = p = mmap(NULL, MMAPSIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (p == MAP_FAILED){
+		perror("\nmmap()");
+		return 1;
+	}
+	/*
+	 * Make sure 2 pages exist.
+	 */
+	* (unsigned long *) p = 0xdeadbeef01L;
+	p += 1024 * 16;
+	* (unsigned long *) p = 0xdeadbeef02L;
+	address = gimme_a_ph_address((void *) p);
+	/*
+	 * No backing page => should fail.
+	 */
+	p += 1024 * 64;
+	printf("\nmain(): ph address = 0x%llx\n", address);
+	* (unsigned long *) p = 0xdeadbeef03L;
+	/*
+	 * Now there should be a backing page.
+	 */
+	address = gimme_a_ph_address((void *) p);
+	printf("\nmain(): ph address = 0x%016llx\n", address);
+	if (address < 0){
+		perror("gimme_a_ph_address()");
+		return 1;
+	}
+	mig(p0, 0);
+	mig(p0, 1);
+	mig(p0, 2);
+	mig(p0, 3);
+	return 0;
+}
diff -Nru 2.6.4.ref/test/victim.c 2.6.4.mig4/test/victim.c
--- 2.6.4.ref/test/victim.c	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/test/victim.c	Thu Mar 25 09:02:00 2004
@@ -0,0 +1,36 @@
+/*
+ * Victim process for "vmig".
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#define	MMAPSIZE	(1024 * 1024 * 1024L)
+#define	N		MMAPSIZE / sizeof(long)
+
+
+main()
+{
+	int		i;
+	volatile long	*p0, *p;
+	long		sum0, sum;
+
+	printf("victim: pid = %d\n", getpid());
+	p0 = p = mmap(NULL, MMAPSIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (p == MAP_FAILED){
+		perror("\nmmap()");
+		return 1;
+	}
+	printf("address: %p, size: 0x%lx\n", p, MMAPSIZE);
+	for (i = 0, sum0 = 0; i < N; i++)
+		sum0 += *p++ = random();
+	do {
+		for (i = 0, sum = 0, p = p0; i < N; i++)
+			sum += *p++;
+		printf("\nvictim: pid = %d, sum: %ld\n", getpid(), sum);
+	} while (sum0 == sum);
+}
diff -Nru 2.6.4.ref/test/vmig.c 2.6.4.mig4/test/vmig.c
--- 2.6.4.ref/test/vmig.c	Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/test/vmig.c	Thu Mar 25 09:02:00 2004
@@ -0,0 +1,36 @@
+/*
+ * Migrate the victim process by hand.
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "page_migrate.h"
+
+// Who cares for the SH library ?
+#define	SH_ADDRESS	(2UL << 60)
+#define	SH_SIZE		(16UL * 1024 * 1024 * 1024 * 1024)
+
+struct _un_success_count_ u_s;
+
+main(const int argc, const char * const argv[])
+{
+	int	node;
+	pid_t	pid;
+	int	rc;
+
+	if (argc != 3){
+		fprintf(stderr, "usage: vmig <pid> <node>\n");
+		return 1;
+	}
+	pid = atoi(argv[1]);
+	node = atoi(argv[2]);
+	rc = migrate_virt_addr_range((caddr_t) SH_ADDRESS, SH_SIZE, node, &u_s, pid);
+	if (rc < 0)
+		perror("migrate_virt_addr_range()");
+	else
+		printf("successful: %d failed: %d\n", u_s.successful, u_s.failed);
+	return 0;
+}


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-26  9:18 Migrate pages from a ccNUMA node to another - patch Zoltan Menyhart
@ 2004-03-26 17:20 ` Dave Hansen
  2004-03-30  8:27   ` IWAMOTO Toshihiro
  2004-03-30 11:39   ` Zoltan Menyhart
  0 siblings, 2 replies; 17+ messages in thread
From: Dave Hansen @ 2004-03-26 17:20 UTC (permalink / raw)
  To: Zoltan Menyhart; +Cc: linux-ia64, Linux Kernel Mailing List

Have you considered any common ground your patch might share with the
people doing memory hotplug?

	http://people.valinux.co.jp/~iwamoto/mh.html

They have a similar problem to your migration that occurs when a user
wants to remove a whole or partial NUMA node.  
lhms-devel@lists.sourceforge.net

Is your code something that you'd like to see go into the mainline 2.6
or 2.7 kernel?  

Also, please don't spam-encode your address when sending to the list. 
It just makes it harder for people to send feedback.  

-- Dave


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-26 17:20 ` Dave Hansen
@ 2004-03-30  8:27   ` IWAMOTO Toshihiro
  2004-03-30  9:05     ` Hirokazu Takahashi
  2004-03-30 11:39   ` Zoltan Menyhart
  1 sibling, 1 reply; 17+ messages in thread
From: IWAMOTO Toshihiro @ 2004-03-30  8:27 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Zoltan Menyhart, linux-ia64, Linux Kernel Mailing List

Hi Zoltan,

At Fri, 26 Mar 2004 09:20:46 -0800,
Dave Hansen wrote:
> 
> Have you considered any common ground your patch might share with the
> people doing memory hotplug?
> 
> 	http://people.valinux.co.jp/~iwamoto/mh.html
> 
> They have a similar problem to your migration that occurs when a user
> wants to remove a whole or partial NUMA node.  
> lhms-devel@lists.sourceforge.net

Processes must be migrated to other nodes when a node is being
removed.  Conversely, processes may be migrated from other nodes when
a node is added.  I'm not familiar with NUMA things, and I think our
team doesn't have a particular solution.  If you have some idea,
that's great.

BTW, it seems page migration can use my remap_onepage function.  Our
code can move most kinds of pages including hugetlbfs pages and page
caches.

--
IWAMOTO Toshihiro

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30  8:27   ` IWAMOTO Toshihiro
@ 2004-03-30  9:05     ` Hirokazu Takahashi
  2004-03-30 11:20       ` Zoltan Menyhart
  0 siblings, 1 reply; 17+ messages in thread
From: Hirokazu Takahashi @ 2004-03-30  9:05 UTC (permalink / raw)
  To: Zoltan.Menyhart; +Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hello,

> > Have you considered any common ground your patch might share with the
> > people doing memory hotplug?
> > 
> > 	http://people.valinux.co.jp/~iwamoto/mh.html
> > 
> > They have a similar problem to your migration that occurs when a user
> > wants to remove a whole or partial NUMA node.  
> > lhms-devel@lists.sourceforge.net
> 
> Processes must be migrated to other nodes when a node is being
> removed.  Conversely, processes may be migrated from other nodes when
> a node is added.  I'm not familiar with NUMA things, and I think our
> team doesn't have a particular solution.  If you have some idea,
> that's great.
> 
> BTW, it seems page migration can use my remap_onepage function.  Our
> code can move most kinds of pages including hugetlbfs pages and page
> caches.

I believe his patch will interest you since most of the code is
independent of cpu architecture and it also covers mmaped files,
shmem, ramdisk, mlocked pages and so on.

We will post a new version of the memory hotplug patches in a week.

Thank you,
Hirokazu Takahashi.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30  9:05     ` Hirokazu Takahashi
@ 2004-03-30 11:20       ` Zoltan Menyhart
  2004-03-30 12:08         ` Hirokazu Takahashi
  0 siblings, 1 reply; 17+ messages in thread
From: Zoltan Menyhart @ 2004-03-30 11:20 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hirokazu Takahashi wrote:
> 
> Hello,
> 
> > > Have you considered any common ground your patch might share with the
> > > people doing memory hotplug?
> > >
> > >     http://people.valinux.co.jp/~iwamoto/mh.html
> > >
> > > They have a similar problem to your migration that occurs when a user
> > > wants to remove a whole or partial NUMA node.
> > > lhms-devel@lists.sourceforge.net
> >
> > Processes must be migrated to other nodes when a node is being
> > removed.  Conversely, processes may be migrated from other nodes when
> > a node is added.  I'm not familiar with NUMA things, and I think our
> > team doesn't have a particular solution.  If you have some idea,
> > that's great.
> >
> > BTW, it seems page migration can use my remap_onepage function.  Our
> > code can move most kinds of pages including hugetlbfs pages and page
> > caches.
> 
> I believe his patch will interest you since most of the code is
> independent of cpu architecture and it also covers mmaped files,
> shmem, ramdisk, mlocked pages and so on.
> 
> We will post new version of the memory hotplug patches in a week.
> 
> Thank you,
> Hirokazu Takahashi.

I am afraid the "remap_onepage()" function + the modifications necessary
at some other places are too much for me :-)

You do a couple of retries and waits. I cannot afford spending that much
overhead for what is, for me, a performance optimization.

I can understand that if you want to remove a node / memory module, then you
have to succeed by all means and you have to handle all kinds of pages;
performance is not at a premium there.

Regards,

Zoltán Menyhárt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-26 17:20 ` Dave Hansen
  2004-03-30  8:27   ` IWAMOTO Toshihiro
@ 2004-03-30 11:39   ` Zoltan Menyhart
  2004-03-30 15:18     ` Dave Hansen
  2004-03-30 15:58     ` Dave Hansen
  1 sibling, 2 replies; 17+ messages in thread
From: Zoltan Menyhart @ 2004-03-30 11:39 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-ia64, Linux Kernel Mailing List

Dave Hansen wrote:
> 
> Have you considered any common ground your patch might share with the
> people doing memory hotplug?
> 
>         http://people.valinux.co.jp/~iwamoto/mh.html
> 
> They have a similar problem to your migration that occurs when a user
> wants to remove a whole or partial NUMA node.
> lhms-devel@lists.sourceforge.net

Comparing my stuff to their work, I just do some small performance enhancements:

- I do not modify a single line on the existing VM paths - if my stuff has no
  improvement for you, then you will not be obliged to pay any overhead
- I do not insist on :-)) ... that would block the execution of the application
  while the resources are not available
- I handle only the simplest case: private anonymous pages (...a single PTE...)

- IWAMOTO Toshihiro provides a complete "fool proof" solution with an obligation
  to succeed in the migration

> Is your code something that you'd like to see go into the mainline 2.6
> or 2.7 kernel?

Since someone is asking...

Thanks,

Zoltán

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30 11:20       ` Zoltan Menyhart
@ 2004-03-30 12:08         ` Hirokazu Takahashi
  2004-03-30 14:32           ` Zoltan Menyhart
  0 siblings, 1 reply; 17+ messages in thread
From: Hirokazu Takahashi @ 2004-03-30 12:08 UTC (permalink / raw)
  To: Zoltan.Menyhart, Zoltan.Menyhart_AT_bull.net
  Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hello,

Zoltán Menyhárt wrote:

> > > > Have you considered any common ground your patch might share with the
> > > > people doing memory hotplug?
> > > >
> > > >     http://people.valinux.co.jp/~iwamoto/mh.html
> > > >
> > > > They have a similar problem to your migration that occurs when a user
> > > > wants to remove a whole or partial NUMA node.
> > > > lhms-devel@lists.sourceforge.net
> > >
> > > Processes must be migrated to other nodes when a node is being
> > > removed.  Conversely, processes may be migrated from other nodes when
> > > a node is added.  I'm not familiar with NUMA things, and I think our
> > > team doesn't have a particular solution.  If you have some idea,
> > > that's great.
> > >
> > > BTW, it seems page migration can use my remap_onepage function.  Our
> > > code can move most kinds of pages including hugetlbfs pages and page
> > > caches.
> > 
> > I believe his patch will interest you since most of the code is
> > independent of cpu architecture and it also covers mmaped files,
> > shmem, ramdisk, mlocked pages and so on.
> > 
> > We will post new version of the memory hotplug patches in a week.
> > 
> > Thank you,
> > Hirokazu Takahashi.
> 
> I am afraid the "remap_onepage()" function + the modifications necessary
> at some other places are too much for me :-)
> 
> You do a couple of retries, waits. I cannot afford spending so much as
> overhead due to some performance optimization.

I understand what you want to do. Page migration is meaningless if the
cost of it is high.

> I can understand that if you want to remove a node / memory module, then you
> have to succeed by all means, you have to handle all kinds of pages,
> the performance is not at a premium.
> 
> Regards,
> 
> Zoltán Menyhárt

It's not hard to add "no-retry-mode" to "remap_onepage()" function
if you want. It may skip to migrate some pages if they are accessed
heavily. In paticular if you only want to care about anonymous pages,
they will be handled very well.

Thank you,
Hirokazu Takahashi.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30 12:08         ` Hirokazu Takahashi
@ 2004-03-30 14:32           ` Zoltan Menyhart
  2004-04-03  2:58             ` Hirokazu Takahashi
  0 siblings, 1 reply; 17+ messages in thread
From: Zoltan Menyhart @ 2004-03-30 14:32 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hirokazu Takahashi wrote:

[...]
> 
> It's not hard to add "no-retry-mode" to "remap_onepage()" function
> if you want. It may skip to migrate some pages if they are accessed
> heavily. In paticular if you only want to care about anonymous pages,
> they will be handled very well.

Well, why not give it a try ?
Yet your code is not really easy to read. :-)
I do not dare to adapt it on my own, I am afraid of breaking something.
Could you please provide me with a modified version of your "remap_onepage()" ?
Can we move to 2.6.4 ?

In addition to "no-retry-mode", I need to specify where the new page
should be allocated from.

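Something along these lines is what I have in mind for the allocation
(a sketch only, assuming the stock NUMA allocator interface is O.K. for you):

	/* "node" is the destination node asked for by the caller */
	new_page = alloc_pages_node(node, GFP_HIGHUSER, 0);
	if (new_page == NULL)
		return -ENOMEM;
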
Here is my interface I need to implement with "remap_onepage()":

/*
 * Common part of checking & migrating the pages one by one.
 *
 * Arguments:	src_node:	Source NUMA node
 *		old_p:		-> old page structure
 *		node:		Destination NUMA node
 *		mm:		-> victim "mm_struct"
 *		pte:		-> PTE of the page to be moved
 *
 * Returns:	1:		Migration O. K.
 *		0:		Minor error, no actual migration has been done
 *		-Exxx:		Catastrophic error
 *
 * Notes:	- "mm->page_table_lock" and "mm->mmap_sem" have to be held.
 *		- The old page is "get_page()"-ed on entry to make sure it does not go
 *		  away in the mean time - on return it gets "put_page()"-ed.
 */
int
common_check_migrate_1_page(const int src_node, struct page * const old_p,
			const int node, struct mm_struct * const mm, pte_t * const pte)

Notes: "pte" can be NULL if I do not know it apriori
       I cannot release "mm->page_table_lock" otherwise I have to re-scan the "mm->pgd".

Thanks,

Zoltán Menyhárt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30 11:39   ` Zoltan Menyhart
@ 2004-03-30 15:18     ` Dave Hansen
  2004-03-30 15:58     ` Dave Hansen
  1 sibling, 0 replies; 17+ messages in thread
From: Dave Hansen @ 2004-03-30 15:18 UTC (permalink / raw)
  To: Zoltan.Menyhart; +Cc: linux-ia64, Linux Kernel Mailing List

On Tue, 2004-03-30 at 03:39, Zoltan Menyhart wrote:
> Dave Hansen wrote:
> > 
> > Have you considered any common ground your patch might share with the
> > people doing memory hotplug?
> 
> Comparing my stuff to their work, I just do some small performance enhancements:
> 
> - I do not modify a single line on the existing VM paths - if my stuff has no
>   improvement for you, then yo will not be obliged to pay any overhead
...
> - I handle only the simplest case: private anonymous pages (...a singe PTE...)

By not modifying a single line in the existing VM path, your patch
simply duplicates functionality from that existing code, which I'm not
sure is any better.  

I think there's a lot of commonality with what the swap code, NUMA page
migration, and memory removal have to do.  However, none of them share
any code today.  I think all of the implementations could benefit from
making them a bit more generic.  

-- Dave


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30 11:39   ` Zoltan Menyhart
  2004-03-30 15:18     ` Dave Hansen
@ 2004-03-30 15:58     ` Dave Hansen
  2004-03-30 16:37       ` Dave Hansen
  2004-04-01  8:44       ` Migrate pages from a ccNUMA node to another Zoltan Menyhart
  1 sibling, 2 replies; 17+ messages in thread
From: Dave Hansen @ 2004-03-30 15:58 UTC (permalink / raw)
  To: Zoltan.Menyhart; +Cc: linux-ia64, Linux Kernel Mailing List

On Tue, 2004-03-30 at 03:39, Zoltan Menyhart wrote: 
> Dave Hansen wrote:
> > Is your code something that you'd like to see go into the mainline 2.6
> > or 2.7 kernel?
> 
> Since someone is asking...

Before anything else, please take a long look at
Documentation/CodingStyle.  Pay particular attention to the column
width, indenting, and function sections.  

One of the best things about your code is that it uses a lot of
architecture-independent functions and data structures.  The page table
walks in your patch, for instance, would work on any Linux
architecture.  However, all of this code is in the ia64 arch.  Why?  Will
other NUMA architectures not need this page migration functionality?

It's great that you are commenting so many things, but normal Linux
style is to use C-style comments, not C++.  Also, although it's great
while you're developing a patch, it's best to try and refrain from
documenting in comments things that are already a non-tricky part
of the way things already work:
//
// "pte_page->mapping" points at the victim process'es "mm_struct"
//
These comments really just take up space and reduce readability.  

I find the comments inside of function and macro calls a bit hard to
read:
STORE_DELAY(/* in */ unlock_time, /* out */ new_page_unlock);


void
dump_mm(const struct mm_struct * const mm)
{
...

void
dump_vma(const struct vm_area_struct * const vma)
{


I think every VM hacker has a couple of these functions stashed around
in various patches for debugging, but they're not really something that
belongs in the kernel.  In general, you should try to remove debugging
code before posting a patch.  


+       case _SIZEOF_STATISTICS_:
+               rc =  *(long long *) &_statistics_sizes;
+               break;

I'm sure the statistics are very important, but they're a bit
intrusive.  Can you separate that code out into a file by itself?  Are
they even something that a user would want when they're running
normally, or is it another debugging feature?

+#if defined(CONFIG_NUMA)
+       data8 sys_page_migrate                  // 1276: Migrate pages to another NUMA node
+#else
        data8 sys_ni_syscall
+#endif

See cond_syscall.  Basically you declare a weak symbol and override it
later if necessary.

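For example (from memory -- double check the exact spelling; in 2.6.4 the
cond_syscall() list lives in kernel/sys.c): keep the table entry as an
unconditional "data8 sys_page_migrate" and add

	/* falls back to sys_ni_syscall when migrate.o is not built */
	cond_syscall(sys_page_migrate)

so the #if/#else in entry.S can go away.
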
+obj-$(CONFIG_NUMA)        += numa.o migrate.o

Can you separate this out under its own config option?


+asmlinkage long long
+sys_page_migrate(const int cmd, const caddr_t address, const size_t.
...
+       switch (cmd){
...
+       case _PHADDR_BATCH_MIGRATE_:
...
+       case _VA_RANGE_MIGRATE_:
...
+       case _STATISTICS_:
...
+       case _GIMME_AN_ADDRESS_:

This smells strongly of an ioctl.  If there really are 2 distinct kinds
of memory removal operations, then go ahead and make 2 different
syscalls.  As for the _STATISTICS_ and _GIMME_AN_ADDRESS_, they really
shouldn't be there at all.  They're just abusing the syscall. 

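For instance, two prototypes along these lines (the names here are purely
illustrative, not a proposal):

	asmlinkage long sys_migrate_va_range(unsigned long start, size_t len,
					     int node, pid_t pid);
	asmlinkage long sys_migrate_phys_pages(unsigned long *list, size_t count,
					       int node);
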
+migrate_virt_addr_range(
...
+       u.ll = syscall(__NR_page_migrate, _VA_RANGE_MIGRATE_,
+                      address, length, node, pid);
...
+}

Making syscalls from inside of the kernel is strongly discouraged.  I'm
not sure what you're trying to do there.  You might want to look at some
existing code like sys_mmap() vs do_mmap(). 

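The usual pattern is that sys_*() is only the user-space entry point and
in-kernel callers use the do_*() variant directly.  Schematically ("foo" is
just a placeholder):

	asmlinkage long sys_foo(unsigned long arg)	/* syscall entry only */
	{
		return do_foo(arg);
	}

	long do_foo(unsigned long arg)			/* what kernel code calls */
	{
		/* the real work goes here */
		return 0;
	}
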
+#define        __VA(pgdi, pmdi, ptei)  (((pgdi) >> (PAGE_SHIFT - 6)) << 61 |                   \
+                               ((pgdi) & ((PTRS_PER_PGD >> 3) - 1)) << PGDIR_SHIFT |   \
+                               (pmdi) << PMD_SHIFT | (ptei) << PAGE_SHIFT)

There are magic numbers galore in this macro.  Would this work?

#define __VA(pgdi, pmdi, ptei)	((pgdi)*PGDIR_SIZE + \
				 (pmdi)*PMD_SIZE +   \
				 (ptei)*PAGE_SIZE)
If ia64 doesn't have the _SIZE macros, you can just copy them from
include/asm-i386/pgtable*.h 

-- 2.6.4.ref/mm/rmap.c Tue Mar 16 10:18:17 2004
+++ 2.6.4.mig4/mm/rmap.c        Thu Mar 25 09:00:13 2004
...
-struct pte_chain {
-       unsigned long next_and_idx;
-       pte_addr_t ptes[NRPTE];
-} ____cacheline_aligned;
-- 2.6.4.ref/include/asm-ia64/rmap-locking.h


Exposing the VM internals like that probably isn't going to be
acceptable.  Why was this necessary?

--- 2.6.4.ref/test/vmig.c       Thu Jan  1 01:00:00 1970
+++ 2.6.4.mig4/test/vmig.c      Thu Mar 25 09:02:00 2004

If you need userspace code to demonstrate how to use your patch, it's
probably best to post it separately instead of including it in the
patch.  Someone might mistake it for kernel code.

I'm sure I missed some things, but it's hard to look at the patch in
depth functionally before it is cleaned up a bit.

I look forward to seeing an updated version.

-- Dave


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30 15:58     ` Dave Hansen
@ 2004-03-30 16:37       ` Dave Hansen
  2004-04-01  8:44       ` Migrate pages from a ccNUMA node to another Zoltan Menyhart
  1 sibling, 0 replies; 17+ messages in thread
From: Dave Hansen @ 2004-03-30 16:37 UTC (permalink / raw)
  To: Zoltan.Menyhart; +Cc: linux-ia64, Linux Kernel Mailing List

On Tue, 2004-03-30 at 07:58, Dave Hansen wrote:
> I'm sure I missed some things, but I it's hard to look at the patch in
> depth functionally before it is cleaned up a bit.

One thing I forgot...

There don't appear to be any security checks in your syscall.  Should
all users be allowed to migrate memory around at will from any pid?

-- Dave


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another
  2004-03-30 15:58     ` Dave Hansen
  2004-03-30 16:37       ` Dave Hansen
@ 2004-04-01  8:44       ` Zoltan Menyhart
  1 sibling, 0 replies; 17+ messages in thread
From: Zoltan Menyhart @ 2004-04-01  8:44 UTC (permalink / raw)
  To: linux-ia64, linux-kernel

Dave,

Thank you for your remarks.

Sure, I'll do my best to comply with the coding style of the community.

Dave Hansen wrote:
> Before anything else, please take a long look at
> Documentation/CodingStyle.  Pay particular attention to the column
> width, indenting, and function sections.

Shall I really stick to the width of 80 characters ?
I have got 89, yet several other files already are much wider.

Regarding the indenting issues, do you mean (not) breaking lines like the ones below ?

			if ((pte_val(*pte) & _PFN_MASK) !=
							(*p & _PFN_MASK))

(I do have TABs = 8 spaces.)

Cutting the functions into smaller pieces - I'll try to do my best.

> One of the best things about your code is that it uses a lot of
> architecture-independent functions and data structures.  The page table
> walks in your patch, for instance, would work on any Linux
> architecture.  However, all of this code is in the ia64 arc.  Why?  Will
> other NUMA architectures not need this page migration functionality?

I made an effort to write it as architecture-independent as possible
(I guess only the SAVE_ITC - STORE_DELAY macro pair would have to be re-written.)
As I have not even seen any other NUMA machine, I would not post code that
no one has ever tried on the other architectures.

> Also, although it's great
> while you're developing a patch, it's best to try and refrain from
> documenting things in comments things that are already a non-tricky part
> of the way that things already work:
> //
> // "pte_page->mapping" points at the victim process'es "mm_struct"
> //
> These comments really just take up space and reduce readability.

This one was for myself :-)

It is difficult to find the golden mean. E.g. if I had not inserted
a comment here:

	mm = current->mm;
	//
	// Actually, there is no need to grab "mm" because it is ours, won't go
	// away in the meantime. As we do not want to ask questions when
	// releasing it...
	// It is safe just to increment the counter: it is ours.
	//
	atomic_inc(&mm->mm_users);

I think people could have asked why on Earth I incremented "current->mm->mm_users".
Or another example:

	//
	// "get_task_mm()" includes "task_lock()" that "nests both inside and outside of
	// read_lock(&tasklist_lock)" - as a note in "sched.h" states.
	//

As there is no "locking guide", at least I make a note why I think I'm safe.

(I think the main goals are efficiency and quality: how much effort it takes to
add a new functionality, to correct a bug or simply to understand the code;
how much chance we've got not to crash the system.
In the old golden ages, there were a few gurus who knew the code by heart.
It was efficient for them not to waste time on "useless stuff". Now, as the
Linux community becomes larger and larger, the meaning of efficiency
has changed. I think efficiency should mean how easily people can understand,
add, change and correct things. How can I be efficient if I need to solve puzzles ?
No problem with solving puzzles, it's an intellectual challenge. But how can I know
if some "stuff" is there intentionally or by chance ? Or I just misunderstood the
puzzle ? The quality requirement is against solving puzzles.

Most of my comments are "synchronization points" for those who want to understand
my code and "statements" for those who want to double check if I've missed a point.

Another OS that "must not be named" includes thousands of ASSERT()-s and assert()-s
(see also the JFS of Linux) and the lower case assert()-s are kept even for
non debug mode. Should not include Linux lost of similar "statements" ?)

> I find the comments inside of function and macro calls a bit hard to
> read:
> STORE_DELAY(/* in */ unlock_time, /* out */ new_page_unlock);

I think there is a conceptual problem here, e.g.:

	a = 1; b = 2;
	Foo(a); Foo_2(&b);
	c = a + b;		// Is a still == 1 and b == 2 ?

I just read through the code: o.k. we call a "functionality" Foo(a) - assuming
it is a speaking name - I can see more or less what it does; but is "a" still equal
to 1 at the 3rd line ?
Don't know. If Foo() is a function, then "a" has not been changed, if Foo() is a
macro - who knows ? As far as "b" and Foo_2() are concerned, "&" is
more than a warning that "b" gets modified.
I have to warn the reader "by hand" that an argument gets modified.
It is more "space efficient" as I wrote than writing as:

	/* Warning: "new_page_unlock" is going to be changed */
	STORE_DELAY(unlock_time, new_page_unlock);

> I think every VM hacker has a couple of these functions stashed around
> in various patches for debugging, but they're not really something that
> belongs in the kernel.  In general, you should try to remove debugging
> code before posting a patch.

If you want to print out a VMA, where is the debug service ?
I agree my code is not the best place for them. As the print-out routines should
be kept coherent with the definitions of the respective structures, they
should be some static functions in the .h files.
I'll remove them but we still need some official stuff.

I stopped my source navigator after it had dug up the 1000th "...DEBUG..." :-)

> +       case _SIZEOF_STATISTICS_:
> +               rc =  *(long long *) &_statistics_sizes;
> +               break;
> 
> I'm sure the statistics are very important, but they're a bit
> intrusive.  Can you separate that code out into a file by itself?  Are
> they even something that a user would want when they're running
> normally, or is it another debugging feature?

As our "AI" is very much experimental, I really do not know how much
this information is useful. Neither the HW assisted nor the application
driven version knows how many pages are movable, how many of them cannot
be handled at all by my migration mechanism (e.g. you have a SysV SHM).
Other error information or knowing how much it costs can be a feed back
for the "AI". Probably it is useful wen we tune the profile for an
application. We need more experience.

> +migrate_virt_addr_range(
> ...
> +       u.ll = syscall(__NR_page_migrate, _VA_RANGE_MIGRATE_,
> +                      address, length, node, pid);
> ...
> +}
> Making syscalls from inside of the kernel is strongly discouraged.  I'm
> not sure what you're trying to do there.  You might want to look at some
> existing code like sys_mmap() vs do_mmap().

The wrapper functions like this are after an "#if !defined(__KERNEL__)"

As "types.h" also contains "#ifdef __KERNEL__"...

> +#define        __VA(pgdi, pmdi, ptei)  (((pgdi) >> (PAGE_SHIFT - 6)) << 61 |           \
> +                               ((pgdi) & ((PTRS_PER_PGD >> 3) - 1)) << PGDIR_SHIFT |   \
> +                               (pmdi) << PMD_SHIFT | (ptei) << PAGE_SHIFT)
> 
> There are magic numbers galore in this macro.  Would this work?

Well, I just copied what the official "pgd_index()" macro and its fellows do:

pgd_index (unsigned long address)
{
	unsigned long region = address >> 61;
	unsigned long l1index = (address >> PGDIR_SHIFT) & ((PTRS_PER_PGD >> 3) - 1);

	return (region << (PAGE_SHIFT - 6)) | l1index;
}

Me too, I'd like to see more symbolic constants...
But I don't want to be more catholic than the Pope :-)

> #define __VA(pgdi, pmdi, ptei)  ((pgdi)*PGDIR_SIZE + \
>                                  (pmdi)*PMD_SIZE +   \
>                                  (ptei)*PAGE_SIZE)
> If ia64 doesn't have the _SIZE macros, you can just copy them from
> include/asm-i386/pgtable*.h

Unfortunately, it is not as simple as that.
We've got 5 user regions out of 8 regions of size of 0x2000000000000
(I don't know what this is called). Only the first 16 Tbytes of each region
are mapped by the PGD-PMD-PTE structure. Then we've got a large hole
that cannot be taken into account by your macro. I.e. we count as

0x000fffffffffffff, 0x2000000000000, ... 0x200fffffffffffff, 0x4000000000000

Apart from using some nice macros with speaking names instead of "<< 61",
I cannot see any simpler way for the conversion (as far as IA64 is concerned).

> -- 2.6.4.ref/mm/rmap.c Tue Mar 16 10:18:17 2004
> +++ 2.6.4.mig4/mm/rmap.c        Thu Mar 25 09:00:13 2004
> ...
> -struct pte_chain {
> -       unsigned long next_and_idx;
> -       pte_addr_t ptes[NRPTE];
> -} ____cacheline_aligned;-- 2.6.4.ref/include/asm-ia64/rmap-locking.h
> 
> Exposing the VM internals like that probably isn't going to be
> acceptable.  Why was this necessary?

Previously, I was more ambitious, and wanted to handle the not-directly-
mapped pages, too. I'll put it back where it was.

For the rest: it's YES.

> There don't appear to be any security checks in your syscall.  Should
> all users be allowed to migrate memory around at will from any pid?

Well, whoever migrates someone else's pages cannot read / write the victim's data,
nor break it. For more security, I can add something like: whoever can
kill an application can migrate it, too.

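Something like the check the signal delivery code does, I suppose (only a
sketch; whether CAP_KILL or some other capability is the right one is open):

	/* allow migrating "task" only if the caller could also signal it */
	if (current->euid != task->suid && current->euid != task->uid &&
	    current->uid  != task->suid && current->uid  != task->uid &&
	    !capable(CAP_KILL))
		return -EPERM;
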
> By not modifying a single line in the existing VM path, your patch
> simply duplicates functionality from that existing code, which I'm not
> sure is any better.  

> I think there's a lot of commonality with what the swap code, NUMA page
> migration, and memory removal have to do.  However, none of them share
> any code today.  I think all of the implementations could benefit from
> making them a bit more generic. 

I can agree on not duplicating code. I can give the code written by
Hirokazu Takahashi a try and see if it is efficient enough for me.

Thanks,

Zoltán

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-03-30 14:32           ` Zoltan Menyhart
@ 2004-04-03  2:58             ` Hirokazu Takahashi
  2004-04-05 15:07               ` Zoltan Menyhart
  0 siblings, 1 reply; 17+ messages in thread
From: Hirokazu Takahashi @ 2004-04-03  2:58 UTC (permalink / raw)
  To: Zoltan.Menyhart, Zoltan.Menyhart_AT_bull.net
  Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hello,

>> It's not hard to add "no-retry-mode" to "remap_onepage()" function
>> if you want. It may skip to migrate some pages if they are accessed
>> heavily. In paticular if you only want to care about anonymous pages,
>> they will be handled very well.
>
>Well, why not to give it a try ?
>Yet your code is not really easy to read. :-)
>I do not dare to adapt it on my own, I am afraid of breaking something.
>Could you please provide me a modified version of your "remap_onepage()" ?
>Can we move to 2.6.4 ?

Iwamoto and I are working on this. We'll post it soon.

>In addition to "no-retry-mode", I need to specify where the new page
>should be allocated from.
>
>Here is my interface I need to implement with "remap_onepage()":

I guess the arguments src_node, mm and pte would be redundant since
they can be looked up from old_p with the reverse mapping scheme.

>/*
> * Common part of checking & migrating the pages one by one.
> *
> * Arguments:	src_node:	Source NUMA node
> *		old_p:		-> old page structure
> *		node:		Destination NUMA node
> *		mm:		-> victim "mm_struct"
> *		pte:		-> PTE of the page to be moved
> *
> * Returns:	1:		Migration O. K.
> *		0:		Minor error, no actual migration has been done
> *		-Exxx:		Catastrophic error
> *
> * Notes:	- "mm->page_table_lock" and "mm->mmap_sem" have to be held.
> *		- The old page is "get_page()"-ed on entry to make sure it does not go
> *		  away in the mean time - on return it gets "put_page()"-ed.
> */
>int
>common_check_migrate_1_page(const int src_node, struct page * const old_p,
>			const int node, struct mm_struct * const mm, pte_t * const pte)
>
>Notes: "pte" can be NULL if I do not know it apriori
>       I cannot release "mm->page_table_lock" otherwise I have to re-scan the "mm->pgd".

A re-scan policy would be much better since migrating pages is heavy work.
I don't think that holding mm->page_table_lock for a long time would be
a good idea.

What do you think about the following algorithm (a rough code sketch follows the list):
  1. get mm->page_table_lock
  2. choose some pages.
  3. release mm->page_table_lock
  4. call remap_onepage() against each page.
  5. goto step 1 if pages remain to be migrated.
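
In rough pseudo code ("pick_pages_to_move()" is just a placeholder here, and
I omit the real "remap_onepage()" arguments):

	do {
		spin_lock(&mm->page_table_lock);
		nr = pick_pages_to_move(mm, batch, MAX_BATCH);	/* steps 1-2 */
		spin_unlock(&mm->page_table_lock);		/* step 3    */

		for (i = 0; i < nr; i++)			/* step 4    */
			remap_onepage(batch[i] /* , ... */);
	} while (nr > 0);					/* step 5    */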


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-04-03  2:58             ` Hirokazu Takahashi
@ 2004-04-05 15:07               ` Zoltan Menyhart
  2004-04-05 15:40                 ` Dave Hansen
  2004-04-08 13:32                 ` Migrate pages from a ccNUMA node to another - patch Hirokazu Takahashi
  0 siblings, 2 replies; 17+ messages in thread
From: Zoltan Menyhart @ 2004-04-05 15:07 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hirokazu Takahashi wrote:

> I guess aruguments src_node, mm and pte would be redundant since
> they can be looked up from old_p with the reverse mapping scheme.

In my version 0.2, I can do with only the following arguments:
 *		node:	Destination NUMA node
 *		mm:	-> victim "mm_struct"
 *		pte:	-> PTE of the page to be moved
(If I have "mm" at hand, why not to use it ? Why not to avoid fetching the r-map
page struct ?)

> >Notes: "pte" can be NULL if I do not know it apriori
> >       I cannot release "mm->page_table_lock" otherwise I have to re-scan the "mm->pgd".
> 
> Re-schan plicy would be much better since migrating pages is heavy work.
> I don't think that holding mm->page_table_lock for long time would be
> good idea.

Re-scanning is "cache killer", at least on IA64 with huge user memory size.
I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.

In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
I find a valid PTE, I check it against the list of max. 2048 physical addresses as
the inner loop.
I reversed them: walking through the list of max. 2048 physical addresses as outer
loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
migration.

> How do you think about following algorism:
>   1. get mm->page_table_lock
>   2. chose some pages.
>   3. release mm->page_table_lock
>   4. call remap_onepage() against each page.
>   5. goto step1 if there remain pages to be migrated.

I want to move the most frequently used pages - at least with the HW assisted
hot page detection.
I take "mm->page_table_lock", I nuke the PTE. We've got a good chance that the CPU
using the page observes a page fault almost immediately. It enters the page fault
handler and gets blocked by "mm->page_table_lock". If I released the lock, the CPU
could continue and realize that there is nothing to do, the page fault has already
been repaired. In the meantime, it is me who waits for "mm->page_table_lock".
At worst this scenario happens 2048 times.
If I keep the lock, the victim CPU enters the page fault handler only once.

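Schematically, for one page, with the lock held (only the idea, not the code
from my patch; r-map, LRU and dirty bit maintenance are left out):

	/* "mm->page_table_lock" is held across all of this */
	old_pte = ptep_get_and_clear(pte);		/* nuke the PTE           */
	flush_tlb_page(vma, address);			/* user CPUs now fault    */
							/* and block on the lock  */
	copy_user_highpage(new_page, old_page, address);
	set_pte(pte, mk_pte(new_page, vma->vm_page_prot));
	update_mmu_cache(vma, address, *pte);
	/* when the lock is dropped, the blocked fault handler finds a valid PTE */
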
I think what we should do is to "pull in" pages in to a node rather than than
"pushing them out" for two reasons:
- the recipient CPU executes the migration instead of busy waiting for the lock
- there is a chance that the recipient CPU will find the migrated data useful
  in its cache

Regards,

Zoltán Menyhárt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-04-05 15:07               ` Zoltan Menyhart
@ 2004-04-05 15:40                 ` Dave Hansen
  2004-04-06 14:42                   ` Migrate pages from a ccNUMA node to another Zoltan Menyhart
  2004-04-08 13:32                 ` Migrate pages from a ccNUMA node to another - patch Hirokazu Takahashi
  1 sibling, 1 reply; 17+ messages in thread
From: Dave Hansen @ 2004-04-05 15:40 UTC (permalink / raw)
  To: Zoltan.Menyhart
  Cc: Hirokazu Takahashi, linux-ia64, Linux Kernel Mailing List, iwamoto

On Mon, 2004-04-05 at 08:07, Zoltan Menyhart wrote:
> Hirokazu Takahashi wrote:
> 
> > I guess aruguments src_node, mm and pte would be redundant since
> > they can be looked up from old_p with the reverse mapping scheme.
> 
> In my version 0.2, I can do with only the following arguments:
>  *		node:	Destination NUMA node
>  *		mm:	-> victim "mm_struct"
>  *		pte:	-> PTE of the page to be moved
> (If I have "mm" at hand, why not to use it ? Why not to avoid fetching the r-map
> page struct ?)

That's a good point.  There is at least some cost (at least 1 lock)
associated with walking the rmap chains.  If it can be avoided, it might
as well be.  

But, if someone needs the "no walk" interface, just wrap the function:

foo(page)
{
	rmap_results = get_rmap_stuff(page);
	__foo(page, rmap_results);
}

__foo(page, rmap_results)
{
...
}

> > >Notes: "pte" can be NULL if I do not know it apriori
> > >       I cannot release "mm->page_table_lock" otherwise I have to re-scan the "mm->pgd".
> > 
> > Re-schan plicy would be much better since migrating pages is heavy work.
> > I don't think that holding mm->page_table_lock for long time would be
> > good idea.
> 
> Re-scanning is "cache killer", at least on IA64 with huge user memory size.
> I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.
> 
> In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
> I find a valid PTE, I check it against the list of max. 2048 physical addresses as
> the inner loop.
> I reversed them: walking through the list of max. 2048 physical addresses as outer
> loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
> migration.

Could you explain where you're getting these "magic numbers?"  I don't
quite understand the significance of 2048 physical addresses or 512 MB
of memory.

Zoltan, it appears that we have a bit of an inherent conflict with how
much CPU each of you is expecting to use in the removal and migration
cases.  You're coming from an HPC environment where each CPU cycle is
valuable, while the people trying to remove memory are probably going to
be taking CPUs offline soon anyway, and care a bit less about how
efficient they're being with CPU and cache resources.  

Could you be a bit more explicit about how expensive (cpu-wise) these
migrate operations can be?

-- Dave


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another
  2004-04-05 15:40                 ` Dave Hansen
@ 2004-04-06 14:42                   ` Zoltan Menyhart
  0 siblings, 0 replies; 17+ messages in thread
From: Zoltan Menyhart @ 2004-04-06 14:42 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Hirokazu Takahashi, linux-ia64, Linux Kernel Mailing List, iwamoto

Dave Hansen wrote:

> > > >Notes: "pte" can be NULL if I do not know it apriori
> > > >       I cannot release "mm->page_table_lock" otherwise I have to re-scan the "mm->pgd".
> > >
> > > Re-schan plicy would be much better since migrating pages is heavy work.
> > > I don't think that holding mm->page_table_lock for long time would be
> > > good idea.
> >
> > Re-scanning is "cache killer", at least on IA64 with huge user memory size.
> > I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.
> >
> > In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
> > I find a valid PTE, I check it against the list of max. 2048 physical addresses as
> > the inner loop.
> > I reversed them: walking through the list of max. 2048 physical addresses as outer
> > loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
> > migration.
> 
> Could you explain where you're getting these "magic numbers?"  I don't
> quite understand the significance of 2048 physical addresses or 512 MB
> of memory.

I use an IA64 machine with page size of 16 Kbytes.

I allocate a single page (for simplicity) when I copy in from user space
the list of the addresses (2048 of them fit in a page).

One page of PTEs maps 32 Mbytes.
16 pages (which map 512 Mbytes) eat up my L2 cache of 256 Kbytes
(and the other data, the stack, the code, etc...).
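
That is, with 8-byte PTEs:

	16 Kbytes / 8 bytes per PTE	= 2048 PTEs per PTE page
	2048 PTEs * 16 Kbytes		= 32 Mbytes mapped per PTE page
	512 Mbytes / 32 Mbytes		= 16 PTE pages = 256 Kbytes = all of the L2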

Sure I have an L3 cache of at least 3 Mbytes (new CPUs have bigger L3 caches)
but it is much slower than the L2.

> Zoltan, it appears that we have a bit of an inherent conflict with how
> much CPU each of you is expecting to use in the removal and migration
> cases.  You're coming from a HPC environment where each CPU cycle is
> valuable, while the people trying to remove memory are probably going to
> be taking CPUs offline soon anyway, and care a bit less about how
> efficient they're being with CPU and cache resources.

Yes I see.

> Could you be a bit more explicit about how expensive (cpu-wise) these
> migrate operations can be?

My machine consists of 4 "Tiger boxes" connected together by a pair of Scalability
Port Switches. A "Tiger box" is built around a Scalable Node Controller (SNC), and
includes 4 Itanium-2 processors and some Gbytes of memory.
The CPU clock runs at 1.3 GHz. Local memory access costs 200 nanosec.
The NUMA factor is 1 : 2.25.

An OpenMP benchmark uses 1 Gbyte of memory and runs on 16 processors,
on 4 NUMA nodes.

If the benchmark is adapted to our NUMA architecture (local allocations by hand),
then it takes 86 seconds to complete.

As results are not accepted if obtained by modifying the benchmark in any
way, the best we can do is to use a random or round robin memory allocation
policy. We end up with a locality rate of 25 % and the benchmark executes in 121
seconds.

If we had a zero-overhead migration tool, then - I estimate - it would complete
in 92 seconds (the benchmark starts in a "pessimized" environment, and it takes
time for the locality to ramp up from 25 % to almost 100 %).

Actually it takes 2 to 3 seconds to move 750 Mbytes of memory (on a heavily
loaded machine), reading out the counters of the SNCs and making some quick
decisions take 1 to 2 seconds, and we lose about 10 seconds due to the buggy
SNCs. We end up with 106 seconds.
(I keep holding the PGD lock, no re-scan while moving the max. 2048 pages.)

Inverting the inner - outer loops, i.e. for each page I scan the PGD-PMD-PTE
and the PGD lock is released between the scans, I have 10 to 12 seconds
spent on the migration (not counting the 1 to 2 seconds of the "AI" + ~10 seconds
due to the buggy SNCs.)

Thanks,

Zoltán

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Migrate pages from a ccNUMA node to another - patch
  2004-04-05 15:07               ` Zoltan Menyhart
  2004-04-05 15:40                 ` Dave Hansen
@ 2004-04-08 13:32                 ` Hirokazu Takahashi
  1 sibling, 0 replies; 17+ messages in thread
From: Hirokazu Takahashi @ 2004-04-08 13:32 UTC (permalink / raw)
  To: Zoltan.Menyhart; +Cc: haveblue, linux-ia64, linux-kernel, iwamoto

Hello,

> > I guess aruguments src_node, mm and pte would be redundant since
> > they can be looked up from old_p with the reverse mapping scheme.
> 
> In my version 0.2, I can do with only the following arguments:
>  *		node:	Destination NUMA node
>  *		mm:	-> victim "mm_struct"
>  *		pte:	-> PTE of the page to be moved
> (If I have "mm" at hand, why not to use it ? Why not to avoid fetching the r-map
> page struct ?)
> 
> > >Notes: "pte" can be NULL if I do not know it apriori
> > >       I cannot release "mm->page_table_lock" otherwise I have to re-scan the "mm->pgd".
> > 
> > Re-schan plicy would be much better since migrating pages is heavy work.
> > I don't think that holding mm->page_table_lock for long time would be
> > good idea.
> 
> Re-scanning is "cache killer", at least on IA64 with huge user memory size.
> I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.
> 
> In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
> I find a valid PTE, I check it against the list of max. 2048 physical addresses as
> the inner loop.
> I reversed them: walking through the list of max. 2048 physical addresses as outer
> loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
> migration.

I've been thinking about it.

I guess our page remap patches would be overkill for your purpose.
The point of our patches is:
   1. Block new access to a specified page.
   2. Wait for the page to go into a quiescent state.
   3. Copy the data from the page to a new page and exchange them.

In my understanding you want to handle only anonymous pages
which don't have backing store yet. This means that you only need
step 3.

> > How do you think about following algorism:
> >   1. get mm->page_table_lock
> >   2. chose some pages.
> >   3. release mm->page_table_lock
> >   4. call remap_onepage() against each page.
> >   5. goto step1 if there remain pages to be migrated.
> 
> I want to move the most frequently used pages - at least with the HW assisted
> hot page detection.
> I take "mm->page_table_lock", I nuke the PTE. We've got a good chance that the CPU
> using the page observes a page fault almost immediately. It enters the page fault
> handler and gets blocked by "mm->page_table_lock". If I released the lock, the CPU
> could continue and realize that there is nothing to do, the page fault has already
> been repaired. In the mean time, it is me who wait for "mm->page_table_lock".

If you use the HW-assisted hot page detection, just notify our remapping
functions of hot pages directly. Everything would be handled well and
the page fault handler would be blocked by the PG_lock bit.


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2004-04-08 13:32 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-03-26  9:18 Migrate pages from a ccNUMA node to another - patch Zoltan Menyhart
2004-03-26 17:20 ` Dave Hansen
2004-03-30  8:27   ` IWAMOTO Toshihiro
2004-03-30  9:05     ` Hirokazu Takahashi
2004-03-30 11:20       ` Zoltan Menyhart
2004-03-30 12:08         ` Hirokazu Takahashi
2004-03-30 14:32           ` Zoltan Menyhart
2004-04-03  2:58             ` Hirokazu Takahashi
2004-04-05 15:07               ` Zoltan Menyhart
2004-04-05 15:40                 ` Dave Hansen
2004-04-06 14:42                   ` Migrate pages from a ccNUMA node to another Zoltan Menyhart
2004-04-08 13:32                 ` Migrate pages from a ccNUMA node to another - patch Hirokazu Takahashi
2004-03-30 11:39   ` Zoltan Menyhart
2004-03-30 15:18     ` Dave Hansen
2004-03-30 15:58     ` Dave Hansen
2004-03-30 16:37       ` Dave Hansen
2004-04-01  8:44       ` Migrate pages from a ccNUMA node to another Zoltan Menyhart

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).