All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
@ 2006-12-26  8:18 yunfeng zhang
  2006-12-26  9:03 ` Zhou Yingchao
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: yunfeng zhang @ 2006-12-26  8:18 UTC (permalink / raw)
  To: linux-kernel

In the patch, I introduce a new page system -- pps which can improve
Linux swap subsystem performance, you can find a new document in
Documentation/vm_pps.txt. In brief, swap subsystem should scan/reclaim
pages on VMA instead of zone::active list ...

--- patch-linux/fs/exec.c	2006-12-26 15:20:02.683546016 +0800
+++ linux-2.6.16.29/fs/exec.c	2006-09-13 02:02:10.000000000 +0800
@@ -323,0 +324 @@
+	lru_cache_add_active(page);
@@ -438 +438,0 @@
-		enter_pps(mm, mpnt);
--- patch-linux/mm/swap_state.c	2006-12-26 15:20:02.689545104 +0800
+++ linux-2.6.16.29/mm/swap_state.c	2006-09-13 02:02:10.000000000 +0800
@@ -357,2 +357 @@
-			if (vma == NULL || !(vma->vm_flags & VM_PURE_PRIVATE))
-				lru_cache_add_active(new_page);
+			lru_cache_add_active(new_page);
--- patch-linux/mm/mmap.c	2006-12-26 15:20:02.691544800 +0800
+++ linux-2.6.16.29/mm/mmap.c	2006-09-13 02:02:10.000000000 +0800
@@ -209 +208,0 @@
-	leave_pps(vma, 0);
@@ -597 +595,0 @@
-		leave_pps(next, 0);
@@ -1096,2 +1093,0 @@
-	enter_pps(mm, vma);
-
@@ -1120 +1115,0 @@
-		leave_pps(vma, 0);
@@ -1148 +1142,0 @@
-	leave_pps(vma, 0);
@@ -1726,4 +1719,0 @@
-	if (new->vm_flags & VM_PURE_PRIVATE) {
-		new->vm_flags &= ~VM_PURE_PRIVATE;
-		enter_pps(mm, new);
-	}
@@ -1930 +1919,0 @@
-	enter_pps(mm, vma);
@@ -2054,4 +2042,0 @@
-			if (new_vma->vm_flags & VM_PURE_PRIVATE) {
-				new_vma->vm_flags &= ~VM_PURE_PRIVATE;
-				enter_pps(mm, new_vma);
-			}
--- patch-linux/mm/fremap.c	2006-12-26 15:20:02.695544192 +0800
+++ linux-2.6.16.29/mm/fremap.c	2006-09-13 02:02:10.000000000 +0800
@@ -40 +40 @@
-		if (pte_swapped(pte))
+		if (!pte_file(pte))
--- patch-linux/mm/rmap.c	2006-12-26 15:20:02.696544040 +0800
+++ linux-2.6.16.29/mm/rmap.c	2006-09-13 02:02:10.000000000 +0800
@@ -636 +636 @@
-		BUG_ON(!pte_swapped(*pte));
+		BUG_ON(pte_file(*pte));
--- patch-linux/mm/vmscan.c	2006-12-26 15:20:02.697543888 +0800
+++ linux-2.6.16.29/mm/vmscan.c	2006-09-13 02:02:10.000000000 +0800
@@ -1517,392 +1516,0 @@
-struct series_t {
-	pte_t orig_ptes[MAX_SERIES_LENGTH];
-	pte_t* ptes[MAX_SERIES_LENGTH];
-	struct page* pages[MAX_SERIES_LENGTH];
-	int series_length;
-	int series_stage;
-} series;
-
-static int get_series_stage(pte_t* pte, int index)
-{
-	series.orig_ptes[index] = *pte;
-	series.ptes[index] = pte;
-	if (pte_present(series.orig_ptes[index])) {
-		struct page* page = pfn_to_page(pte_pfn(series.orig_ptes[index]));
-		series.pages[index] = page;
-		if (page == ZERO_PAGE(addr)) // reserved page is exclusive from us.
-			return 7;
-		if (pte_young(series.orig_ptes[index])) {
-			return 1;
-		} else
-			return 2;
-	} else if (pte_unmapped(series.orig_ptes[index])) {
-		struct page* page = pfn_to_page(pte_pfn(series.orig_ptes[index]));
-		series.pages[index] = page;
-		if (!PageSwapCache(page))
-			return 3;
-		else {
-			if (PageWriteback(page) || PageDirty(page))
-				return 4;
-			else
-				return 5;
-		}
-	} else // pte_swapped -- SwappedPTE
-		return 6;
-}
-
-static void find_series(pte_t** start, unsigned long* addr, unsigned long end)
-{
-	int i;
-	int series_stage = get_series_stage((*start)++, 0);
-	*addr += PAGE_SIZE;
-
-	for (i = 1; i < MAX_SERIES_LENGTH && *addr < end; i++, (*start)++,
*addr += PAGE_SIZE) {
-		if (series_stage != get_series_stage(*start, i))
-			break;
-	}
-	series.series_stage = series_stage;
-	series.series_length = i;
-}
-
-struct delay_tlb_task_t delay_tlb_tasks[32] = { [0 ... 31] = {0} };
-
-void timer_flush_tlb_tasks(void* data)
-{
-	// To x86, if we found there were some flushing tasks, we should do
it all together, that is, flush it once.
-	int i;
-#ifdef CONFIG_X86
-	int flag = 0;
-#endif
-	for (i = 0; i < 32; i++) {
-		if (delay_tlb_tasks[i].mm != NULL &&
-				cpu_isset(smp_processor_id(), delay_tlb_tasks[i].mm->cpu_vm_mask) &&
-				cpu_isset(smp_processor_id(), delay_tlb_tasks[i].cpu_mask)) {
-#ifdef CONFIG_X86
-			flag = 1;
-#elif
-			// smp::local_flush_tlb_range(delay_tlb_tasks[i]);
-#endif
-			cpu_clear(smp_processor_id(), delay_tlb_tasks[i].cpu_mask);
-		}
-	}
-#ifdef CONFIG_X86
-	if (flag)
-		local_flush_tlb();
-#endif
-}
-
-static struct delay_tlb_task_t* delay_task = NULL;
-static int vma_index = 0;
-
-static struct delay_tlb_task_t* search_free_tlb_tasks_slot(void)
-{
-	struct delay_tlb_task_t* ret = NULL;
-	int i;
-again:
-	for (i = 0; i < 32; i++) {
-		if (delay_tlb_tasks[i].mm != NULL) {
-			if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
-				mmput(delay_tlb_tasks[i].mm);
-				delay_tlb_tasks[i].mm = NULL;
-				ret = &delay_tlb_tasks[i];
-			}
-		} else
-			ret = &delay_tlb_tasks[i];
-	}
-	if (!ret) { // Force flush TLBs.
-		on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
-		goto again;
-	}
-	return ret;
-}
-
-static void init_delay_task(struct mm_struct* mm)
-{
-	cpus_clear(delay_task->cpu_mask);
-	vma_index = 0;
-	delay_task->mm = mm;
-}
-
-/*
- * We will be working on the mm, so let's force to flush it if necessary.
- */
-static void start_tlb_tasks(struct mm_struct* mm)
-{
-	int i, flag = 0;
-again:
-	for (i = 0; i < 32; i++) {
-		if (delay_tlb_tasks[i].mm == mm) {
-			if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
-				mmput(delay_tlb_tasks[i].mm);
-				delay_tlb_tasks[i].mm = NULL;
-			} else
-				flag = 1;
-		}
-	}
-	if (flag) { // Force flush TLBs.
-		on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
-		goto again;
-	}
-	BUG_ON(delay_task != NULL);
-	delay_task = search_free_tlb_tasks_slot();
-	init_delay_task(mm);
-}
-
-static void end_tlb_tasks(void)
-{
-	if (!cpus_empty(delay_task->cpu_mask)) {
-		atomic_inc(&delay_task->mm->mm_users);
-		delay_task->cpu_mask = delay_task->mm->cpu_vm_mask;
-	} else
-		delay_task->mm = NULL;
-	delay_task = NULL;
-}
-
-static void fill_in_tlb_tasks(struct vm_area_struct* vma, unsigned long addr,
-		unsigned long end)
-{
-	struct mm_struct* mm;
-fill_it:
-	if (vma_index != 32) {
-		delay_task->vma[vma_index] = vma;
-		delay_task->start[vma_index] = addr;
-		delay_task->end[vma_index] = end;
-		vma_index++;
-		return;
-	}
-	mm = delay_task->mm;
-	end_tlb_tasks();
-
-	delay_task = search_free_tlb_tasks_slot();
-	init_delay_task(mm);
-	goto fill_it;
-}
-
-static void shrink_pvma_scan_ptes(struct scan_control* sc,
-		struct mm_struct* mm, struct vm_area_struct* vma, pmd_t* pmd,
-		unsigned long addr, unsigned long end)
-{
-	int i;
-	spinlock_t* ptl = pte_lockptr(mm, pmd);
-	pte_t* pte = pte_offset_map(pmd, addr);
-	int anon_rss = 0;
-	struct pagevec freed_pvec;
-	int may_enter_fs = (sc->gfp_mask & (__GFP_FS | __GFP_IO));
-	struct address_space* mapping = &swapper_space;
-
-	pagevec_init(&freed_pvec, 1);
-	do {
-		memset(&series, 0, sizeof(struct series_t));
-		find_series(&pte, &addr, end);
-		switch (series.series_stage) {
-			case 1: // PTE -- untouched PTE.
-				for (i = 0; i < series.series_length; i++) {
-					struct page* page = series.pages[i];
-					lock_page(page);
-					spin_lock(ptl);
-					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
-						if (pte_dirty(*series.ptes[i]))
-							set_page_dirty(page);
-						set_pte_at(mm, addr + i * PAGE_SIZE, series.ptes[i],
-								pte_mkold(pte_mkclean(*series.ptes[i])));
-					}
-					spin_unlock(ptl);
-					unlock_page(page);
-				}
-				fill_in_tlb_tasks(vma, addr, addr + (PAGE_SIZE * series.series_length));
-				break;
-			case 2: // untouched PTE -- UnmappedPTE.
-				/*
-				 * Note in stage 1, we've flushed TLB in fill_in_tlb_tasks, so
-				 * if it's still clear here, we can shift it to Unmapped type.
-				 *
-				 * If some architecture doesn't support atomic cmpxchg
-				 * instruction or can't atomically set the access bit after
-				 * they touch a pte at first, combine stage 1 with stage 2, and
-				 * send IPI immediately in fill_in_tlb_tasks.
-				 */
-				spin_lock(ptl);
-				for (i = 0; i < series.series_length; i++) {
-					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
-						pte_t pte_unmapped = series.orig_ptes[i];
-						pte_unmapped.pte_low &= ~_PAGE_PRESENT;
-						pte_unmapped.pte_low |= _PAGE_UNMAPPED;
-						if (cmpxchg(&series.ptes[i]->pte_low,
-									series.orig_ptes[i].pte_low,
-									pte_unmapped.pte_low) !=
-								series.orig_ptes[i].pte_low)
-							continue;
-						page_remove_rmap(series.pages[i]);
-						anon_rss--;
-					}
-				}
-				spin_unlock(ptl);
-				break;
-			case 3: // Attach SwapPage to PrivatePage.
-				/*
-				 * A better arithmetic should be applied to Linux SwapDevice to
-				 * allocate fake continual SwapPages which are close to each
-				 * other, the offset between two close SwapPages is less than 8.
-				 */
-				if (sc->may_swap) {
-					for (i = 0; i < series.series_length; i++) {
-						lock_page(series.pages[i]);
-						if (!PageSwapCache(series.pages[i])) {
-							if (!add_to_swap(series.pages[i], GFP_ATOMIC)) {
-								unlock_page(series.pages[i]);
-								break;
-							}
-						}
-						unlock_page(series.pages[i]);
-					}
-				}
-				break;
-			case 4: // SwapPage isn't consistent with PrivatePage.
-				/*
-				 * A mini version pageout().
-				 *
-				 * Current swap space can't commit multiple pages together:(
-				 */
-				if (sc->may_writepage && may_enter_fs) {
-					for (i = 0; i < series.series_length; i++) {
-						struct page* page = series.pages[i];
-						int res;
-
-						if (!may_write_to_queue(mapping->backing_dev_info))
-							break;
-						lock_page(page);
-						if (!PageDirty(page) || PageWriteback(page)) {
-							unlock_page(page);
-							continue;
-						}
-						clear_page_dirty_for_io(page);
-						struct writeback_control wbc = {
-							.sync_mode = WB_SYNC_NONE,
-							.nr_to_write = SWAP_CLUSTER_MAX,
-							.nonblocking = 1,
-							.for_reclaim = 1,
-						};
-						page_cache_get(page);
-						SetPageReclaim(page);
-						res = swap_writepage(page, &wbc);
-						if (res < 0) {
-							handle_write_error(mapping, page, res);
-							ClearPageReclaim(page);
-							page_cache_release(page);
-							break;
-						}
-						if (!PageWriteback(page))
-							ClearPageReclaim(page);
-						page_cache_release(page);
-					}
-				}
-				break;
-			case 5: // UnmappedPTE -- SwappedPTE, reclaim PrivatePage.
-				for (i = 0; i < series.series_length; i++) {
-					struct page* page = series.pages[i];
-					lock_page(page);
-					spin_lock(ptl);
-					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
-						spin_unlock(ptl);
-						unlock_page(page);
-						continue;
-					}
-					swp_entry_t entry = { .val = page_private(page) };
-					swap_duplicate(entry);
-					pte_t pte_swp = swp_entry_to_pte(entry);
-					set_pte_at(mm, addr + i * PAGE_SIZE, series.ptes[i], pte_swp);
-					spin_unlock(ptl);
-					if (PageSwapCache(page) && !PageWriteback(page))
-						delete_from_swap_cache(page);
-					unlock_page(page);
-
-					if (!pagevec_add(&freed_pvec, page))
-						__pagevec_release_nonlru(&freed_pvec);
-					sc->nr_reclaimed++;
-				}
-				break;
-			case 6:
-				// NULL operation!
-				break;
-		}
-	} while (addr < end);
-	add_mm_counter(mm, anon_rss, anon_rss);
-	if (pagevec_count(&freed_pvec))
-		__pagevec_release_nonlru(&freed_pvec);
-}
-
-static void shrink_pvma_pmd_range(struct scan_control* sc, struct
mm_struct* mm,
-		struct vm_area_struct* vma, pud_t* pud,
-		unsigned long addr, unsigned long end)
-{
-	unsigned long next;
-	pmd_t* pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		shrink_pvma_scan_ptes(sc, mm, vma, pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void shrink_pvma_pud_range(struct scan_control* sc, struct
mm_struct* mm,
-		struct vm_area_struct* vma, pgd_t* pgd,
-		unsigned long addr, unsigned long end)
-{
-	unsigned long next;
-	pud_t* pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		shrink_pvma_pmd_range(sc, mm, vma, pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-}
-
-static void shrink_pvma_pgd_range(struct scan_control* sc, struct
mm_struct* mm,
-		struct vm_area_struct* vma)
-{
-	unsigned long next;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	pgd_t* pgd = pgd_offset(mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		shrink_pvma_pud_range(sc, mm, vma, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void shrink_private_vma(struct scan_control* sc)
-{
-	struct mm_struct* mm;
-	struct vm_area_struct* vma;
-	struct list_head *pos, *lhtemp;
-
-	spin_lock(&mmlist_lock);
-	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
-		mm = list_entry(pos, struct mm_struct, mmlist);
-		if (atomic_inc_return(&mm->mm_users) == 1) {
-			atomic_dec(&mm->mm_users);
-			continue;
-		}
-		spin_unlock(&mmlist_lock);
-		start_tlb_tasks(mm);
-		if (down_read_trylock(&mm->mmap_sem)) {
-			for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
-				if (!(vma->vm_flags & VM_PURE_PRIVATE))
-					continue;
-				if (vma->vm_flags & VM_LOCKED)
-					continue;
-				shrink_pvma_pgd_range(sc, mm, vma);
-			}
-			up_read(&mm->mmap_sem);
-		}
-		end_tlb_tasks();
-		mmput(mm);
-		spin_lock(&mmlist_lock);
-	}
-	spin_unlock(&mmlist_lock);
-}
-
@@ -1952 +1559,0 @@
-	shrink_private_vma(&sc);
--- patch-linux/mm/swapfile.c	2006-12-26 15:20:02.699543584 +0800
+++ linux-2.6.16.29/mm/swapfile.c	2006-09-13 02:02:10.000000000 +0800
@@ -10 +9,0 @@
-#include <linux/mm_inline.h>
@@ -421,157 +419,0 @@
-static int pps_test_swap_type(struct mm_struct* mm, pmd_t* pmd, pte_t* pte, int
-		type, struct page** ret_page)
-{
-	spinlock_t* ptl = pte_lockptr(mm, pmd);
-	swp_entry_t entry;
-	struct page* page;
-
-	spin_lock(ptl);
-	if (!pte_present(*pte) && pte_swapped(*pte)) {
-		entry = pte_to_swp_entry(*pte);
-		if (swp_type(entry) == type) {
-			*ret_page = NULL;
-			spin_unlock(ptl);
-			return 1;
-		}
-	} else {
-		page = pfn_to_page(pte_pfn(*pte));
-		if (PageSwapCache(page)) {
-			entry.val = page_private(page);
-			if (swp_type(entry) == type) {
-				page_cache_get(page);
-				*ret_page = page;
-				spin_unlock(ptl);
-				return 1;
-			}
-		}
-	}
-	spin_unlock(ptl);
-	return 0;
-}
-
-static int pps_swapoff_scan_ptes(struct mm_struct* mm, struct vm_area_struct*
-		vma, pmd_t* pmd, unsigned long addr, unsigned long end, int type)
-{
-	pte_t *pte;
-	struct page* page;
-
-	pte = pte_offset_map(pmd, addr);
-	do {
-		while (pps_test_swap_type(mm, pmd, pte, type, &page)) {
-			if (page == NULL) {
-				switch (__handle_mm_fault(mm, vma, addr, 0)) {
-				case VM_FAULT_SIGBUS:
-				case VM_FAULT_OOM:
-					return -ENOMEM;
-				case VM_FAULT_MINOR:
-				case VM_FAULT_MAJOR:
-					break;
-				default:
-					BUG();
-				}
-			} else {
-				wait_on_page_locked(page);
-				wait_on_page_writeback(page);
-				lock_page(page);
-				if (!PageSwapCache(page)) {
-					unlock_page(page);
-					page_cache_release(page);
-					break;
-				}
-				wait_on_page_writeback(page);
-				delete_from_swap_cache(page);
-				unlock_page(page);
-				page_cache_release(page);
-				break;
-			}
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	return 0;
-}
-
-static int pps_swapoff_pmd_range(struct mm_struct* mm, struct vm_area_struct*
-		vma, pud_t* pud, unsigned long addr, unsigned long end, int type)
-{
-	unsigned long next;
-	int ret;
-	pmd_t* pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		ret = pps_swapoff_scan_ptes(mm, vma, pmd, addr, next, type);
-		if (ret == -ENOMEM)
-			return ret;
-	} while (pmd++, addr = next, addr != end);
-	return 0;
-}
-
-static int pps_swapoff_pud_range(struct mm_struct* mm, struct vm_area_struct*
-		vma, pgd_t* pgd, unsigned long addr, unsigned long end, int type)
-{
-	unsigned long next;
-	int ret;
-	pud_t* pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		ret = pps_swapoff_pmd_range(mm, vma, pud, addr, next, type);
-		if (ret == -ENOMEM)
-			return ret;
-	} while (pud++, addr = next, addr != end);
-	return 0;
-}
-
-static int pps_swapoff_pgd_range(struct mm_struct* mm, struct vm_area_struct*
-		vma, int type)
-{
-	unsigned long next;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int ret;
-	pgd_t* pgd = pgd_offset(mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		ret = pps_swapoff_pud_range(mm, vma, pgd, addr, next, type);
-		if (ret == -ENOMEM)
-			return ret;
-	} while (pgd++, addr = next, addr != end);
-	return 0;
-}
-
-static int pps_swapoff(int type)
-{
-	struct mm_struct* mm;
-	struct vm_area_struct* vma;
-	struct list_head *pos, *lhtemp;
-	int ret = 0;
-
-	spin_lock(&mmlist_lock);
-	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
-		mm = list_entry(pos, struct mm_struct, mmlist);
-		if (atomic_inc_return(&mm->mm_users) == 1) {
-			atomic_dec(&mm->mm_users);
-			continue;
-		}
-		spin_unlock(&mmlist_lock);
-		down_read(&mm->mmap_sem);
-		for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
-			if (!(vma->vm_flags & VM_PURE_PRIVATE))
-				continue;
-			if (vma->vm_flags & VM_LOCKED)
-				continue;
-			ret = pps_swapoff_pgd_range(mm, vma, type);
-			if (ret == -ENOMEM)
-				break;
-		}
-		up_read(&mm->mmap_sem);
-		mmput(mm);
-		spin_lock(&mmlist_lock);
-	}
-	spin_unlock(&mmlist_lock);
-	return ret;
-}
-
@@ -780,6 +621,0 @@
-	// Let's first read all pps pages back! Note, it's one-to-one mapping.
-	retval = pps_swapoff(type);
-	if (retval == -ENOMEM) // something was wrong.
-		return -ENOMEM;
-	// Now, the remain pages are shared pages, go ahead!
-
@@ -1015 +851 @@
-	// struct list_head *p, *next;
+	struct list_head *p, *next;
@@ -1021,3 +856,0 @@
-	/*
-	 * Now, init_mm.mmlist list not only is used by SwapDevice but also is used
-	 * by PPS.
@@ -1028 +860,0 @@
-	*/
--- patch-linux/mm/memory.c	2006-12-26 15:20:02.701543280 +0800
+++ linux-2.6.16.29/mm/memory.c	2006-09-13 02:02:10.000000000 +0800
@@ -439 +439 @@
-		if (pte_swapped(pte)) {
+		if (!pte_file(pte)) {
@@ -661,2 +660,0 @@
-			// if (vma->vm_flags & VM_PURE_PRIVATE && page != ZERO_PAGE(addr))
-			// 	lru_cache_add_active(page);
@@ -682,10 +680 @@
-		if (pte_unmapped(ptent)) {
-			struct page *page;
-			page = pfn_to_page(pte_pfn(ptent));
-			pte_clear_full(mm, addr, pte, tlb->fullmm);
-			// lru_cache_add_active(page);
-			tlb_remove_page(tlb, page);
-			anon_rss--;
-			continue;
-		}
-		if (pte_swapped(ptent))
+		if (!pte_file(ptent))
@@ -1522,2 +1511 @@
-		if (!(vma->vm_flags & VM_PURE_PRIVATE))
-			lru_cache_add_active(new_page);
+		lru_cache_add_active(new_page);
@@ -1879,78 +1866,0 @@
- * New read ahead code, mainly for VM_PURE_PRIVATE only.
- */
-static void pps_swapin_readahead(swp_entry_t entry, unsigned long
addr,struct vm_area_struct *vma, pte_t* pte, pmd_t* pmd)
-{
-	struct page* page;
-	pte_t *prev, *next;
-	swp_entry_t temp;
-	spinlock_t* ptl = pte_lockptr(vma->vm_mm, pmd);
-	int swapType = swp_type(entry);
-	int swapOffset = swp_offset(entry);
-	int readahead = 1, abs;
-
-	if (!(vma->vm_flags & VM_PURE_PRIVATE)) {
-		swapin_readahead(entry, addr, vma);
-		return;
-	}
-
-	page = read_swap_cache_async(entry, vma, addr);
-	if (!page)
-		return;
-	page_cache_release(page);
-
-	// read ahead the whole series, first forward then backward.
-	while (readahead < MAX_SERIES_LENGTH) {
-		next = pte++;
-		if (next - (pte_t*) pmd >= PTRS_PER_PTE)
-			break;
-		spin_lock(ptl);
-        if (!(!pte_present(*next) && pte_swapped(*next))) {
-			spin_unlock(ptl);
-			break;
-		}
-		temp = pte_to_swp_entry(*next);
-		spin_unlock(ptl);
-		if (swp_type(temp) != swapType)
-			break;
-		abs = swp_offset(temp) - swapOffset;
-		abs = abs < 0 ? -abs : abs;
-		swapOffset = swp_offset(temp);
-		if (abs > 8)
-			// the two swap entries are too far, give up!
-			break;
-		page = read_swap_cache_async(temp, vma, addr);
-		if (!page)
-			return;
-		page_cache_release(page);
-		readahead++;
-	}
-
-	swapOffset = swp_offset(entry);
-	while (readahead < MAX_SERIES_LENGTH) {
-		prev = pte--;
-		if (prev - (pte_t*) pmd < 0)
-			break;
-		spin_lock(ptl);
-        if (!(!pte_present(*prev) && pte_swapped(*prev))) {
-			spin_unlock(ptl);
-			break;
-		}
-		temp = pte_to_swp_entry(*prev);
-		spin_unlock(ptl);
-		if (swp_type(temp) != swapType)
-			break;
-		abs = swp_offset(temp) - swapOffset;
-		abs = abs < 0 ? -abs : abs;
-		swapOffset = swp_offset(temp);
-		if (abs > 8)
-			// the two swap entries are too far, give up!
-			break;
-		page = read_swap_cache_async(temp, vma, addr);
-		if (!page)
-			return;
-		page_cache_release(page);
-		readahead++;
-	}
-}
-
-/*
@@ -1978 +1888 @@
- 		pps_swapin_readahead(entry, address, vma, page_table, pmd);
+ 		swapin_readahead(entry, address, vma);
@@ -1997,2 +1907 @@
-	if (!(vma->vm_flags & VM_PURE_PRIVATE))
-		mark_page_accessed(page);
+	mark_page_accessed(page);
@@ -2002,4 +1910,0 @@
-		if (vma->vm_flags & VM_PURE_PRIVATE) {
-			lru_cache_add_active(page);
-			mark_page_accessed(page);
-		}
@@ -2020,4 +1924,0 @@
-		if (vma->vm_flags & VM_PURE_PRIVATE) {
-			lru_cache_add_active(page);
-			mark_page_accessed(page);
-		}
@@ -2095,2 +1995,0 @@
-		if (!(vma->vm_flags & VM_PURE_PRIVATE))
-			lru_cache_add_active(page);
@@ -2097,0 +1997 @@
+		lru_cache_add_active(page);
@@ -2312,14 +2211,0 @@
-		if (pte_unmapped(entry)) {
-			BUG_ON(!(vma->vm_flags & VM_PURE_PRIVATE));
-			struct page* page = pte_page(entry);
-			pte_t temp_pte = mk_pte(page, vma->vm_page_prot);
-			pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-			if (unlikely(pte_same(*pte, entry))) {
-				page_add_new_anon_rmap(page, vma, address);
-				set_pte_at(mm, address, pte, temp_pte);
-				update_mmu_cache(vma, address, temp_pte);
-				lazy_mmu_prot_update(temp_pte);
-			}
-			pte_unmap_unlock(pte, ptl);
-			return VM_FAULT_MINOR;
-		}
@@ -2562,109 +2447,0 @@
-
-static void migrate_back_pte_range(struct mm_struct* mm, pmd_t *pmd, struct
-		vm_area_struct *vma, unsigned long addr, unsigned long end)
-{
-	struct page* page;
-	pte_t entry;
-	pte_t* pte;
-	spinlock_t* ptl;
-
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-	do {
-		if (!pte_present(*pte) && pte_unmapped(*pte)) {
-			page = pte_page(*pte);
-			entry = mk_pte(page, vma->vm_page_prot);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			set_pte_at(mm, addr, pte, entry);
-			BUG_ON(page == ZERO_PAGE(addr));
-			page_add_new_anon_rmap(page, vma, addr);
-		}
-		if (pte_present(*pte)) {
-			page = pte_page(*pte);
-			if (page == ZERO_PAGE(addr))
-				continue;
-			lru_cache_add_active(page);
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	lru_add_drain();
-}
-
-static void migrate_back_pmd_range(struct mm_struct* mm, pud_t *pud, struct
-		vm_area_struct *vma, unsigned long addr, unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		migrate_back_pte_range(mm, pmd, vma, addr, next);
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void migrate_back_pud_range(struct mm_struct* mm, pgd_t *pgd, struct
-		vm_area_struct *vma, unsigned long addr, unsigned long end)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		migrate_back_pmd_range(mm, pud, vma, addr, next);
-	} while (pud++, addr = next, addr != end);
-}
-
-// migrate all pages of pure private vma back to Linux legacy memory
management.
-static void migrate_back_legacy_linux(struct mm_struct* mm, struct
vm_area_struct* vma)
-{
-	pgd_t* pgd;
-	unsigned long next;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-
-	pgd = pgd_offset(mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		migrate_back_pud_range(mm, pgd, vma, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-LIST_HEAD(pps_head);
-LIST_HEAD(pps_head_buddy);
-
-DEFINE_SPINLOCK(pps_lock);
-
-void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma)
-{
-	int condition = VM_READ | VM_WRITE | VM_EXEC | \
-		 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \
-		 VM_GROWSDOWN | VM_GROWSUP | \
-		 VM_LOCKED | VM_SEQ_READ | VM_RAND_READ | VM_DONTCOPY | VM_ACCOUNT;
-	if (!(vma->vm_flags & ~condition) && vma->vm_file == NULL) {
-		vma->vm_flags |= VM_PURE_PRIVATE;
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			if (list_empty(&mm->mmlist))
-				list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-	}
-}
-
-void leave_pps(struct vm_area_struct* vma, int migrate_flag)
-{
-	struct mm_struct* mm = vma->vm_mm;
-
-	if (vma->vm_flags & VM_PURE_PRIVATE) {
-		vma->vm_flags &= ~VM_PURE_PRIVATE;
-		if (migrate_flag)
-			migrate_back_legacy_linux(mm, vma);
-	}
-}
--- patch-linux/kernel/timer.c	2006-12-26 15:20:02.688545256 +0800
+++ linux-2.6.16.29/kernel/timer.c	2006-09-13 02:02:10.000000000 +0800
@@ -845,2 +844,0 @@
-
-	timer_flush_tlb_tasks(NULL);
--- patch-linux/kernel/fork.c	2006-12-26 15:20:02.688545256 +0800
+++ linux-2.6.16.29/kernel/fork.c	2006-09-13 02:02:10.000000000 +0800
@@ -232 +231,0 @@
-		leave_pps(mpnt, 1);
--- patch-linux/Documentation/vm_pps.txt	2006-12-26 15:45:33.203883456 +0800
+++ linux-2.6.16.29/Documentation/vm_pps.txt	1970-01-01 08:00:00.000000000 +0800
@@ -1,190 +0,0 @@
-                         Pure Private Page System (pps)
-                     Copyright by Yunfeng Zhang on GFDL 1.2
-                              zyf.zeroos@gmail.com
-                              December 24-26, 2006
-
-// Purpose <([{
-The file is used to document the idea which is published firstly at
-http://www.ussg.iu.edu/hypermail/linux/kernel/0607.2/0451.html, as a part of my
-OS -- main page http://blog.chinaunix.net/u/21764/index.php.  You can find the
-overview of the idea in section <How to Reclaim Pages more Efficiently> and how
-I patch it into Linux 2.6.16.29 in section <Pure Private Page System -- pps>.
-// }])>
-
-// How to Reclaim Pages more Efficiently <([{
-Good idea originates from overall design and management ability, when you look
-down from a manager view, you will relief yourself from disordered code and
-find some problem immediately.
-
-OK! to modern OS, its memory subsystem can be divided into three layers
-1) Space layer (InodeSpace, UserSpace and CoreSpace).
-2) VMA layer (PrivateVMA and SharedVMA, memory architecture-independent layer).
-3) PTE and page layer (architecture-dependent).
-
-Since the 2nd layer assembles the much statistic of page-acess information, so
-it's nature that swap subsystem should be deployed and implemented on the 2nd
-layer.
-
-Undoubtedly, there are some virtues about it
-1) SwapDaemon can collect the statistic of process acessing pages and by it
-   unmaps ptes, SMP specially benefits from it for we can use flush_tlb_range
-   to relief frequently TLB IPI interrupt.
-2) Page-fault can issue better readahead requests since history data shows all
-   related pages have conglomerating affinity.
-3) It's conformable to POSIX madvise API family.
-
-Unfortunately, Linux 2.6.16.29 swap subsystem is based on the 3rd layer -- a
-system surrounding page.
-
-I've commited a patch to it, see section <Pure Private Page System --
pps>. Note, it ISN'T perfect.
-// }])>
-
-// Pure Private Page System -- pps  <([{
-Current Linux is just like a monster and still growing, even its swap subsystem
-...
-
-As I've referred in previous section, perfectly applying my idea need to unroot
-page-surrounging swap subsystem to migrate it on VMA, but a huge gap has
-defeated me -- active_list and inactive_list. In fact, you can find
-lru_add_active anywhere ... It's IMPOSSIBLE to me to complete it only by
-myself. It's also the difference between my design and Linux, in my OS, page is
-the charge of its new owner totally, however, to Linux, page management system
-is still tracing it by PG_active flag.
-
-So I conceive another solution:) That is, set up an independent page-recycle
-system rooted on Linux legacy page system -- pps, intercept all private pages
-belonging to PrivateVMA to pps, then use my pps to cycle them.  By the way, the
-whole job should be consist of two parts, here is the first --
-PrivateVMA-oriented (PPS), other is SharedVMA-oriented (should be called SPS)
-scheduled in future. Of course, if all are done, it will empty Linux legacy
-page system.
-
-In fact, pps is centered on how to better collect and unmap process private
-pages in SwapDaemon mm/vmscan.c:shrink_private_vma, the whole process is
-divided into six stages -- <Stage Definition>. Other sections show the remain
-aspects of pps
-1) <Data Definition> is basic data definition.
-2) <Concurrent racers of Shrinking pps> is focused on synchronization.
-3) <Private Page Lifecycle of pps> -- how private pages enter in/go off pps.
-4) <VMA Lifecycle of pps> which VMA is belonging to pps.
-
-PPS uses init_mm.mm_list list to enumerate all swappable UserSpace.
-
-I'm also glad to highlight my a new idea -- dftlb which is described in
-section <Delay to Flush TLB>.
-// }])>
-
-// Delay to Flush TLB (dftlb) <([{
-Delay to flush TLB is instroduced by me to enhance flushing TLB efficiency, in
-brief, when we want to unmap a page from the page table of a process, why we
-send TLB IPI to other CPUs immediately, since every CPU has timer interrupt, we
-can insert flushing tasks into timer interrupt route to implement a
-free-charged TLB flushing.
-
-The trick is implemented in
-1) TLB flushing task is added in fill_in_tlb_task of mm/vmscan.c.
-2) timer_flush_tlb_tasks of kernel/timer.c is used by other CPUs to execute
-   flushing tasks.
-3) all data are defined in include/linux/mm.h.
-
-The restriction of dftlb. Following conditions must be met
-1) atomic cmpxchg instruction.
-2) atomically set the access bit after they touch a pte firstly.
-3) To some architectures, vma parameter of flush_tlb_range is maybe important,
-   if it's true, since it's possible that the vma of a TLB flushing task has
-   gone when a CPU starts to execute the task in timer interrupt, so don't use
-   dftlb.
-combine stage 1 with stage 2, and send IPI immediately in fill_in_tlb_tasks.
-// }])>
-
-// Stage Definition <([{
-The whole process of private page page-out is divided into six stages, as
-showed in shrink_pvma_scan_ptes of mm/vmscan.c
-1) PTE to untouched PTE (access bit is cleared), append flushing
tasks to dftlb.
-2) Convert untouched PTE to UnmappedPTE.
-3) Link SwapEntry to every UnmappedPTE.
-4) Synchronize the page of a UnmappedPTE with its physical swap page.
-5) Reclaimed the page and shift UnmappedPTE to SwappedPTE.
-6) SwappedPTE stage.
-// }])>
-
-// Data Definition <([{
-New VMA flag (VM_PURE_PRIVATE) is appended into VMA in include/linux/mm.h.
-
-New PTE type (UnmappedPTE) is appended into PTE system in
-include/asm-i386/pgtable.h.
-// }])>
-
-// Concurrent Racers of Shrinking pps <([{
-shrink_private_vma of mm/vmscan.c uses init_mm.mmlist to scan all swappable
-mm_struct instances, during the process of scaning and reclaiming process, it
-readlockes every mm_struct object, which brings some potential concurrent
-racers
-1) mm/swapfile.c    pps_swapoff (swapoff API).
-2) mm/memory.c  do_wp_page, handle_pte_fault::unmapped_pte, do_anonymous_page
-   (page-fault).
-// }])>
-
-// Private Page Lifecycle of pps <([{
-All pages belonging to pps are called as pure private page.
-
-IN (NOTE, when a pure private page enters into pps, it's also trimmed from
-Linux legacy page system by commeting lru_cache_add_active clause)
-1) fs/exec.c	install_arg_pages	(argument pages).
-2) mm/memory	do_anonymous_page, do_wp_page, do_swap_page	(page fault).
-3) mm/swap_state.c	read_swap_cache_async	(swap pages).
-
-OUT
-1) mm/vmscan.c  shrink_pvma_scan_ptes   (stage 6, reclaim a private page).
-2) mm/memory    zap_pte_range   (free a page).
-3) kernel/fork.c	dup_mmap	(if someone uses fork, migrate all pps pages
-   back to let Linux legacy page system manage them).
-
-When a pure private page is in pps, it can be visited simultaneously by
-page-fault and SwapDaemon.
-// }])>
-
-// VMA Lifecycle of pps <([{
-When a PrivateVMA enters into pps, it's or-ed a new flag -- VM_PURE_PRIVATE,
-the flag is used in the shrink_private_vma of mm/vmscan.c.  Other fields are
-reserved untouch state.
-
-IN.
-1) fs/exec.c	setup_arg_pages	(StackVMA).
-2) mm/mmap.c	do_mmap_pgoff, do_brk	(DataVMA).
-3) mm/mmap.c	split_vma, copy_vma	(in some cases, we need copy a VMA from an
-   exist VMA).
-
-OUT.
-1) kernel/fork.c	dup_mmap	(if someone uses fork, return the vma back to
-   Linux legacy system).
-2) mm/mmap.c	remove_vma, vma_adjust	(destroy VMA).
-3) mm/mmap.c	do_mmap_pgoff	(delete VMA when some errors occur).
-// }])>
-
-// Postscript <([{
-Note, some circumstances aren't tested due to hardware restriction e.g. SMP
-dftlb.
-
-Here are some improvements about pps
-1) In fact, I recommend one-to-one private model -- PrivateVMA, (PTE,
-   UnmappedPTE) and PrivatePage which is described in my OS and the aboved
-   hyperlink of Linux kernel mail list. So it's a compromise to use Linux
-   legacy SwapCache in my pps.
-2) SwapCache should provide more flexible interfaces, shrink_pvma_scan_ptes
-   need allocate swap entries in batch, exactly, allocate a batch of fake
-   continual swap entries, see mm/pps_swapin_readahead.
-3) pps statistic entry in /proc/meminfo.
-4) a better arithmetic to pick mm out to scan and shrink in shrink_private_vma.
-5) It's better to execute the first 2 stages when system is idle, current
-   SwapDaemon only is activated when free pages are low.
-6) A scanning count should be added into mm_struct, so when the count is
-   becoming enough old to open stage 3 and 4.
-
-I'm still working on improvement 4, 5 and 6 to find out how to maximum the
-performance of swap subsystem.
-
-If Linux kernel group can't make a schedule to re-write their memory code,
-however, pps maybe is the best solution until now.
-// }])>
-// vim: foldmarker=<([{,}])> foldmethod=marker et
--- patch-linux/include/linux/mm.h	2006-12-26 15:20:02.685545712 +0800
+++ linux-2.6.16.29/include/linux/mm.h	2006-09-13 02:02:10.000000000 +0800
@@ -169,2 +168,0 @@
-#define VM_PURE_PRIVATE	0x04000000	/* Is the vma is only belonging to a mm,
-									   see more from Documentation/vm_pps.txt */
@@ -1061,18 +1058,0 @@
-/* vmscan.c::delay flush TLB */
-struct delay_tlb_task_t
-{
-	struct mm_struct* mm; //__attribute__ ((section(".pure_private_vma.data")));
-	cpumask_t cpu_mask;
-	struct vm_area_struct* vma[32];
-	unsigned long start[32];
-	unsigned long end[32];
-};
-extern struct delay_tlb_task_t delay_tlb_tasks[32];
-
-// The prototype of the function is fit with the "func" of "int
smp_call_function (void (*func) (void *info), void *info, int retry,
int wait);" of include/linux/smp.h of 2.6.16.29. Call it with NULL.
-void timer_flush_tlb_tasks(void* data /* = NULL */);
-
-void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma);
-void leave_pps(struct vm_area_struct* vma, int migrate_flag);
-
-#define MAX_SERIES_LENGTH 8
--- patch-linux/include/linux/swapops.h	2006-12-26 15:20:02.686545560 +0800
+++ linux-2.6.16.29/include/linux/swapops.h	2006-09-13 02:02:10.000000000 +0800
@@ -53 +53 @@
-	BUG_ON(!pte_swapped(pte));
+	BUG_ON(pte_file(pte));
@@ -67 +67 @@
-	BUG_ON(!pte_swapped(__swp_entry_to_pte(arch_entry)));
+	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
--- patch-linux/include/asm-i386/pgtable-2level.h	2006-12-26
15:20:02.687545408 +0800
+++ linux-2.6.16.29/include/asm-i386/pgtable-2level.h	2006-09-13
02:02:10.000000000 +0800
@@ -49 +49 @@
- * Bits 0, 5, 6 and 7 are taken, split up the 28 bits of offset
+ * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
@@ -52 +52 @@
-#define PTE_FILE_MAX_BITS	28
+#define PTE_FILE_MAX_BITS	29
@@ -55 +55 @@
-	((((pte).pte_low >> 1) & 0xf ) + (((pte).pte_low >> 8) << 4 ))
+	((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
@@ -58 +58 @@
-	((pte_t) { (((off) & 0xf) << 1) + (((off) >> 4) << 8) + _PAGE_FILE })
+	((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
@@ -61 +61 @@
-#define __swp_type(x)			(((x).val >> 1) & 0xf)
+#define __swp_type(x)			(((x).val >> 1) & 0x1f)
@@ -63 +63 @@
-#define __swp_entry(type, offset)	((swp_entry_t) { ((type & 0xf) <<
1) | ((offset) << 8) | _PAGE_SWAPPED })
+#define __swp_entry(type, offset)	((swp_entry_t) { ((type) << 1) |
((offset) << 8) })
--- patch-linux/include/asm-i386/pgtable.h	2006-12-26 15:20:02.687545408 +0800
+++ linux-2.6.16.29/include/asm-i386/pgtable.h	2006-09-13
02:02:10.000000000 +0800
@@ -124,5 +124 @@
-#define _PAGE_UNMAPPED	0x020	/* a special PTE type, hold its page reference
-								   even it's unmapped, see more from
-								   Documentation/vm_pps.txt. */
-#define _PAGE_SWAPPED 0x040 /* swapped PTE. */
-#define _PAGE_FILE	0x060	/* nonlinear file mapping, saved PTE; */
+#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
@@ -235,3 +231 @@
-static inline int pte_unmapped(pte_t pte)	{ return ((pte).pte_low &
0x60) == _PAGE_UNMAPPED; }
-static inline int pte_swapped(pte_t pte)	{ return ((pte).pte_low &
0x60) == _PAGE_SWAPPED; }
-static inline int pte_file(pte_t pte)		{ return ((pte).pte_low &
0x60) == _PAGE_FILE; }
+static inline int pte_file(pte_t pte)		{ return (pte).pte_low & _PAGE_FILE; }

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-26  8:18 [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem yunfeng zhang
@ 2006-12-26  9:03 ` Zhou Yingchao
  2006-12-27  3:33   ` yunfeng zhang
  2006-12-27  3:38 ` yunfeng zhang
  2006-12-27 18:44 ` Pavel Machek
  2 siblings, 1 reply; 14+ messages in thread
From: Zhou Yingchao @ 2006-12-26  9:03 UTC (permalink / raw)
  To: yunfeng zhang; +Cc: linux-kernel

2006/12/26, yunfeng zhang <zyf.zeroos@gmail.com>:
> In the patch, I introduce a new page system -- pps which can improve
> Linux swap subsystem performance, you can find a new document in
> Documentation/vm_pps.txt. In brief, swap subsystem should scan/reclaim
> pages on VMA instead of zone::active list ...
   The early swap subsystem was really scan/reclaim based on mm/vma,
but now it changes to pages on active/inactive list.  Perhaps you are
not following a right direction.
-- 
Yingchao Zhou
***********************************************
 Institute Of Computing Technology
 Chinese Academy of Sciences
***********************************************

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-26  9:03 ` Zhou Yingchao
@ 2006-12-27  3:33   ` yunfeng zhang
  2006-12-30  5:50     ` Zhou Yingchao
  0 siblings, 1 reply; 14+ messages in thread
From: yunfeng zhang @ 2006-12-27  3:33 UTC (permalink / raw)
  To: linux-kernel; +Cc: Zhou Yingchao

To multiple address space, multiple memory inode architecture, we can introduce
a new core object -- section which has several features
1) Section is used as the atomic unit to contain the pages of a VMA residing in
   the memory inode of the section.
2) When page migration occurs among different memory inodes, new secion should
   be set up to trace the pages.
3) Section can be scanned by the SwapDaemon of its memory inode directely.
4) All sections of a VMA are excluded with each other not overlayed.
5) VMA is made up of sections totally, but its section objects scatter on memory
   inodes.
So to the architecture, we can deploy swap subsystem on an
architecture-independent layer by section and scan pages batchly.

The idea issued by me is whether swap subsystem should be deployed on layer 2 or
layer 3 which is described in Documentation/vm_pps.txt of my patch. To multiple
memory inode architecture, the special memory model should be encapsulated on
layer 3 (architecture-dependent), I think.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-26  8:18 [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem yunfeng zhang
  2006-12-26  9:03 ` Zhou Yingchao
@ 2006-12-27  3:38 ` yunfeng zhang
  2006-12-27 18:44 ` Pavel Machek
  2 siblings, 0 replies; 14+ messages in thread
From: yunfeng zhang @ 2006-12-27  3:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: torvalds

The job listed in Documentation/vm_pps.txt of my patch is too heavy to me, so
I'm appreciate that Linux kernel group can arrange a schedule to help me.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-26  8:18 [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem yunfeng zhang
  2006-12-26  9:03 ` Zhou Yingchao
  2006-12-27  3:38 ` yunfeng zhang
@ 2006-12-27 18:44 ` Pavel Machek
  2006-12-29  6:45   ` yunfeng zhang
  2 siblings, 1 reply; 14+ messages in thread
From: Pavel Machek @ 2006-12-27 18:44 UTC (permalink / raw)
  To: yunfeng zhang; +Cc: linux-kernel

On Tue 26-12-06 16:18:32, yunfeng zhang wrote:
> In the patch, I introduce a new page system -- pps which 
> can improve
> Linux swap subsystem performance, you can find a new 
> document in
> Documentation/vm_pps.txt. In brief, swap subsystem 
> should scan/reclaim
> pages on VMA instead of zone::active list ...

Is it april's fools days?

Read Doc*/SubmittingPatches.
							Pavel

> 
> --- patch-linux/fs/exec.c	2006-12-26 
> 15:20:02.683546016 +0800
> +++ linux-2.6.16.29/fs/exec.c	2006-09-13 
> 02:02:10.000000000 +0800
> @@ -323,0 +324 @@
> +	lru_cache_add_active(page);
> @@ -438 +438,0 @@
> -		enter_pps(mm, mpnt);

						Pavel
-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-27 18:44 ` Pavel Machek
@ 2006-12-29  6:45   ` yunfeng zhang
  2006-12-29  9:15     ` Pavel Machek
  0 siblings, 1 reply; 14+ messages in thread
From: yunfeng zhang @ 2006-12-29  6:45 UTC (permalink / raw)
  To: Pavel Machek; +Cc: linux-kernel, torvalds

I've re-published my work on quilt, sorry.


Index: linux-2.6.16.29/Documentation/vm_pps.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16.29/Documentation/vm_pps.txt	2006-12-29 14:36:36.507332384 +0800
@@ -0,0 +1,192 @@
+                         Pure Private Page System (pps)
+                     Copyright by Yunfeng Zhang on GFDL 1.2
+                              zyf.zeroos@gmail.com
+                              December 24-26, 2006
+
+// Purpose <([{
+The file is used to document the idea which is published firstly at
+http://www.ussg.iu.edu/hypermail/linux/kernel/0607.2/0451.html, as a part of my
+OS -- main page http://blog.chinaunix.net/u/21764/index.php. In brief, a patch
+of the document to enchance the performance of Linux swap subsystem. You can
+find the overview of the idea in section <How to Reclaim Pages more
+Efficiently> and how I patch it into Linux 2.6.16.29 in section <Pure Private
+Page System -- pps>.
+// }])>
+
+// How to Reclaim Pages more Efficiently <([{
+Good idea originates from overall design and management ability, when you look
+down from a manager view, you will relief yourself from disordered code and
+find some problem immediately.
+
+OK! to modern OS, its memory subsystem can be divided into three layers
+1) Space layer (InodeSpace, UserSpace and CoreSpace).
+2) VMA layer (PrivateVMA and SharedVMA, memory architecture-independent layer).
+3) PTE and page layer (architecture-dependent).
+
+Since the 2nd layer assembles the much statistic of page-acess information, so
+it's nature that swap subsystem should be deployed and implemented on the 2nd
+layer.
+
+Undoubtedly, there are some virtues about it
+1) SwapDaemon can collect the statistic of process acessing pages and by it
+   unmaps ptes, SMP specially benefits from it for we can use flush_tlb_range
+   to unmap ptes batchly rather than frequently TLB IPI interrupt per a page in
+   current Linux legacy swap subsystem.
+2) Page-fault can issue better readahead requests since history data shows all
+   related pages have conglomerating affinity. In contrast, Linux page-fault
+   readaheads the pages relative to the SwapSpace position of current
+   page-fault page.
+3) It's conformable to POSIX madvise API family.
+
+Unfortunately, Linux 2.6.16.29 swap subsystem is based on the 3rd layer -- a
+system on zone::active_list/inactive_list.
+
+I've finished a patch, see section <Pure Private Page System -- pps>.
Note, it ISN'T perfect.
+// }])>
+
+// Pure Private Page System -- pps  <([{
+As I've referred in previous section, perfectly applying my idea need to unroot
+page-surrounging swap subsystem to migrate it on VMA, but a huge gap has
+defeated me -- active_list and inactive_list. In fact, you can find
+lru_add_active code anywhere ... It's IMPOSSIBLE to me to complete it only by
+myself. It's also the difference between my design and Linux, in my OS, page is
+the charge of its new owner totally, however, to Linux, page management system
+is still tracing it by PG_active flag.
+
+So I conceive another solution:) That is, set up an independent page-recycle
+system rooted on Linux legacy page system -- pps, intercept all private pages
+belonging to PrivateVMA to pps, then use my pps to cycle them.  By the way, the
+whole job should be consist of two parts, here is the first --
+PrivateVMA-oriented (PPS), other is SharedVMA-oriented (should be called SPS)
+scheduled in future. Of course, if all are done, it will empty Linux legacy
+page system.
+
+In fact, pps is centered on how to better collect and unmap process private
+pages in SwapDaemon mm/vmscan.c:shrink_private_vma, the whole process is
+divided into six stages -- <Stage Definition>. Other sections show the remain
+aspects of pps
+1) <Data Definition> is basic data definition.
+2) <Concurrent racers of Shrinking pps> is focused on synchronization.
+3) <Private Page Lifecycle of pps> -- how private pages enter in/go off pps.
+4) <VMA Lifecycle of pps> which VMA is belonging to pps.
+
+PPS uses init_mm.mm_list list to enumerate all swappable UserSpace.
+
+I'm also glad to highlight my a new idea -- dftlb which is described in
+section <Delay to Flush TLB>.
+// }])>
+
+// Delay to Flush TLB (dftlb) <([{
+Delay to flush TLB is instroduced by me to enhance flushing TLB efficiency, in
+brief, when we want to unmap a page from the page table of a process, why we
+send TLB IPI to other CPUs immediately, since every CPU has timer interrupt, we
+can insert flushing tasks into timer interrupt route to implement a
+free-charged TLB flushing.
+
+The trick is implemented in
+1) TLB flushing task is added in fill_in_tlb_task of mm/vmscan.c.
+2) timer_flush_tlb_tasks of kernel/timer.c is used by other CPUs to execute
+   flushing tasks.
+3) all data are defined in include/linux/mm.h.
+
+The restriction of dftlb. Following conditions must be met
+1) atomic cmpxchg instruction.
+2) atomically set the access bit after they touch a pte firstly.
+3) To some architectures, vma parameter of flush_tlb_range is maybe important,
+   if it's true, since it's possible that the vma of a TLB flushing task has
+   gone when a CPU starts to execute the task in timer interrupt, so don't use
+   dftlb.
+combine stage 1 with stage 2, and send IPI immediately in fill_in_tlb_tasks.
+// }])>
+
+// Stage Definition <([{
+The whole process of private page page-out is divided into six stages, as
+showed in shrink_pvma_scan_ptes of mm/vmscan.c
+1) PTE to untouched PTE (access bit is cleared), append flushing
tasks to dftlb.
+2) Convert untouched PTE to UnmappedPTE.
+3) Link SwapEntry to every UnmappedPTE.
+4) Synchronize the page of a UnmappedPTE with its physical swap page.
+5) Reclaimed the page and shift UnmappedPTE to SwappedPTE.
+6) SwappedPTE stage.
+// }])>
+
+// Data Definition <([{
+New VMA flag (VM_PURE_PRIVATE) is appended into VMA in include/linux/mm.h.
+
+New PTE type (UnmappedPTE) is appended into PTE system in
+include/asm-i386/pgtable.h.
+// }])>
+
+// Concurrent Racers of Shrinking pps <([{
+shrink_private_vma of mm/vmscan.c uses init_mm.mmlist to scan all swappable
+mm_struct instances, during the process of scaning and reclaiming process, it
+readlockes every mm_struct object, which brings some potential concurrent
+racers
+1) mm/swapfile.c    pps_swapoff (swapoff API).
+2) mm/memory.c  do_wp_page, handle_pte_fault::unmapped_pte, do_anonymous_page
+   (page-fault).
+// }])>
+
+// Private Page Lifecycle of pps <([{
+All pages belonging to pps are called as pure private page.
+
+IN (NOTE, when a pure private page enters into pps, it's also trimmed from
+Linux legacy page system by commeting lru_cache_add_active clause)
+1) fs/exec.c	install_arg_pages	(argument pages).
+2) mm/memory	do_anonymous_page, do_wp_page, do_swap_page	(page fault).
+3) mm/swap_state.c	read_swap_cache_async	(swap pages).
+
+OUT
+1) mm/vmscan.c  shrink_pvma_scan_ptes   (stage 6, reclaim a private page).
+2) mm/memory    zap_pte_range   (free a page).
+3) kernel/fork.c	dup_mmap	(if someone uses fork, migrate all pps pages
+   back to let Linux legacy page system manage them).
+
+When a pure private page is in pps, it can be visited simultaneously by
+page-fault and SwapDaemon.
+// }])>
+
+// VMA Lifecycle of pps <([{
+When a PrivateVMA enters into pps, it's or-ed a new flag -- VM_PURE_PRIVATE,
+the flag is used in the shrink_private_vma of mm/vmscan.c.  Other fields are
+left untouched.
+
+IN.
+1) fs/exec.c	setup_arg_pages	(StackVMA).
+2) mm/mmap.c	do_mmap_pgoff, do_brk	(DataVMA).
+3) mm/mmap.c	split_vma, copy_vma	(in some cases, we need copy a VMA from an
+   exist VMA).
+
+OUT.
+1) kernel/fork.c	dup_mmap	(if someone uses fork, return the vma back to
+   Linux legacy system).
+2) mm/mmap.c	remove_vma, vma_adjust	(destroy VMA).
+3) mm/mmap.c	do_mmap_pgoff	(delete VMA when some errors occur).
+// }])>
+
+// Postscript <([{
+Note, some circumstances aren't tested due to hardware restriction e.g. SMP
+dftlb.
+
+Here are some improvements about pps
+1) In fact, I recommend one-to-one private model -- PrivateVMA, (PTE,
+   UnmappedPTE) and PrivatePage (SwapPage) which is described in my OS and the
+   aboved hyperlink of Linux kernel mail list. So it's a compromise to use
+   Linux legacy SwapCache in my pps.
+2) SwapCache should provide more flexible interfaces, shrink_pvma_scan_ptes
+   need allocate swap entries in batch, exactly, allocate a batch of fake
+   continual swap entries, see mm/pps_swapin_readahead.
+3) pps statistic entry in /proc/meminfo.
+4) a better arithmetic to pick mm out to scan and shrink in shrink_private_vma.
+5) It's better to execute the first 2 stages when system is idle, current
+   SwapDaemon only is activated when free pages are low.
+6) A scanning count should be added into mm_struct, so when the count is
+   becoming enough old to open stage 3 and 4.
+
+I'm still working on improvement 4, 5 and 6 to find out how to maximum the
+performance of swap subsystem.
+
+If Linux kernel group can't make a schedule to re-write their memory code,
+however, pps maybe is the best solution until now.
+// }])>
+// vim: foldmarker=<([{,}])> foldmethod=marker et
Index: linux-2.6.16.29/fs/exec.c
===================================================================
--- linux-2.6.16.29.orig/fs/exec.c	2006-12-29 13:56:51.000000000 +0800
+++ linux-2.6.16.29/fs/exec.c	2006-12-29 13:57:18.000000000 +0800
@@ -321,7 +321,6 @@
 		goto out;
 	}
 	inc_mm_counter(mm, anon_rss);
-	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_new_anon_rmap(page, vma, address);
@@ -436,6 +435,7 @@
 			kmem_cache_free(vm_area_cachep, mpnt);
 			return ret;
 		}
+		enter_pps(mm, mpnt);
 		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
 	}

Index: linux-2.6.16.29/include/asm-i386/pgtable-2level.h
===================================================================
--- linux-2.6.16.29.orig/include/asm-i386/pgtable-2level.h	2006-12-29
13:56:53.000000000 +0800
+++ linux-2.6.16.29/include/asm-i386/pgtable-2level.h	2006-12-29
13:57:19.612186872 +0800
@@ -46,21 +46,21 @@
 }

 /*
- * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
+ * Bits 0, 5, 6 and 7 are taken, split up the 28 bits of offset
  * into this range:
  */
-#define PTE_FILE_MAX_BITS	29
+#define PTE_FILE_MAX_BITS	28

 #define pte_to_pgoff(pte) \
-	((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
+	((((pte).pte_low >> 1) & 0xf ) + (((pte).pte_low >> 8) << 4 ))

 #define pgoff_to_pte(off) \
-	((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
+	((pte_t) { (((off) & 0xf) << 1) + (((off) >> 4) << 8) + _PAGE_FILE })

 /* Encode and de-code a swap entry */
-#define __swp_type(x)			(((x).val >> 1) & 0x1f)
+#define __swp_type(x)			(((x).val >> 1) & 0xf)
 #define __swp_offset(x)			((x).val >> 8)
-#define __swp_entry(type, offset)	((swp_entry_t) { ((type) << 1) |
((offset) << 8) })
+#define __swp_entry(type, offset)	((swp_entry_t) { ((type & 0xf) <<
1) | ((offset) << 8) | _PAGE_SWAPPED })
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { (pte).pte_low })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })

Index: linux-2.6.16.29/include/asm-i386/pgtable.h
===================================================================
--- linux-2.6.16.29.orig/include/asm-i386/pgtable.h	2006-12-29
13:56:53.000000000 +0800
+++ linux-2.6.16.29/include/asm-i386/pgtable.h	2006-12-29
13:57:19.846151304 +0800
@@ -121,7 +121,11 @@
 #define _PAGE_UNUSED3	0x800

 /* If _PAGE_PRESENT is clear, we use these: */
-#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_UNMAPPED	0x020	/* a special PTE type, hold its page reference
+								   even it's unmapped, see more from
+								   Documentation/vm_pps.txt. */
+#define _PAGE_SWAPPED 0x040 /* swapped PTE. */
+#define _PAGE_FILE	0x060	/* nonlinear file mapping, saved PTE; */
 #define _PAGE_PROTNONE	0x080	/* if the user mapped it with PROT_NONE;
 				   pte_present gives true */
 #ifdef CONFIG_X86_PAE
@@ -228,7 +232,9 @@
 /*
  * The following only works if pte_present() is not true.
  */
-static inline int pte_file(pte_t pte)		{ return (pte).pte_low & _PAGE_FILE; }
+static inline int pte_unmapped(pte_t pte)	{ return ((pte).pte_low &
0x60) == _PAGE_UNMAPPED; }
+static inline int pte_swapped(pte_t pte)	{ return ((pte).pte_low &
0x60) == _PAGE_SWAPPED; }
+static inline int pte_file(pte_t pte)		{ return ((pte).pte_low &
0x60) == _PAGE_FILE; }

 static inline pte_t pte_rdprotect(pte_t pte)	{ (pte).pte_low &=
~_PAGE_USER; return pte; }
 static inline pte_t pte_exprotect(pte_t pte)	{ (pte).pte_low &=
~_PAGE_USER; return pte; }
Index: linux-2.6.16.29/include/linux/mm.h
===================================================================
--- linux-2.6.16.29.orig/include/linux/mm.h	2006-12-29 13:56:53.000000000 +0800
+++ linux-2.6.16.29/include/linux/mm.h	2006-12-29 13:57:19.098265000 +0800
@@ -166,6 +166,8 @@
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 #define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu mmap) */
 #define VM_INSERTPAGE	0x02000000	/* The vma has had
"vm_insert_page()" done on it */
+#define VM_PURE_PRIVATE	0x04000000	/* Is the vma is only belonging to a mm,
+									   see more from Documentation/vm_pps.txt */

 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -1056,5 +1058,25 @@
 extern int randomize_va_space;
 #endif

+/* vmscan.c::delay flush TLB */
+struct delay_tlb_task_t
+{
+	struct mm_struct* mm;
+	cpumask_t cpu_mask;
+	struct vm_area_struct* vma[32];
+	unsigned long start[32];
+	unsigned long end[32];
+};
+extern struct delay_tlb_task_t delay_tlb_tasks[32];
+
+// The prototype of the function is fit with the "func" of "int
+// smp_call_function (void (*func) (void *info), void *info, int retry, int
+// wait);" of include/linux/smp.h of 2.6.16.29. Call it with NULL.
+void timer_flush_tlb_tasks(void* data /* = NULL */);
+
+void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma);
+void leave_pps(struct vm_area_struct* vma, int migrate_flag);
+
+#define MAX_SERIES_LENGTH 8
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
Index: linux-2.6.16.29/include/linux/swapops.h
===================================================================
--- linux-2.6.16.29.orig/include/linux/swapops.h	2006-12-29
13:56:53.000000000 +0800
+++ linux-2.6.16.29/include/linux/swapops.h	2006-12-29 13:57:19.000000000 +0800
@@ -50,7 +50,7 @@
 {
 	swp_entry_t arch_entry;

-	BUG_ON(pte_file(pte));
+	BUG_ON(!pte_swapped(pte));
 	arch_entry = __pte_to_swp_entry(pte);
 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
@@ -64,6 +64,6 @@
 	swp_entry_t arch_entry;

 	arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
-	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
+	BUG_ON(!pte_swapped(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
Index: linux-2.6.16.29/kernel/fork.c
===================================================================
--- linux-2.6.16.29.orig/kernel/fork.c	2006-12-29 13:56:52.000000000 +0800
+++ linux-2.6.16.29/kernel/fork.c	2006-12-29 13:57:20.000000000 +0800
@@ -229,6 +229,7 @@
 		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
+		leave_pps(mpnt, 1);
 		*tmp = *mpnt;
 		pol = mpol_copy(vma_policy(mpnt));
 		retval = PTR_ERR(pol);
Index: linux-2.6.16.29/kernel/timer.c
===================================================================
--- linux-2.6.16.29.orig/kernel/timer.c	2006-12-29 13:56:52.000000000 +0800
+++ linux-2.6.16.29/kernel/timer.c	2006-12-29 13:57:20.000000000 +0800
@@ -842,6 +842,8 @@
 		rcu_check_callbacks(cpu, user_tick);
 	scheduler_tick();
  	run_posix_cpu_timers(p);
+
+	timer_flush_tlb_tasks(NULL);
 }

 /*
Index: linux-2.6.16.29/mm/fremap.c
===================================================================
--- linux-2.6.16.29.orig/mm/fremap.c	2006-12-29 13:56:51.000000000 +0800
+++ linux-2.6.16.29/mm/fremap.c	2006-12-29 13:57:21.000000000 +0800
@@ -37,7 +37,7 @@
 			page_cache_release(page);
 		}
 	} else {
-		if (!pte_file(pte))
+		if (pte_swapped(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
Index: linux-2.6.16.29/mm/memory.c
===================================================================
--- linux-2.6.16.29.orig/mm/memory.c	2006-12-29 13:56:52.000000000 +0800
+++ linux-2.6.16.29/mm/memory.c	2006-12-29 13:57:51.000000000 +0800
@@ -436,7 +436,7 @@

 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
-		if (!pte_file(pte)) {
+		if (pte_swapped(pte)) {
 			swap_duplicate(pte_to_swp_entry(pte));
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
@@ -658,6 +658,8 @@
 						addr) != page->index)
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
+			// if (vma->vm_flags & VM_PURE_PRIVATE && page != ZERO_PAGE(addr))
+			// 	lru_cache_add_active(page);
 			if (PageAnon(page))
 				anon_rss--;
 			else {
@@ -677,7 +679,16 @@
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(ptent))
+		if (pte_unmapped(ptent)) {
+			struct page *page;
+			page = pfn_to_page(pte_pfn(ptent));
+			pte_clear_full(mm, addr, pte, tlb->fullmm);
+			// lru_cache_add_active(page);
+			tlb_remove_page(tlb, page);
+			anon_rss--;
+			continue;
+		}
+		if (pte_swapped(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
 		pte_clear_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
@@ -1508,7 +1519,8 @@
 		ptep_establish(vma, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
-		lru_cache_add_active(new_page);
+		if (!(vma->vm_flags & VM_PURE_PRIVATE))
+			lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);

 		/* Free the old page.. */
@@ -1864,6 +1876,84 @@
 }

 /*
+ * New read ahead code, mainly for VM_PURE_PRIVATE only.
+ */
+static void pps_swapin_readahead(swp_entry_t entry, unsigned long
addr,struct vm_area_struct *vma, pte_t* pte, pmd_t* pmd)
+{
+	struct page* page;
+	pte_t *prev, *next;
+	swp_entry_t temp;
+	spinlock_t* ptl = pte_lockptr(vma->vm_mm, pmd);
+	int swapType = swp_type(entry);
+	int swapOffset = swp_offset(entry);
+	int readahead = 1, abs;
+
+	if (!(vma->vm_flags & VM_PURE_PRIVATE)) {
+		swapin_readahead(entry, addr, vma);
+		return;
+	}
+
+	page = read_swap_cache_async(entry, vma, addr);
+	if (!page)
+		return;
+	page_cache_release(page);
+
+	// read ahead the whole series, first forward then backward.
+	while (readahead < MAX_SERIES_LENGTH) {
+		next = pte++;
+		if (next - (pte_t*) pmd >= PTRS_PER_PTE)
+			break;
+		spin_lock(ptl);
+        if (!(!pte_present(*next) && pte_swapped(*next))) {
+			spin_unlock(ptl);
+			break;
+		}
+		temp = pte_to_swp_entry(*next);
+		spin_unlock(ptl);
+		if (swp_type(temp) != swapType)
+			break;
+		abs = swp_offset(temp) - swapOffset;
+		abs = abs < 0 ? -abs : abs;
+		swapOffset = swp_offset(temp);
+		if (abs > 8)
+			// the two swap entries are too far, give up!
+			break;
+		page = read_swap_cache_async(temp, vma, addr);
+		if (!page)
+			return;
+		page_cache_release(page);
+		readahead++;
+	}
+
+	swapOffset = swp_offset(entry);
+	while (readahead < MAX_SERIES_LENGTH) {
+		prev = pte--;
+		if (prev - (pte_t*) pmd < 0)
+			break;
+		spin_lock(ptl);
+        if (!(!pte_present(*prev) && pte_swapped(*prev))) {
+			spin_unlock(ptl);
+			break;
+		}
+		temp = pte_to_swp_entry(*prev);
+		spin_unlock(ptl);
+		if (swp_type(temp) != swapType)
+			break;
+		abs = swp_offset(temp) - swapOffset;
+		abs = abs < 0 ? -abs : abs;
+		swapOffset = swp_offset(temp);
+		if (abs > 8)
+			// the two swap entries are too far, give up!
+			break;
+		page = read_swap_cache_async(temp, vma, addr);
+		if (!page)
+			return;
+		page_cache_release(page);
+		readahead++;
+	}
+}
+
+/*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -1885,7 +1975,7 @@
 again:
 	page = lookup_swap_cache(entry);
 	if (!page) {
- 		swapin_readahead(entry, address, vma);
+ 		pps_swapin_readahead(entry, address, vma, page_table, pmd);
  		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
 			/*
@@ -1904,10 +1994,15 @@
 		grab_swap_token();
 	}

-	mark_page_accessed(page);
+	if (!(vma->vm_flags & VM_PURE_PRIVATE))
+		mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
 		/* Page migration has occured */
+		if (vma->vm_flags & VM_PURE_PRIVATE) {
+			lru_cache_add_active(page);
+			mark_page_accessed(page);
+		}
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
@@ -1922,6 +2017,10 @@

 	if (unlikely(!PageUptodate(page))) {
 		ret = VM_FAULT_SIGBUS;
+		if (vma->vm_flags & VM_PURE_PRIVATE) {
+			lru_cache_add_active(page);
+			mark_page_accessed(page);
+		}
 		goto out_nomap;
 	}

@@ -1993,8 +2092,9 @@
 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 		if (!pte_none(*page_table))
 			goto release;
+		if (!(vma->vm_flags & VM_PURE_PRIVATE))
+			lru_cache_add_active(page);
 		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
@@ -2209,6 +2309,20 @@

 	old_entry = entry = *pte;
 	if (!pte_present(entry)) {
+		if (pte_unmapped(entry)) {
+			BUG_ON(!(vma->vm_flags & VM_PURE_PRIVATE));
+			struct page* page = pte_page(entry);
+			pte_t temp_pte = mk_pte(page, vma->vm_page_prot);
+			pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (unlikely(pte_same(*pte, entry))) {
+				page_add_new_anon_rmap(page, vma, address);
+				set_pte_at(mm, address, pte, temp_pte);
+				update_mmu_cache(vma, address, temp_pte);
+				lazy_mmu_prot_update(temp_pte);
+			}
+			pte_unmap_unlock(pte, ptl);
+			return VM_FAULT_MINOR;
+		}
 		if (pte_none(entry)) {
 			if (!vma->vm_ops || !vma->vm_ops->nopage)
 				return do_anonymous_page(mm, vma, address,
@@ -2445,3 +2559,112 @@
 }

 #endif	/* __HAVE_ARCH_GATE_AREA */
+
+static void migrate_back_pte_range(struct mm_struct* mm, pmd_t *pmd, struct
+		vm_area_struct *vma, unsigned long addr, unsigned long end)
+{
+	struct page* page;
+	pte_t entry;
+	pte_t* pte;
+	spinlock_t* ptl;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	do {
+		if (!pte_present(*pte) && pte_unmapped(*pte)) {
+			page = pte_page(*pte);
+			entry = mk_pte(page, vma->vm_page_prot);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			set_pte_at(mm, addr, pte, entry);
+			BUG_ON(page == ZERO_PAGE(addr));
+			page_add_new_anon_rmap(page, vma, addr);
+		}
+		if (pte_present(*pte)) {
+			page = pte_page(*pte);
+			if (page == ZERO_PAGE(addr))
+				continue;
+			lru_cache_add_active(page);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(pte - 1, ptl);
+	lru_add_drain();
+}
+
+static void migrate_back_pmd_range(struct mm_struct* mm, pud_t *pud, struct
+		vm_area_struct *vma, unsigned long addr, unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		migrate_back_pte_range(mm, pmd, vma, addr, next);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static void migrate_back_pud_range(struct mm_struct* mm, pgd_t *pgd, struct
+		vm_area_struct *vma, unsigned long addr, unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		migrate_back_pmd_range(mm, pud, vma, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+// migrate all pages of pure private vma back to Linux legacy memory
management.
+static void migrate_back_legacy_linux(struct mm_struct* mm, struct
vm_area_struct* vma)
+{
+	pgd_t* pgd;
+	unsigned long next;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		migrate_back_pud_range(mm, pgd, vma, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+LIST_HEAD(pps_head);
+LIST_HEAD(pps_head_buddy);
+
+DEFINE_SPINLOCK(pps_lock);
+
+void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma)
+{
+	int condition = VM_READ | VM_WRITE | VM_EXEC | \
+		 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \
+		 VM_GROWSDOWN | VM_GROWSUP | \
+		 VM_LOCKED | VM_SEQ_READ | VM_RAND_READ | VM_DONTCOPY | VM_ACCOUNT;
+	if (!(vma->vm_flags & ~condition) && vma->vm_file == NULL) {
+		vma->vm_flags |= VM_PURE_PRIVATE;
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&mm->mmlist))
+				list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+	}
+}
+
+void leave_pps(struct vm_area_struct* vma, int migrate_flag)
+{
+	struct mm_struct* mm = vma->vm_mm;
+
+	if (vma->vm_flags & VM_PURE_PRIVATE) {
+		vma->vm_flags &= ~VM_PURE_PRIVATE;
+		if (migrate_flag)
+			migrate_back_legacy_linux(mm, vma);
+	}
+}
Index: linux-2.6.16.29/mm/mmap.c
===================================================================
--- linux-2.6.16.29.orig/mm/mmap.c	2006-12-29 13:56:51.000000000 +0800
+++ linux-2.6.16.29/mm/mmap.c	2006-12-29 13:57:20.000000000 +0800
@@ -206,6 +206,7 @@
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_free(vma_policy(vma));
+	leave_pps(vma, 0);
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
 }
@@ -593,6 +594,7 @@
 			fput(file);
 		mm->map_count--;
 		mpol_free(vma_policy(next));
+		leave_pps(next, 0);
 		kmem_cache_free(vm_area_cachep, next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
@@ -1091,6 +1093,8 @@
 	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
 		vma->vm_flags &= ~VM_ACCOUNT;

+	enter_pps(mm, vma);
+
 	/* Can addr have changed??
 	 *
 	 * Answer: Yes, several device drivers can do it in their
@@ -1113,6 +1117,7 @@
 			fput(file);
 		}
 		mpol_free(vma_policy(vma));
+		leave_pps(vma, 0);
 		kmem_cache_free(vm_area_cachep, vma);
 	}
 out:	
@@ -1140,6 +1145,7 @@
 	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
 	charged = 0;
 free_vma:
+	leave_pps(vma, 0);
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
 	if (charged)
@@ -1717,6 +1723,10 @@

 	/* most fields are the same, copy all, and then fixup */
 	*new = *vma;
+	if (new->vm_flags & VM_PURE_PRIVATE) {
+		new->vm_flags &= ~VM_PURE_PRIVATE;
+		enter_pps(mm, new);
+	}

 	if (new_below)
 		new->vm_end = addr;
@@ -1917,6 +1927,7 @@
 	vma->vm_pgoff = pgoff;
 	vma->vm_flags = flags;
 	vma->vm_page_prot = protection_map[flags & 0x0f];
+	enter_pps(mm, vma);
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -2040,6 +2051,10 @@
 				get_file(new_vma->vm_file);
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
+			if (new_vma->vm_flags & VM_PURE_PRIVATE) {
+				new_vma->vm_flags &= ~VM_PURE_PRIVATE;
+				enter_pps(mm, new_vma);
+			}
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
 		}
 	}
Index: linux-2.6.16.29/mm/rmap.c
===================================================================
--- linux-2.6.16.29.orig/mm/rmap.c	2006-12-29 13:56:51.000000000 +0800
+++ linux-2.6.16.29/mm/rmap.c	2006-12-29 13:57:21.000000000 +0800
@@ -633,7 +633,7 @@
 			spin_unlock(&mmlist_lock);
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
+		BUG_ON(!pte_swapped(*pte));
 		dec_mm_counter(mm, anon_rss);
 	} else
 		dec_mm_counter(mm, file_rss);
Index: linux-2.6.16.29/mm/swap_state.c
===================================================================
--- linux-2.6.16.29.orig/mm/swap_state.c	2006-12-29 13:56:51.000000000 +0800
+++ linux-2.6.16.29/mm/swap_state.c	2006-12-29 13:57:20.000000000 +0800
@@ -354,7 +354,8 @@
 			/*
 			 * Initiate read into locked page and return.
 			 */
-			lru_cache_add_active(new_page);
+			if (vma == NULL || !(vma->vm_flags & VM_PURE_PRIVATE))
+				lru_cache_add_active(new_page);
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
Index: linux-2.6.16.29/mm/swapfile.c
===================================================================
--- linux-2.6.16.29.orig/mm/swapfile.c	2006-12-29 13:56:52.000000000 +0800
+++ linux-2.6.16.29/mm/swapfile.c	2006-12-29 13:57:21.000000000 +0800
@@ -7,6 +7,7 @@

 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
@@ -417,6 +418,163 @@
 	}
 }

+static int pps_test_swap_type(struct mm_struct* mm, pmd_t* pmd, pte_t* pte, int
+		type, struct page** ret_page)
+{
+	spinlock_t* ptl = pte_lockptr(mm, pmd);
+	swp_entry_t entry;
+	struct page* page;
+
+	spin_lock(ptl);
+	if (!pte_present(*pte) && pte_swapped(*pte)) {
+		entry = pte_to_swp_entry(*pte);
+		if (swp_type(entry) == type) {
+			*ret_page = NULL;
+			spin_unlock(ptl);
+			return 1;
+		}
+	} else {
+		page = pfn_to_page(pte_pfn(*pte));
+		if (PageSwapCache(page)) {
+			entry.val = page_private(page);
+			if (swp_type(entry) == type) {
+				page_cache_get(page);
+				*ret_page = page;
+				spin_unlock(ptl);
+				return 1;
+			}
+		}
+	}
+	spin_unlock(ptl);
+	return 0;
+}
+
+static int pps_swapoff_scan_ptes(struct mm_struct* mm, struct vm_area_struct*
+		vma, pmd_t* pmd, unsigned long addr, unsigned long end, int type)
+{
+	pte_t *pte;
+	struct page* page;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		while (pps_test_swap_type(mm, pmd, pte, type, &page)) {
+			if (page == NULL) {
+				switch (__handle_mm_fault(mm, vma, addr, 0)) {
+				case VM_FAULT_SIGBUS:
+				case VM_FAULT_OOM:
+					return -ENOMEM;
+				case VM_FAULT_MINOR:
+				case VM_FAULT_MAJOR:
+					break;
+				default:
+					BUG();
+				}
+			} else {
+				wait_on_page_locked(page);
+				wait_on_page_writeback(page);
+				lock_page(page);
+				if (!PageSwapCache(page)) {
+					unlock_page(page);
+					page_cache_release(page);
+					break;
+				}
+				wait_on_page_writeback(page);
+				delete_from_swap_cache(page);
+				unlock_page(page);
+				page_cache_release(page);
+				break;
+			}
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	return 0;
+}
+
+static int pps_swapoff_pmd_range(struct mm_struct* mm, struct vm_area_struct*
+		vma, pud_t* pud, unsigned long addr, unsigned long end, int type)
+{
+	unsigned long next;
+	int ret;
+	pmd_t* pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		ret = pps_swapoff_scan_ptes(mm, vma, pmd, addr, next, type);
+		if (ret == -ENOMEM)
+			return ret;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static int pps_swapoff_pud_range(struct mm_struct* mm, struct vm_area_struct*
+		vma, pgd_t* pgd, unsigned long addr, unsigned long end, int type)
+{
+	unsigned long next;
+	int ret;
+	pud_t* pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		ret = pps_swapoff_pmd_range(mm, vma, pud, addr, next, type);
+		if (ret == -ENOMEM)
+			return ret;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+static int pps_swapoff_pgd_range(struct mm_struct* mm, struct vm_area_struct*
+		vma, int type)
+{
+	unsigned long next;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	int ret;
+	pgd_t* pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		ret = pps_swapoff_pud_range(mm, vma, pgd, addr, next, type);
+		if (ret == -ENOMEM)
+			return ret;
+	} while (pgd++, addr = next, addr != end);
+	return 0;
+}
+
+static int pps_swapoff(int type)
+{
+	struct mm_struct* mm;
+	struct vm_area_struct* vma;
+	struct list_head *pos, *lhtemp;
+	int ret = 0;
+
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
+		mm = list_entry(pos, struct mm_struct, mmlist);
+		if (atomic_inc_return(&mm->mm_users) == 1) {
+			atomic_dec(&mm->mm_users);
+			continue;
+		}
+		spin_unlock(&mmlist_lock);
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+			if (!(vma->vm_flags & VM_PURE_PRIVATE))
+				continue;
+			if (vma->vm_flags & VM_LOCKED)
+				continue;
+			ret = pps_swapoff_pgd_range(mm, vma, type);
+			if (ret == -ENOMEM)
+				break;
+		}
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+		spin_lock(&mmlist_lock);
+	}
+	spin_unlock(&mmlist_lock);
+	return ret;
+}
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
@@ -619,6 +777,12 @@
 	int reset_overflow = 0;
 	int shmem;

+	// Let's first read all pps pages back! Note, it's one-to-one mapping.
+	retval = pps_swapoff(type);
+	if (retval == -ENOMEM) // something was wrong.
+		return -ENOMEM;
+	// Now, the remain pages are shared pages, go ahead!
+
 	/*
 	 * When searching mms for an entry, a good strategy is to
 	 * start at the first mm we freed the previous entry from
@@ -848,16 +1012,20 @@
  */
 static void drain_mmlist(void)
 {
-	struct list_head *p, *next;
+	// struct list_head *p, *next;
 	unsigned int i;

 	for (i = 0; i < nr_swapfiles; i++)
 		if (swap_info[i].inuse_pages)
 			return;
+	/*
+	 * Now, init_mm.mmlist list not only is used by SwapDevice but also is used
+	 * by PPS.
 	spin_lock(&mmlist_lock);
 	list_for_each_safe(p, next, &init_mm.mmlist)
 		list_del_init(p);
 	spin_unlock(&mmlist_lock);
+	*/
 }

 /*
Index: linux-2.6.16.29/mm/vmscan.c
===================================================================
--- linux-2.6.16.29.orig/mm/vmscan.c	2006-12-29 13:56:51.000000000 +0800
+++ linux-2.6.16.29/mm/vmscan.c	2006-12-29 13:58:30.000000000 +0800
@@ -1514,6 +1514,398 @@
 	return ret;
 }

+struct series_t {
+	pte_t orig_ptes[MAX_SERIES_LENGTH];
+	pte_t* ptes[MAX_SERIES_LENGTH];
+	struct page* pages[MAX_SERIES_LENGTH];
+	int series_length;
+	int series_stage;
+} series;
+
+static int get_series_stage(pte_t* pte, int index)
+{
+	series.orig_ptes[index] = *pte;
+	series.ptes[index] = pte;
+	if (pte_present(series.orig_ptes[index])) {
+		struct page* page = pfn_to_page(pte_pfn(series.orig_ptes[index]));
+		series.pages[index] = page;
+		if (page == ZERO_PAGE(addr)) // reserved page is exclusive from us.
+			return 7;
+		if (pte_young(series.orig_ptes[index])) {
+			return 1;
+		} else
+			return 2;
+	} else if (pte_unmapped(series.orig_ptes[index])) {
+		struct page* page = pfn_to_page(pte_pfn(series.orig_ptes[index]));
+		series.pages[index] = page;
+		if (!PageSwapCache(page))
+			return 3;
+		else {
+			if (PageWriteback(page) || PageDirty(page))
+				return 4;
+			else
+				return 5;
+		}
+	} else // pte_swapped -- SwappedPTE
+		return 6;
+}
+
+static void find_series(pte_t** start, unsigned long* addr, unsigned long end)
+{
+	int i;
+	int series_stage = get_series_stage((*start)++, 0);
+	*addr += PAGE_SIZE;
+
+	for (i = 1; i < MAX_SERIES_LENGTH && *addr < end; i++, (*start)++,
*addr += PAGE_SIZE) {
+		if (series_stage != get_series_stage(*start, i))
+			break;
+	}
+	series.series_stage = series_stage;
+	series.series_length = i;
+}
+
+struct delay_tlb_task_t delay_tlb_tasks[32] = { [0 ... 31] = {0} };
+
+void timer_flush_tlb_tasks(void* data)
+{
+	// To x86, if we found there were some flushing tasks, we should do
it all together, that is, flush it once.
+	int i;
+#ifdef CONFIG_X86
+	int flag = 0;
+#endif
+	for (i = 0; i < 32; i++) {
+		if (delay_tlb_tasks[i].mm != NULL &&
+				cpu_isset(smp_processor_id(), delay_tlb_tasks[i].mm->cpu_vm_mask) &&
+				cpu_isset(smp_processor_id(), delay_tlb_tasks[i].cpu_mask)) {
+#ifdef CONFIG_X86
+			flag = 1;
+#elif
+			// smp::local_flush_tlb_range(delay_tlb_tasks[i]);
+#endif
+			cpu_clear(smp_processor_id(), delay_tlb_tasks[i].cpu_mask);
+		}
+	}
+#ifdef CONFIG_X86
+	if (flag)
+		local_flush_tlb();
+#endif
+}
+
+static struct delay_tlb_task_t* delay_task = NULL;
+static int vma_index = 0;
+
+static struct delay_tlb_task_t* search_free_tlb_tasks_slot(void)
+{
+	struct delay_tlb_task_t* ret = NULL;
+	int i;
+again:
+	for (i = 0; i < 32; i++) {
+		if (delay_tlb_tasks[i].mm != NULL) {
+			if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
+				mmput(delay_tlb_tasks[i].mm);
+				delay_tlb_tasks[i].mm = NULL;
+				ret = &delay_tlb_tasks[i];
+			}
+		} else
+			ret = &delay_tlb_tasks[i];
+	}
+	if (!ret) { // Force flush TLBs.
+		on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
+		goto again;
+	}
+	return ret;
+}
+
+static void init_delay_task(struct mm_struct* mm)
+{
+	cpus_clear(delay_task->cpu_mask);
+	vma_index = 0;
+	delay_task->mm = mm;
+}
+
+/*
+ * We will be working on the mm, so let's force to flush it if necessary.
+ */
+static void start_tlb_tasks(struct mm_struct* mm)
+{
+	int i, flag = 0;
+again:
+	for (i = 0; i < 32; i++) {
+		if (delay_tlb_tasks[i].mm == mm) {
+			if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
+				mmput(delay_tlb_tasks[i].mm);
+				delay_tlb_tasks[i].mm = NULL;
+			} else
+				flag = 1;
+		}
+	}
+	if (flag) { // Force flush TLBs.
+		on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
+		goto again;
+	}
+	BUG_ON(delay_task != NULL);
+	delay_task = search_free_tlb_tasks_slot();
+	init_delay_task(mm);
+}
+
+static void end_tlb_tasks(void)
+{
+	if (!cpus_empty(delay_task->cpu_mask)) {
+		atomic_inc(&delay_task->mm->mm_users);
+		delay_task->cpu_mask = delay_task->mm->cpu_vm_mask;
+	} else
+		delay_task->mm = NULL;
+	delay_task = NULL;
+}
+
+static void fill_in_tlb_tasks(struct vm_area_struct* vma, unsigned long addr,
+		unsigned long end)
+{
+	struct mm_struct* mm;
+fill_it:
+	if (vma_index != 32) {
+		delay_task->vma[vma_index] = vma;
+		delay_task->start[vma_index] = addr;
+		delay_task->end[vma_index] = end;
+		vma_index++;
+		return;
+	}
+	mm = delay_task->mm;
+	end_tlb_tasks();
+
+	delay_task = search_free_tlb_tasks_slot();
+	init_delay_task(mm);
+	goto fill_it;
+}
+
+static void shrink_pvma_scan_ptes(struct scan_control* sc,
+		struct mm_struct* mm, struct vm_area_struct* vma, pmd_t* pmd,
+		unsigned long addr, unsigned long end)
+{
+	int i;
+	spinlock_t* ptl = pte_lockptr(mm, pmd);
+	pte_t* pte = pte_offset_map(pmd, addr);
+	int anon_rss = 0;
+	struct pagevec freed_pvec;
+	int may_enter_fs = (sc->gfp_mask & (__GFP_FS | __GFP_IO));
+	struct address_space* mapping = &swapper_space;
+
+	pagevec_init(&freed_pvec, 1);
+	do {
+		memset(&series, 0, sizeof(struct series_t));
+		find_series(&pte, &addr, end);
+		switch (series.series_stage) {
+			case 1: // PTE -- untouched PTE.
+				for (i = 0; i < series.series_length; i++) {
+					struct page* page = series.pages[i];
+					lock_page(page);
+					spin_lock(ptl);
+					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+						if (pte_dirty(*series.ptes[i]))
+							set_page_dirty(page);
+						set_pte_at(mm, addr + i * PAGE_SIZE, series.ptes[i],
+								pte_mkold(pte_mkclean(*series.ptes[i])));
+					}
+					spin_unlock(ptl);
+					unlock_page(page);
+				}
+				fill_in_tlb_tasks(vma, addr, addr + (PAGE_SIZE * series.series_length));
+				break;
+			case 2: // untouched PTE -- UnmappedPTE.
+				/*
+				 * Note in stage 1, we've flushed TLB in fill_in_tlb_tasks, so
+				 * if it's still clear here, we can shift it to Unmapped type.
+				 *
+				 * If some architecture doesn't support atomic cmpxchg
+				 * instruction or can't atomically set the access bit after
+				 * they touch a pte at first, combine stage 1 with stage 2, and
+				 * send IPI immediately in fill_in_tlb_tasks.
+				 */
+				spin_lock(ptl);
+				for (i = 0; i < series.series_length; i++) {
+					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+						pte_t pte_unmapped = series.orig_ptes[i];
+						pte_unmapped.pte_low &= ~_PAGE_PRESENT;
+						pte_unmapped.pte_low |= _PAGE_UNMAPPED;
+						if (cmpxchg(&series.ptes[i]->pte_low,
+									series.orig_ptes[i].pte_low,
+									pte_unmapped.pte_low) !=
+								series.orig_ptes[i].pte_low)
+							continue;
+						page_remove_rmap(series.pages[i]);
+						anon_rss--;
+					}
+				}
+				spin_unlock(ptl);
+				break;
+			case 3: // Attach SwapPage to PrivatePage.
+				/*
+				 * A better arithmetic should be applied to Linux SwapDevice to
+				 * allocate fake continual SwapPages which are close to each
+				 * other, the offset between two close SwapPages is less than 8.
+				 */
+				if (sc->may_swap) {
+					for (i = 0; i < series.series_length; i++) {
+						lock_page(series.pages[i]);
+						if (!PageSwapCache(series.pages[i])) {
+							if (!add_to_swap(series.pages[i], GFP_ATOMIC)) {
+								unlock_page(series.pages[i]);
+								break;
+							}
+						}
+						unlock_page(series.pages[i]);
+					}
+				}
+				break;
+			case 4: // SwapPage isn't consistent with PrivatePage.
+				/*
+				 * A mini version pageout().
+				 *
+				 * Current swap space can't commit multiple pages together:(
+				 */
+				if (sc->may_writepage && may_enter_fs) {
+					for (i = 0; i < series.series_length; i++) {
+						struct page* page = series.pages[i];
+						int res;
+
+						if (!may_write_to_queue(mapping->backing_dev_info))
+							break;
+						lock_page(page);
+						if (!PageDirty(page) || PageWriteback(page)) {
+							unlock_page(page);
+							continue;
+						}
+						clear_page_dirty_for_io(page);
+						struct writeback_control wbc = {
+							.sync_mode = WB_SYNC_NONE,
+							.nr_to_write = SWAP_CLUSTER_MAX,
+							.nonblocking = 1,
+							.for_reclaim = 1,
+						};
+						page_cache_get(page);
+						SetPageReclaim(page);
+						res = swap_writepage(page, &wbc);
+						if (res < 0) {
+							handle_write_error(mapping, page, res);
+							ClearPageReclaim(page);
+							page_cache_release(page);
+							break;
+						}
+						if (!PageWriteback(page))
+							ClearPageReclaim(page);
+						page_cache_release(page);
+					}
+				}
+				break;
+			case 5: // UnmappedPTE -- SwappedPTE, reclaim PrivatePage.
+				for (i = 0; i < series.series_length; i++) {
+					struct page* page = series.pages[i];
+					lock_page(page);
+					spin_lock(ptl);
+					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+						spin_unlock(ptl);
+						unlock_page(page);
+						continue;
+					}
+					swp_entry_t entry = { .val = page_private(page) };
+					swap_duplicate(entry);
+					pte_t pte_swp = swp_entry_to_pte(entry);
+					set_pte_at(mm, addr + i * PAGE_SIZE, series.ptes[i], pte_swp);
+					spin_unlock(ptl);
+					if (PageSwapCache(page) && !PageWriteback(page))
+						delete_from_swap_cache(page);
+					unlock_page(page);
+
+					if (!pagevec_add(&freed_pvec, page))
+						__pagevec_release_nonlru(&freed_pvec);
+					sc->nr_reclaimed++;
+				}
+				break;
+			case 6:
+				// NULL operation!
+				break;
+		}
+	} while (addr < end);
+	add_mm_counter(mm, anon_rss, anon_rss);
+	if (pagevec_count(&freed_pvec))
+		__pagevec_release_nonlru(&freed_pvec);
+}
+
+static void shrink_pvma_pmd_range(struct scan_control* sc, struct
mm_struct* mm,
+		struct vm_area_struct* vma, pud_t* pud,
+		unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pmd_t* pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		shrink_pvma_scan_ptes(sc, mm, vma, pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static void shrink_pvma_pud_range(struct scan_control* sc, struct
mm_struct* mm,
+		struct vm_area_struct* vma, pgd_t* pgd,
+		unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pud_t* pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		shrink_pvma_pmd_range(sc, mm, vma, pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+static void shrink_pvma_pgd_range(struct scan_control* sc, struct
mm_struct* mm,
+		struct vm_area_struct* vma)
+{
+	unsigned long next;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	pgd_t* pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		shrink_pvma_pud_range(sc, mm, vma, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+static void shrink_private_vma(struct scan_control* sc)
+{
+	struct mm_struct* mm;
+	struct vm_area_struct* vma;
+	struct list_head *pos, *lhtemp;
+
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
+		mm = list_entry(pos, struct mm_struct, mmlist);
+		if (atomic_inc_return(&mm->mm_users) == 1) {
+			atomic_dec(&mm->mm_users);
+			continue;
+		}
+		spin_unlock(&mmlist_lock);
+		start_tlb_tasks(mm);
+		if (down_read_trylock(&mm->mmap_sem)) {
+			for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+				if (!(vma->vm_flags & VM_PURE_PRIVATE))
+					continue;
+				if (vma->vm_flags & VM_LOCKED)
+					continue;
+				shrink_pvma_pgd_range(sc, mm, vma);
+			}
+			up_read(&mm->mmap_sem);
+		}
+		end_tlb_tasks();
+		mmput(mm);
+		spin_lock(&mmlist_lock);
+	}
+	spin_unlock(&mmlist_lock);
+}
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1557,6 +1949,7 @@
 	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);

+	shrink_private_vma(&sc);
 	inc_page_state(pageoutrun);

 	for (i = 0; i < pgdat->nr_zones; i++) {

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-29  6:45   ` yunfeng zhang
@ 2006-12-29  9:15     ` Pavel Machek
  2006-12-29 15:50       ` Randy Dunlap
  0 siblings, 1 reply; 14+ messages in thread
From: Pavel Machek @ 2006-12-29  9:15 UTC (permalink / raw)
  To: yunfeng zhang; +Cc: linux-kernel, torvalds

On Fri 2006-12-29 14:45:33, yunfeng zhang wrote:
> I've re-published my work on quilt, sorry.

Your patch is still wordwrapped.

Do not cc linus on non-final version of the patch.

Patch should be against latest kernel.

Patch should have changelog and signed off by.

Why the change? Do you gain 5% on kernel compile on 20MB box?

								Pavel


> 
> Index: linux-2.6.16.29/Documentation/vm_pps.txt
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6.16.29/Documentation/vm_pps.txt	2006-12-29 
> 14:36:36.507332384 +0800
> @@ -0,0 +1,192 @@
> +                         Pure Private Page System (pps)
> +                     Copyright by Yunfeng Zhang on GFDL 1.2
> +                              zyf.zeroos@gmail.com
> +                              December 24-26, 2006
> +
> +// Purpose <([{
> +The file is used to document the idea which is published firstly at
> +http://www.ussg.iu.edu/hypermail/linux/kernel/0607.2/0451.html, as a part 
> of my
> +OS -- main page http://blog.chinaunix.net/u/21764/index.php. In brief, a 
> patch
> +of the document to enchance the performance of Linux swap subsystem. You 
> can
> +find the overview of the idea in section <How to Reclaim Pages more
> +Efficiently> and how I patch it into Linux 2.6.16.29 in section <Pure 
> Private
> +Page System -- pps>.
> +// }])>
> +
> +// How to Reclaim Pages more Efficiently <([{
> +Good idea originates from overall design and management ability, when you 
> look
> +down from a manager view, you will relief yourself from disordered code and
> +find some problem immediately.
> +
> +OK! to modern OS, its memory subsystem can be divided into three layers
> +1) Space layer (InodeSpace, UserSpace and CoreSpace).
> +2) VMA layer (PrivateVMA and SharedVMA, memory architecture-independent 
> layer).
> +3) PTE and page layer (architecture-dependent).
> +
> +Since the 2nd layer assembles the much statistic of page-acess 
> information, so
> +it's nature that swap subsystem should be deployed and implemented on the 
> 2nd
> +layer.
> +
> +Undoubtedly, there are some virtues about it
> +1) SwapDaemon can collect the statistic of process acessing pages and by it
> +   unmaps ptes, SMP specially benefits from it for we can use 
> flush_tlb_range
> +   to unmap ptes batchly rather than frequently TLB IPI interrupt per a 
> page in
> +   current Linux legacy swap subsystem.
> +2) Page-fault can issue better readahead requests since history data shows 
> all
> +   related pages have conglomerating affinity. In contrast, Linux 
> page-fault
> +   readaheads the pages relative to the SwapSpace position of current
> +   page-fault page.
> +3) It's conformable to POSIX madvise API family.
> +
> +Unfortunately, Linux 2.6.16.29 swap subsystem is based on the 3rd layer -- 
> a
> +system on zone::active_list/inactive_list.
> +
> +I've finished a patch, see section <Pure Private Page System -- pps>.
> Note, it ISN'T perfect.
> +// }])>
> +
> +// Pure Private Page System -- pps  <([{
> +As I've referred in previous section, perfectly applying my idea need to 
> unroot
> +page-surrounging swap subsystem to migrate it on VMA, but a huge gap has
> +defeated me -- active_list and inactive_list. In fact, you can find
> +lru_add_active code anywhere ... It's IMPOSSIBLE to me to complete it only 
> by
> +myself. It's also the difference between my design and Linux, in my OS, 
> page is
> +the charge of its new owner totally, however, to Linux, page management 
> system
> +is still tracing it by PG_active flag.
> +
> +So I conceive another solution:) That is, set up an independent 
> page-recycle
> +system rooted on Linux legacy page system -- pps, intercept all private 
> pages
> +belonging to PrivateVMA to pps, then use my pps to cycle them.  By the 
> way, the
> +whole job should be consist of two parts, here is the first --
> +PrivateVMA-oriented (PPS), other is SharedVMA-oriented (should be called 
> SPS)
> +scheduled in future. Of course, if all are done, it will empty Linux legacy
> +page system.
> +
> +In fact, pps is centered on how to better collect and unmap process private
> +pages in SwapDaemon mm/vmscan.c:shrink_private_vma, the whole process is
> +divided into six stages -- <Stage Definition>. Other sections show the 
> remain
> +aspects of pps
> +1) <Data Definition> is basic data definition.
> +2) <Concurrent racers of Shrinking pps> is focused on synchronization.
> +3) <Private Page Lifecycle of pps> -- how private pages enter in/go off 
> pps.
> +4) <VMA Lifecycle of pps> which VMA is belonging to pps.
> +
> +PPS uses init_mm.mm_list list to enumerate all swappable UserSpace.
> +
> +I'm also glad to highlight my a new idea -- dftlb which is described in
> +section <Delay to Flush TLB>.
> +// }])>
> +
> +// Delay to Flush TLB (dftlb) <([{
> +Delay to flush TLB is instroduced by me to enhance flushing TLB 
> efficiency, in
> +brief, when we want to unmap a page from the page table of a process, why 
> we
> +send TLB IPI to other CPUs immediately, since every CPU has timer 
> interrupt, we
> +can insert flushing tasks into timer interrupt route to implement a
> +free-charged TLB flushing.
> +
> +The trick is implemented in
> +1) TLB flushing task is added in fill_in_tlb_task of mm/vmscan.c.
> +2) timer_flush_tlb_tasks of kernel/timer.c is used by other CPUs to execute
> +   flushing tasks.
> +3) all data are defined in include/linux/mm.h.
> +
> +The restriction of dftlb. Following conditions must be met
> +1) atomic cmpxchg instruction.
> +2) atomically set the access bit after they touch a pte firstly.
> +3) To some architectures, vma parameter of flush_tlb_range is maybe 
> important,
> +   if it's true, since it's possible that the vma of a TLB flushing task 
> has
> +   gone when a CPU starts to execute the task in timer interrupt, so don't 
> use
> +   dftlb.
> +combine stage 1 with stage 2, and send IPI immediately in 
> fill_in_tlb_tasks.
> +// }])>
> +
> +// Stage Definition <([{
> +The whole process of private page page-out is divided into six stages, as
> +showed in shrink_pvma_scan_ptes of mm/vmscan.c
> +1) PTE to untouched PTE (access bit is cleared), append flushing
> tasks to dftlb.
> +2) Convert untouched PTE to UnmappedPTE.
> +3) Link SwapEntry to every UnmappedPTE.
> +4) Synchronize the page of a UnmappedPTE with its physical swap page.
> +5) Reclaimed the page and shift UnmappedPTE to SwappedPTE.
> +6) SwappedPTE stage.
> +// }])>
> +
> +// Data Definition <([{
> +New VMA flag (VM_PURE_PRIVATE) is appended into VMA in include/linux/mm.h.
> +
> +New PTE type (UnmappedPTE) is appended into PTE system in
> +include/asm-i386/pgtable.h.
> +// }])>
> +
> +// Concurrent Racers of Shrinking pps <([{
> +shrink_private_vma of mm/vmscan.c uses init_mm.mmlist to scan all swappable
> +mm_struct instances, during the process of scaning and reclaiming process, 
> it
> +readlockes every mm_struct object, which brings some potential concurrent
> +racers
> +1) mm/swapfile.c    pps_swapoff (swapoff API).
> +2) mm/memory.c  do_wp_page, handle_pte_fault::unmapped_pte, 
> do_anonymous_page
> +   (page-fault).
> +// }])>
> +
> +// Private Page Lifecycle of pps <([{
> +All pages belonging to pps are called as pure private page.
> +
> +IN (NOTE, when a pure private page enters into pps, it's also trimmed from
> +Linux legacy page system by commeting lru_cache_add_active clause)
> +1) fs/exec.c	install_arg_pages	(argument pages).
> +2) mm/memory	do_anonymous_page, do_wp_page, do_swap_page	(page fault).
> +3) mm/swap_state.c	read_swap_cache_async	(swap pages).
> +
> +OUT
> +1) mm/vmscan.c  shrink_pvma_scan_ptes   (stage 6, reclaim a private page).
> +2) mm/memory    zap_pte_range   (free a page).
> +3) kernel/fork.c	dup_mmap	(if someone uses fork, migrate all 
> pps pages
> +   back to let Linux legacy page system manage them).
> +
> +When a pure private page is in pps, it can be visited simultaneously by
> +page-fault and SwapDaemon.
> +// }])>
> +
> +// VMA Lifecycle of pps <([{
> +When a PrivateVMA enters into pps, it's or-ed a new flag -- 
> VM_PURE_PRIVATE,
> +the flag is used in the shrink_private_vma of mm/vmscan.c.  Other fields 
> are
> +left untouched.
> +
> +IN.
> +1) fs/exec.c	setup_arg_pages	(StackVMA).
> +2) mm/mmap.c	do_mmap_pgoff, do_brk	(DataVMA).
> +3) mm/mmap.c	split_vma, copy_vma	(in some cases, we need copy a VMA 
> from an
> +   exist VMA).
> +
> +OUT.
> +1) kernel/fork.c	dup_mmap	(if someone uses fork, return the 
> vma back to
> +   Linux legacy system).
> +2) mm/mmap.c	remove_vma, vma_adjust	(destroy VMA).
> +3) mm/mmap.c	do_mmap_pgoff	(delete VMA when some errors occur).
> +// }])>
> +
> +// Postscript <([{
> +Note, some circumstances aren't tested due to hardware restriction e.g. SMP
> +dftlb.
> +
> +Here are some improvements about pps
> +1) In fact, I recommend one-to-one private model -- PrivateVMA, (PTE,
> +   UnmappedPTE) and PrivatePage (SwapPage) which is described in my OS and 
> the
> +   aboved hyperlink of Linux kernel mail list. So it's a compromise to use
> +   Linux legacy SwapCache in my pps.
> +2) SwapCache should provide more flexible interfaces, shrink_pvma_scan_ptes
> +   need allocate swap entries in batch, exactly, allocate a batch of fake
> +   continual swap entries, see mm/pps_swapin_readahead.
> +3) pps statistic entry in /proc/meminfo.
> +4) a better arithmetic to pick mm out to scan and shrink in 
> shrink_private_vma.
> +5) It's better to execute the first 2 stages when system is idle, current
> +   SwapDaemon only is activated when free pages are low.
> +6) A scanning count should be added into mm_struct, so when the count is
> +   becoming enough old to open stage 3 and 4.
> +
> +I'm still working on improvement 4, 5 and 6 to find out how to maximum the
> +performance of swap subsystem.
> +
> +If Linux kernel group can't make a schedule to re-write their memory code,
> +however, pps maybe is the best solution until now.
> +// }])>
> +// vim: foldmarker=<([{,}])> foldmethod=marker et
> Index: linux-2.6.16.29/fs/exec.c
> ===================================================================
> --- linux-2.6.16.29.orig/fs/exec.c	2006-12-29 13:56:51.000000000 +0800
> +++ linux-2.6.16.29/fs/exec.c	2006-12-29 13:57:18.000000000 +0800
> @@ -321,7 +321,6 @@
> 		goto out;
> 	}
> 	inc_mm_counter(mm, anon_rss);
> -	lru_cache_add_active(page);
> 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
> 					page, vma->vm_page_prot))));
> 	page_add_new_anon_rmap(page, vma, address);
> @@ -436,6 +435,7 @@
> 			kmem_cache_free(vm_area_cachep, mpnt);
> 			return ret;
> 		}
> +		enter_pps(mm, mpnt);
> 		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
> 	}
> 
> Index: linux-2.6.16.29/include/asm-i386/pgtable-2level.h
> ===================================================================
> --- linux-2.6.16.29.orig/include/asm-i386/pgtable-2level.h	2006-12-29
> 13:56:53.000000000 +0800
> +++ linux-2.6.16.29/include/asm-i386/pgtable-2level.h	2006-12-29
> 13:57:19.612186872 +0800
> @@ -46,21 +46,21 @@
> }
> 
> /*
> - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
> + * Bits 0, 5, 6 and 7 are taken, split up the 28 bits of offset
>  * into this range:
>  */
> -#define PTE_FILE_MAX_BITS	29
> +#define PTE_FILE_MAX_BITS	28
> 
> #define pte_to_pgoff(pte) \
> -	((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
> +	((((pte).pte_low >> 1) & 0xf ) + (((pte).pte_low >> 8) << 4 ))
> 
> #define pgoff_to_pte(off) \
> -	((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE 
> })
> +	((pte_t) { (((off) & 0xf) << 1) + (((off) >> 4) << 8) + _PAGE_FILE })
> 
> /* Encode and de-code a swap entry */
> -#define __swp_type(x)			(((x).val >> 1) & 0x1f)
> +#define __swp_type(x)			(((x).val >> 1) & 0xf)
> #define __swp_offset(x)			((x).val >> 8)
> -#define __swp_entry(type, offset)	((swp_entry_t) { ((type) << 1) |
> ((offset) << 8) })
> +#define __swp_entry(type, offset)	((swp_entry_t) { ((type & 0xf) <<
> 1) | ((offset) << 8) | _PAGE_SWAPPED })
> #define __pte_to_swp_entry(pte)		((swp_entry_t) { 
> (pte).pte_low })
> #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
> 
> Index: linux-2.6.16.29/include/asm-i386/pgtable.h
> ===================================================================
> --- linux-2.6.16.29.orig/include/asm-i386/pgtable.h	2006-12-29
> 13:56:53.000000000 +0800
> +++ linux-2.6.16.29/include/asm-i386/pgtable.h	2006-12-29
> 13:57:19.846151304 +0800
> @@ -121,7 +121,11 @@
> #define _PAGE_UNUSED3	0x800
> 
> /* If _PAGE_PRESENT is clear, we use these: */
> -#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; 
> unset:swap */
> +#define _PAGE_UNMAPPED	0x020	/* a special PTE type, hold its page 
> reference
> +								   even it's 
> unmapped, see more from
> +								   
> Documentation/vm_pps.txt. */
> +#define _PAGE_SWAPPED 0x040 /* swapped PTE. */
> +#define _PAGE_FILE	0x060	/* nonlinear file mapping, saved PTE; */
> #define _PAGE_PROTNONE	0x080	/* if the user mapped it with PROT_NONE;
> 				   pte_present gives true */
> #ifdef CONFIG_X86_PAE
> @@ -228,7 +232,9 @@
> /*
>  * The following only works if pte_present() is not true.
>  */
> -static inline int pte_file(pte_t pte)		{ return (pte).pte_low & 
> _PAGE_FILE; }
> +static inline int pte_unmapped(pte_t pte)	{ return ((pte).pte_low &
> 0x60) == _PAGE_UNMAPPED; }
> +static inline int pte_swapped(pte_t pte)	{ return ((pte).pte_low &
> 0x60) == _PAGE_SWAPPED; }
> +static inline int pte_file(pte_t pte)		{ return ((pte).pte_low &
> 0x60) == _PAGE_FILE; }
> 
> static inline pte_t pte_rdprotect(pte_t pte)	{ (pte).pte_low &=
> ~_PAGE_USER; return pte; }
> static inline pte_t pte_exprotect(pte_t pte)	{ (pte).pte_low &=
> ~_PAGE_USER; return pte; }
> Index: linux-2.6.16.29/include/linux/mm.h
> ===================================================================
> --- linux-2.6.16.29.orig/include/linux/mm.h	2006-12-29 
> 13:56:53.000000000 +0800
> +++ linux-2.6.16.29/include/linux/mm.h	2006-12-29 13:57:19.098265000 +0800
> @@ -166,6 +166,8 @@
> #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) 
> */
> #define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu 
> mmap) */
> #define VM_INSERTPAGE	0x02000000	/* The vma has had
> "vm_insert_page()" done on it */
> +#define VM_PURE_PRIVATE	0x04000000	/* Is the vma is only 
> belonging to a mm,
> +									   
> see more from Documentation/vm_pps.txt */
> 
> #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
> #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
> @@ -1056,5 +1058,25 @@
> extern int randomize_va_space;
> #endif
> 
> +/* vmscan.c::delay flush TLB */
> +struct delay_tlb_task_t
> +{
> +	struct mm_struct* mm;
> +	cpumask_t cpu_mask;
> +	struct vm_area_struct* vma[32];
> +	unsigned long start[32];
> +	unsigned long end[32];
> +};
> +extern struct delay_tlb_task_t delay_tlb_tasks[32];
> +
> +// The prototype of the function is fit with the "func" of "int
> +// smp_call_function (void (*func) (void *info), void *info, int retry, int
> +// wait);" of include/linux/smp.h of 2.6.16.29. Call it with NULL.
> +void timer_flush_tlb_tasks(void* data /* = NULL */);
> +
> +void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma);
> +void leave_pps(struct vm_area_struct* vma, int migrate_flag);
> +
> +#define MAX_SERIES_LENGTH 8
> #endif /* __KERNEL__ */
> #endif /* _LINUX_MM_H */
> Index: linux-2.6.16.29/include/linux/swapops.h
> ===================================================================
> --- linux-2.6.16.29.orig/include/linux/swapops.h	2006-12-29
> 13:56:53.000000000 +0800
> +++ linux-2.6.16.29/include/linux/swapops.h	2006-12-29 
> 13:57:19.000000000 +0800
> @@ -50,7 +50,7 @@
> {
> 	swp_entry_t arch_entry;
> 
> -	BUG_ON(pte_file(pte));
> +	BUG_ON(!pte_swapped(pte));
> 	arch_entry = __pte_to_swp_entry(pte);
> 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
> }
> @@ -64,6 +64,6 @@
> 	swp_entry_t arch_entry;
> 
> 	arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
> -	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
> +	BUG_ON(!pte_swapped(__swp_entry_to_pte(arch_entry)));
> 	return __swp_entry_to_pte(arch_entry);
> }
> Index: linux-2.6.16.29/kernel/fork.c
> ===================================================================
> --- linux-2.6.16.29.orig/kernel/fork.c	2006-12-29 13:56:52.000000000 +0800
> +++ linux-2.6.16.29/kernel/fork.c	2006-12-29 13:57:20.000000000 +0800
> @@ -229,6 +229,7 @@
> 		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
> 		if (!tmp)
> 			goto fail_nomem;
> +		leave_pps(mpnt, 1);
> 		*tmp = *mpnt;
> 		pol = mpol_copy(vma_policy(mpnt));
> 		retval = PTR_ERR(pol);
> Index: linux-2.6.16.29/kernel/timer.c
> ===================================================================
> --- linux-2.6.16.29.orig/kernel/timer.c	2006-12-29 
> 13:56:52.000000000 +0800
> +++ linux-2.6.16.29/kernel/timer.c	2006-12-29 13:57:20.000000000 +0800
> @@ -842,6 +842,8 @@
> 		rcu_check_callbacks(cpu, user_tick);
> 	scheduler_tick();
>  	run_posix_cpu_timers(p);
> +
> +	timer_flush_tlb_tasks(NULL);
> }
> 
> /*
> Index: linux-2.6.16.29/mm/fremap.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/fremap.c	2006-12-29 13:56:51.000000000 +0800
> +++ linux-2.6.16.29/mm/fremap.c	2006-12-29 13:57:21.000000000 +0800
> @@ -37,7 +37,7 @@
> 			page_cache_release(page);
> 		}
> 	} else {
> -		if (!pte_file(pte))
> +		if (pte_swapped(pte))
> 			free_swap_and_cache(pte_to_swp_entry(pte));
> 		pte_clear(mm, addr, ptep);
> 	}
> Index: linux-2.6.16.29/mm/memory.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/memory.c	2006-12-29 13:56:52.000000000 +0800
> +++ linux-2.6.16.29/mm/memory.c	2006-12-29 13:57:51.000000000 +0800
> @@ -436,7 +436,7 @@
> 
> 	/* pte contains position in swap or file, so copy. */
> 	if (unlikely(!pte_present(pte))) {
> -		if (!pte_file(pte)) {
> +		if (pte_swapped(pte)) {
> 			swap_duplicate(pte_to_swp_entry(pte));
> 			/* make sure dst_mm is on swapoff's mmlist. */
> 			if (unlikely(list_empty(&dst_mm->mmlist))) {
> @@ -658,6 +658,8 @@
> 						addr) != page->index)
> 				set_pte_at(mm, addr, pte,
> 					   pgoff_to_pte(page->index));
> +			// if (vma->vm_flags & VM_PURE_PRIVATE && page != 
> ZERO_PAGE(addr))
> +			// 	lru_cache_add_active(page);
> 			if (PageAnon(page))
> 				anon_rss--;
> 			else {
> @@ -677,7 +679,16 @@
> 		 */
> 		if (unlikely(details))
> 			continue;
> -		if (!pte_file(ptent))
> +		if (pte_unmapped(ptent)) {
> +			struct page *page;
> +			page = pfn_to_page(pte_pfn(ptent));
> +			pte_clear_full(mm, addr, pte, tlb->fullmm);
> +			// lru_cache_add_active(page);
> +			tlb_remove_page(tlb, page);
> +			anon_rss--;
> +			continue;
> +		}
> +		if (pte_swapped(ptent))
> 			free_swap_and_cache(pte_to_swp_entry(ptent));
> 		pte_clear_full(mm, addr, pte, tlb->fullmm);
> 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
> @@ -1508,7 +1519,8 @@
> 		ptep_establish(vma, address, page_table, entry);
> 		update_mmu_cache(vma, address, entry);
> 		lazy_mmu_prot_update(entry);
> -		lru_cache_add_active(new_page);
> +		if (!(vma->vm_flags & VM_PURE_PRIVATE))
> +			lru_cache_add_active(new_page);
> 		page_add_new_anon_rmap(new_page, vma, address);
> 
> 		/* Free the old page.. */
> @@ -1864,6 +1876,84 @@
> }
> 
> /*
> + * New read ahead code, mainly for VM_PURE_PRIVATE only.
> + */
> +static void pps_swapin_readahead(swp_entry_t entry, unsigned long
> addr,struct vm_area_struct *vma, pte_t* pte, pmd_t* pmd)
> +{
> +	struct page* page;
> +	pte_t *prev, *next;
> +	swp_entry_t temp;
> +	spinlock_t* ptl = pte_lockptr(vma->vm_mm, pmd);
> +	int swapType = swp_type(entry);
> +	int swapOffset = swp_offset(entry);
> +	int readahead = 1, abs;
> +
> +	if (!(vma->vm_flags & VM_PURE_PRIVATE)) {
> +		swapin_readahead(entry, addr, vma);
> +		return;
> +	}
> +
> +	page = read_swap_cache_async(entry, vma, addr);
> +	if (!page)
> +		return;
> +	page_cache_release(page);
> +
> +	// read ahead the whole series, first forward then backward.
> +	while (readahead < MAX_SERIES_LENGTH) {
> +		next = pte++;
> +		if (next - (pte_t*) pmd >= PTRS_PER_PTE)
> +			break;
> +		spin_lock(ptl);
> +        if (!(!pte_present(*next) && pte_swapped(*next))) {
> +			spin_unlock(ptl);
> +			break;
> +		}
> +		temp = pte_to_swp_entry(*next);
> +		spin_unlock(ptl);
> +		if (swp_type(temp) != swapType)
> +			break;
> +		abs = swp_offset(temp) - swapOffset;
> +		abs = abs < 0 ? -abs : abs;
> +		swapOffset = swp_offset(temp);
> +		if (abs > 8)
> +			// the two swap entries are too far, give up!
> +			break;
> +		page = read_swap_cache_async(temp, vma, addr);
> +		if (!page)
> +			return;
> +		page_cache_release(page);
> +		readahead++;
> +	}
> +
> +	swapOffset = swp_offset(entry);
> +	while (readahead < MAX_SERIES_LENGTH) {
> +		prev = pte--;
> +		if (prev - (pte_t*) pmd < 0)
> +			break;
> +		spin_lock(ptl);
> +        if (!(!pte_present(*prev) && pte_swapped(*prev))) {
> +			spin_unlock(ptl);
> +			break;
> +		}
> +		temp = pte_to_swp_entry(*prev);
> +		spin_unlock(ptl);
> +		if (swp_type(temp) != swapType)
> +			break;
> +		abs = swp_offset(temp) - swapOffset;
> +		abs = abs < 0 ? -abs : abs;
> +		swapOffset = swp_offset(temp);
> +		if (abs > 8)
> +			// the two swap entries are too far, give up!
> +			break;
> +		page = read_swap_cache_async(temp, vma, addr);
> +		if (!page)
> +			return;
> +		page_cache_release(page);
> +		readahead++;
> +	}
> +}
> +
> +/*
>  * We enter with non-exclusive mmap_sem (to exclude vma changes,
>  * but allow concurrent faults), and pte mapped but not yet locked.
>  * We return with mmap_sem still held, but pte unmapped and unlocked.
> @@ -1885,7 +1975,7 @@
> again:
> 	page = lookup_swap_cache(entry);
> 	if (!page) {
> - 		swapin_readahead(entry, address, vma);
> + 		pps_swapin_readahead(entry, address, vma, page_table, pmd);
>  		page = read_swap_cache_async(entry, vma, address);
> 		if (!page) {
> 			/*
> @@ -1904,10 +1994,15 @@
> 		grab_swap_token();
> 	}
> 
> -	mark_page_accessed(page);
> +	if (!(vma->vm_flags & VM_PURE_PRIVATE))
> +		mark_page_accessed(page);
> 	lock_page(page);
> 	if (!PageSwapCache(page)) {
> 		/* Page migration has occured */
> +		if (vma->vm_flags & VM_PURE_PRIVATE) {
> +			lru_cache_add_active(page);
> +			mark_page_accessed(page);
> +		}
> 		unlock_page(page);
> 		page_cache_release(page);
> 		goto again;
> @@ -1922,6 +2017,10 @@
> 
> 	if (unlikely(!PageUptodate(page))) {
> 		ret = VM_FAULT_SIGBUS;
> +		if (vma->vm_flags & VM_PURE_PRIVATE) {
> +			lru_cache_add_active(page);
> +			mark_page_accessed(page);
> +		}
> 		goto out_nomap;
> 	}
> 
> @@ -1993,8 +2092,9 @@
> 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> 		if (!pte_none(*page_table))
> 			goto release;
> +		if (!(vma->vm_flags & VM_PURE_PRIVATE))
> +			lru_cache_add_active(page);
> 		inc_mm_counter(mm, anon_rss);
> -		lru_cache_add_active(page);
> 		page_add_new_anon_rmap(page, vma, address);
> 	} else {
> 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
> @@ -2209,6 +2309,20 @@
> 
> 	old_entry = entry = *pte;
> 	if (!pte_present(entry)) {
> +		if (pte_unmapped(entry)) {
> +			BUG_ON(!(vma->vm_flags & VM_PURE_PRIVATE));
> +			struct page* page = pte_page(entry);
> +			pte_t temp_pte = mk_pte(page, vma->vm_page_prot);
> +			pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> +			if (unlikely(pte_same(*pte, entry))) {
> +				page_add_new_anon_rmap(page, vma, address);
> +				set_pte_at(mm, address, pte, temp_pte);
> +				update_mmu_cache(vma, address, temp_pte);
> +				lazy_mmu_prot_update(temp_pte);
> +			}
> +			pte_unmap_unlock(pte, ptl);
> +			return VM_FAULT_MINOR;
> +		}
> 		if (pte_none(entry)) {
> 			if (!vma->vm_ops || !vma->vm_ops->nopage)
> 				return do_anonymous_page(mm, vma, address,
> @@ -2445,3 +2559,112 @@
> }
> 
> #endif	/* __HAVE_ARCH_GATE_AREA */
> +
> +static void migrate_back_pte_range(struct mm_struct* mm, pmd_t *pmd, struct
> +		vm_area_struct *vma, unsigned long addr, unsigned long end)
> +{
> +	struct page* page;
> +	pte_t entry;
> +	pte_t* pte;
> +	spinlock_t* ptl;
> +
> +	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +	do {
> +		if (!pte_present(*pte) && pte_unmapped(*pte)) {
> +			page = pte_page(*pte);
> +			entry = mk_pte(page, vma->vm_page_prot);
> +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +			set_pte_at(mm, addr, pte, entry);
> +			BUG_ON(page == ZERO_PAGE(addr));
> +			page_add_new_anon_rmap(page, vma, addr);
> +		}
> +		if (pte_present(*pte)) {
> +			page = pte_page(*pte);
> +			if (page == ZERO_PAGE(addr))
> +				continue;
> +			lru_cache_add_active(page);
> +		}
> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	pte_unmap_unlock(pte - 1, ptl);
> +	lru_add_drain();
> +}
> +
> +static void migrate_back_pmd_range(struct mm_struct* mm, pud_t *pud, struct
> +		vm_area_struct *vma, unsigned long addr, unsigned long end)
> +{
> +	pmd_t *pmd;
> +	unsigned long next;
> +
> +	pmd = pmd_offset(pud, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (pmd_none_or_clear_bad(pmd))
> +			continue;
> +		migrate_back_pte_range(mm, pmd, vma, addr, next);
> +	} while (pmd++, addr = next, addr != end);
> +}
> +
> +static void migrate_back_pud_range(struct mm_struct* mm, pgd_t *pgd, struct
> +		vm_area_struct *vma, unsigned long addr, unsigned long end)
> +{
> +	pud_t *pud;
> +	unsigned long next;
> +
> +	pud = pud_offset(pgd, addr);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (pud_none_or_clear_bad(pud))
> +			continue;
> +		migrate_back_pmd_range(mm, pud, vma, addr, next);
> +	} while (pud++, addr = next, addr != end);
> +}
> +
> +// migrate all pages of pure private vma back to Linux legacy memory
> management.
> +static void migrate_back_legacy_linux(struct mm_struct* mm, struct
> vm_area_struct* vma)
> +{
> +	pgd_t* pgd;
> +	unsigned long next;
> +	unsigned long addr = vma->vm_start;
> +	unsigned long end = vma->vm_end;
> +
> +	pgd = pgd_offset(mm, addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none_or_clear_bad(pgd))
> +			continue;
> +		migrate_back_pud_range(mm, pgd, vma, addr, next);
> +	} while (pgd++, addr = next, addr != end);
> +}
> +
> +LIST_HEAD(pps_head);
> +LIST_HEAD(pps_head_buddy);
> +
> +DEFINE_SPINLOCK(pps_lock);
> +
> +void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma)
> +{
> +	int condition = VM_READ | VM_WRITE | VM_EXEC | \
> +		 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \
> +		 VM_GROWSDOWN | VM_GROWSUP | \
> +		 VM_LOCKED | VM_SEQ_READ | VM_RAND_READ | VM_DONTCOPY | 
> VM_ACCOUNT;
> +	if (!(vma->vm_flags & ~condition) && vma->vm_file == NULL) {
> +		vma->vm_flags |= VM_PURE_PRIVATE;
> +		if (list_empty(&mm->mmlist)) {
> +			spin_lock(&mmlist_lock);
> +			if (list_empty(&mm->mmlist))
> +				list_add(&mm->mmlist, &init_mm.mmlist);
> +			spin_unlock(&mmlist_lock);
> +		}
> +	}
> +}
> +
> +void leave_pps(struct vm_area_struct* vma, int migrate_flag)
> +{
> +	struct mm_struct* mm = vma->vm_mm;
> +
> +	if (vma->vm_flags & VM_PURE_PRIVATE) {
> +		vma->vm_flags &= ~VM_PURE_PRIVATE;
> +		if (migrate_flag)
> +			migrate_back_legacy_linux(mm, vma);
> +	}
> +}
> Index: linux-2.6.16.29/mm/mmap.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/mmap.c	2006-12-29 13:56:51.000000000 +0800
> +++ linux-2.6.16.29/mm/mmap.c	2006-12-29 13:57:20.000000000 +0800
> @@ -206,6 +206,7 @@
> 	if (vma->vm_file)
> 		fput(vma->vm_file);
> 	mpol_free(vma_policy(vma));
> +	leave_pps(vma, 0);
> 	kmem_cache_free(vm_area_cachep, vma);
> 	return next;
> }
> @@ -593,6 +594,7 @@
> 			fput(file);
> 		mm->map_count--;
> 		mpol_free(vma_policy(next));
> +		leave_pps(next, 0);
> 		kmem_cache_free(vm_area_cachep, next);
> 		/*
> 		 * In mprotect's case 6 (see comments on vma_merge),
> @@ -1091,6 +1093,8 @@
> 	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
> 		vma->vm_flags &= ~VM_ACCOUNT;
> 
> +	enter_pps(mm, vma);
> +
> 	/* Can addr have changed??
> 	 *
> 	 * Answer: Yes, several device drivers can do it in their
> @@ -1113,6 +1117,7 @@
> 			fput(file);
> 		}
> 		mpol_free(vma_policy(vma));
> +		leave_pps(vma, 0);
> 		kmem_cache_free(vm_area_cachep, vma);
> 	}
> out:	
> @@ -1140,6 +1145,7 @@
> 	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
> 	charged = 0;
> free_vma:
> +	leave_pps(vma, 0);
> 	kmem_cache_free(vm_area_cachep, vma);
> unacct_error:
> 	if (charged)
> @@ -1717,6 +1723,10 @@
> 
> 	/* most fields are the same, copy all, and then fixup */
> 	*new = *vma;
> +	if (new->vm_flags & VM_PURE_PRIVATE) {
> +		new->vm_flags &= ~VM_PURE_PRIVATE;
> +		enter_pps(mm, new);
> +	}
> 
> 	if (new_below)
> 		new->vm_end = addr;
> @@ -1917,6 +1927,7 @@
> 	vma->vm_pgoff = pgoff;
> 	vma->vm_flags = flags;
> 	vma->vm_page_prot = protection_map[flags & 0x0f];
> +	enter_pps(mm, vma);
> 	vma_link(mm, vma, prev, rb_link, rb_parent);
> out:
> 	mm->total_vm += len >> PAGE_SHIFT;
> @@ -2040,6 +2051,10 @@
> 				get_file(new_vma->vm_file);
> 			if (new_vma->vm_ops && new_vma->vm_ops->open)
> 				new_vma->vm_ops->open(new_vma);
> +			if (new_vma->vm_flags & VM_PURE_PRIVATE) {
> +				new_vma->vm_flags &= ~VM_PURE_PRIVATE;
> +				enter_pps(mm, new_vma);
> +			}
> 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
> 		}
> 	}
> Index: linux-2.6.16.29/mm/rmap.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/rmap.c	2006-12-29 13:56:51.000000000 +0800
> +++ linux-2.6.16.29/mm/rmap.c	2006-12-29 13:57:21.000000000 +0800
> @@ -633,7 +633,7 @@
> 			spin_unlock(&mmlist_lock);
> 		}
> 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
> -		BUG_ON(pte_file(*pte));
> +		BUG_ON(!pte_swapped(*pte));
> 		dec_mm_counter(mm, anon_rss);
> 	} else
> 		dec_mm_counter(mm, file_rss);
> Index: linux-2.6.16.29/mm/swap_state.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/swap_state.c	2006-12-29 
> 13:56:51.000000000 +0800
> +++ linux-2.6.16.29/mm/swap_state.c	2006-12-29 13:57:20.000000000 +0800
> @@ -354,7 +354,8 @@
> 			/*
> 			 * Initiate read into locked page and return.
> 			 */
> -			lru_cache_add_active(new_page);
> +			if (vma == NULL || !(vma->vm_flags & 
> VM_PURE_PRIVATE))
> +				lru_cache_add_active(new_page);
> 			swap_readpage(NULL, new_page);
> 			return new_page;
> 		}
> Index: linux-2.6.16.29/mm/swapfile.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/swapfile.c	2006-12-29 13:56:52.000000000 +0800
> +++ linux-2.6.16.29/mm/swapfile.c	2006-12-29 13:57:21.000000000 +0800
> @@ -7,6 +7,7 @@
> 
> #include <linux/config.h>
> #include <linux/mm.h>
> +#include <linux/mm_inline.h>
> #include <linux/hugetlb.h>
> #include <linux/mman.h>
> #include <linux/slab.h>
> @@ -417,6 +418,163 @@
> 	}
> }
> 
> +static int pps_test_swap_type(struct mm_struct* mm, pmd_t* pmd, pte_t* 
> pte, int
> +		type, struct page** ret_page)
> +{
> +	spinlock_t* ptl = pte_lockptr(mm, pmd);
> +	swp_entry_t entry;
> +	struct page* page;
> +
> +	spin_lock(ptl);
> +	if (!pte_present(*pte) && pte_swapped(*pte)) {
> +		entry = pte_to_swp_entry(*pte);
> +		if (swp_type(entry) == type) {
> +			*ret_page = NULL;
> +			spin_unlock(ptl);
> +			return 1;
> +		}
> +	} else {
> +		page = pfn_to_page(pte_pfn(*pte));
> +		if (PageSwapCache(page)) {
> +			entry.val = page_private(page);
> +			if (swp_type(entry) == type) {
> +				page_cache_get(page);
> +				*ret_page = page;
> +				spin_unlock(ptl);
> +				return 1;
> +			}
> +		}
> +	}
> +	spin_unlock(ptl);
> +	return 0;
> +}
> +
> +static int pps_swapoff_scan_ptes(struct mm_struct* mm, struct 
> vm_area_struct*
> +		vma, pmd_t* pmd, unsigned long addr, unsigned long end, int 
> type)
> +{
> +	pte_t *pte;
> +	struct page* page;
> +
> +	pte = pte_offset_map(pmd, addr);
> +	do {
> +		while (pps_test_swap_type(mm, pmd, pte, type, &page)) {
> +			if (page == NULL) {
> +				switch (__handle_mm_fault(mm, vma, addr, 0)) 
> {
> +				case VM_FAULT_SIGBUS:
> +				case VM_FAULT_OOM:
> +					return -ENOMEM;
> +				case VM_FAULT_MINOR:
> +				case VM_FAULT_MAJOR:
> +					break;
> +				default:
> +					BUG();
> +				}
> +			} else {
> +				wait_on_page_locked(page);
> +				wait_on_page_writeback(page);
> +				lock_page(page);
> +				if (!PageSwapCache(page)) {
> +					unlock_page(page);
> +					page_cache_release(page);
> +					break;
> +				}
> +				wait_on_page_writeback(page);
> +				delete_from_swap_cache(page);
> +				unlock_page(page);
> +				page_cache_release(page);
> +				break;
> +			}
> +		}
> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	return 0;
> +}
> +
> +static int pps_swapoff_pmd_range(struct mm_struct* mm, struct 
> vm_area_struct*
> +		vma, pud_t* pud, unsigned long addr, unsigned long end, int 
> type)
> +{
> +	unsigned long next;
> +	int ret;
> +	pmd_t* pmd = pmd_offset(pud, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (pmd_none_or_clear_bad(pmd))
> +			continue;
> +		ret = pps_swapoff_scan_ptes(mm, vma, pmd, addr, next, type);
> +		if (ret == -ENOMEM)
> +			return ret;
> +	} while (pmd++, addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int pps_swapoff_pud_range(struct mm_struct* mm, struct 
> vm_area_struct*
> +		vma, pgd_t* pgd, unsigned long addr, unsigned long end, int 
> type)
> +{
> +	unsigned long next;
> +	int ret;
> +	pud_t* pud = pud_offset(pgd, addr);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (pud_none_or_clear_bad(pud))
> +			continue;
> +		ret = pps_swapoff_pmd_range(mm, vma, pud, addr, next, type);
> +		if (ret == -ENOMEM)
> +			return ret;
> +	} while (pud++, addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int pps_swapoff_pgd_range(struct mm_struct* mm, struct 
> vm_area_struct*
> +		vma, int type)
> +{
> +	unsigned long next;
> +	unsigned long addr = vma->vm_start;
> +	unsigned long end = vma->vm_end;
> +	int ret;
> +	pgd_t* pgd = pgd_offset(mm, addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none_or_clear_bad(pgd))
> +			continue;
> +		ret = pps_swapoff_pud_range(mm, vma, pgd, addr, next, type);
> +		if (ret == -ENOMEM)
> +			return ret;
> +	} while (pgd++, addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int pps_swapoff(int type)
> +{
> +	struct mm_struct* mm;
> +	struct vm_area_struct* vma;
> +	struct list_head *pos, *lhtemp;
> +	int ret = 0;
> +
> +	spin_lock(&mmlist_lock);
> +	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
> +		mm = list_entry(pos, struct mm_struct, mmlist);
> +		if (atomic_inc_return(&mm->mm_users) == 1) {
> +			atomic_dec(&mm->mm_users);
> +			continue;
> +		}
> +		spin_unlock(&mmlist_lock);
> +		down_read(&mm->mmap_sem);
> +		for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
> +			if (!(vma->vm_flags & VM_PURE_PRIVATE))
> +				continue;
> +			if (vma->vm_flags & VM_LOCKED)
> +				continue;
> +			ret = pps_swapoff_pgd_range(mm, vma, type);
> +			if (ret == -ENOMEM)
> +				break;
> +		}
> +		up_read(&mm->mmap_sem);
> +		mmput(mm);
> +		spin_lock(&mmlist_lock);
> +	}
> +	spin_unlock(&mmlist_lock);
> +	return ret;
> +}
> +
> /*
>  * No need to decide whether this PTE shares the swap entry with others,
>  * just let do_wp_page work it out if a write is requested later - to
> @@ -619,6 +777,12 @@
> 	int reset_overflow = 0;
> 	int shmem;
> 
> +	// Let's first read all pps pages back! Note, it's one-to-one 
> mapping.
> +	retval = pps_swapoff(type);
> +	if (retval == -ENOMEM) // something was wrong.
> +		return -ENOMEM;
> +	// Now, the remain pages are shared pages, go ahead!
> +
> 	/*
> 	 * When searching mms for an entry, a good strategy is to
> 	 * start at the first mm we freed the previous entry from
> @@ -848,16 +1012,20 @@
>  */
> static void drain_mmlist(void)
> {
> -	struct list_head *p, *next;
> +	// struct list_head *p, *next;
> 	unsigned int i;
> 
> 	for (i = 0; i < nr_swapfiles; i++)
> 		if (swap_info[i].inuse_pages)
> 			return;
> +	/*
> +	 * Now, init_mm.mmlist list not only is used by SwapDevice but also 
> is used
> +	 * by PPS.
> 	spin_lock(&mmlist_lock);
> 	list_for_each_safe(p, next, &init_mm.mmlist)
> 		list_del_init(p);
> 	spin_unlock(&mmlist_lock);
> +	*/
> }
> 
> /*
> Index: linux-2.6.16.29/mm/vmscan.c
> ===================================================================
> --- linux-2.6.16.29.orig/mm/vmscan.c	2006-12-29 13:56:51.000000000 +0800
> +++ linux-2.6.16.29/mm/vmscan.c	2006-12-29 13:58:30.000000000 +0800
> @@ -1514,6 +1514,398 @@
> 	return ret;
> }
> 
> +struct series_t {
> +	pte_t orig_ptes[MAX_SERIES_LENGTH];
> +	pte_t* ptes[MAX_SERIES_LENGTH];
> +	struct page* pages[MAX_SERIES_LENGTH];
> +	int series_length;
> +	int series_stage;
> +} series;
> +
> +static int get_series_stage(pte_t* pte, int index)
> +{
> +	series.orig_ptes[index] = *pte;
> +	series.ptes[index] = pte;
> +	if (pte_present(series.orig_ptes[index])) {
> +		struct page* page = 
> pfn_to_page(pte_pfn(series.orig_ptes[index]));
> +		series.pages[index] = page;
> +		if (page == ZERO_PAGE(addr)) // reserved page is exclusive 
> from us.
> +			return 7;
> +		if (pte_young(series.orig_ptes[index])) {
> +			return 1;
> +		} else
> +			return 2;
> +	} else if (pte_unmapped(series.orig_ptes[index])) {
> +		struct page* page = 
> pfn_to_page(pte_pfn(series.orig_ptes[index]));
> +		series.pages[index] = page;
> +		if (!PageSwapCache(page))
> +			return 3;
> +		else {
> +			if (PageWriteback(page) || PageDirty(page))
> +				return 4;
> +			else
> +				return 5;
> +		}
> +	} else // pte_swapped -- SwappedPTE
> +		return 6;
> +}
> +
> +static void find_series(pte_t** start, unsigned long* addr, unsigned long 
> end)
> +{
> +	int i;
> +	int series_stage = get_series_stage((*start)++, 0);
> +	*addr += PAGE_SIZE;
> +
> +	for (i = 1; i < MAX_SERIES_LENGTH && *addr < end; i++, (*start)++,
> *addr += PAGE_SIZE) {
> +		if (series_stage != get_series_stage(*start, i))
> +			break;
> +	}
> +	series.series_stage = series_stage;
> +	series.series_length = i;
> +}
> +
> +struct delay_tlb_task_t delay_tlb_tasks[32] = { [0 ... 31] = {0} };
> +
> +void timer_flush_tlb_tasks(void* data)
> +{
> +	// To x86, if we found there were some flushing tasks, we should do
> it all together, that is, flush it once.
> +	int i;
> +#ifdef CONFIG_X86
> +	int flag = 0;
> +#endif
> +	for (i = 0; i < 32; i++) {
> +		if (delay_tlb_tasks[i].mm != NULL &&
> +				cpu_isset(smp_processor_id(), 
> delay_tlb_tasks[i].mm->cpu_vm_mask) &&
> +				cpu_isset(smp_processor_id(), 
> delay_tlb_tasks[i].cpu_mask)) {
> +#ifdef CONFIG_X86
> +			flag = 1;
> +#elif
> +			// smp::local_flush_tlb_range(delay_tlb_tasks[i]);
> +#endif
> +			cpu_clear(smp_processor_id(), 
> delay_tlb_tasks[i].cpu_mask);
> +		}
> +	}
> +#ifdef CONFIG_X86
> +	if (flag)
> +		local_flush_tlb();
> +#endif
> +}
> +
> +static struct delay_tlb_task_t* delay_task = NULL;
> +static int vma_index = 0;
> +
> +static struct delay_tlb_task_t* search_free_tlb_tasks_slot(void)
> +{
> +	struct delay_tlb_task_t* ret = NULL;
> +	int i;
> +again:
> +	for (i = 0; i < 32; i++) {
> +		if (delay_tlb_tasks[i].mm != NULL) {
> +			if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
> +				mmput(delay_tlb_tasks[i].mm);
> +				delay_tlb_tasks[i].mm = NULL;
> +				ret = &delay_tlb_tasks[i];
> +			}
> +		} else
> +			ret = &delay_tlb_tasks[i];
> +	}
> +	if (!ret) { // Force flush TLBs.
> +		on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
> +		goto again;
> +	}
> +	return ret;
> +}
> +
> +static void init_delay_task(struct mm_struct* mm)
> +{
> +	cpus_clear(delay_task->cpu_mask);
> +	vma_index = 0;
> +	delay_task->mm = mm;
> +}
> +
> +/*
> + * We will be working on the mm, so let's force to flush it if necessary.
> + */
> +static void start_tlb_tasks(struct mm_struct* mm)
> +{
> +	int i, flag = 0;
> +again:
> +	for (i = 0; i < 32; i++) {
> +		if (delay_tlb_tasks[i].mm == mm) {
> +			if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
> +				mmput(delay_tlb_tasks[i].mm);
> +				delay_tlb_tasks[i].mm = NULL;
> +			} else
> +				flag = 1;
> +		}
> +	}
> +	if (flag) { // Force flush TLBs.
> +		on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
> +		goto again;
> +	}
> +	BUG_ON(delay_task != NULL);
> +	delay_task = search_free_tlb_tasks_slot();
> +	init_delay_task(mm);
> +}
> +
> +static void end_tlb_tasks(void)
> +{
> +	if (!cpus_empty(delay_task->cpu_mask)) {
> +		atomic_inc(&delay_task->mm->mm_users);
> +		delay_task->cpu_mask = delay_task->mm->cpu_vm_mask;
> +	} else
> +		delay_task->mm = NULL;
> +	delay_task = NULL;
> +}
> +
> +static void fill_in_tlb_tasks(struct vm_area_struct* vma, unsigned long 
> addr,
> +		unsigned long end)
> +{
> +	struct mm_struct* mm;
> +fill_it:
> +	if (vma_index != 32) {
> +		delay_task->vma[vma_index] = vma;
> +		delay_task->start[vma_index] = addr;
> +		delay_task->end[vma_index] = end;
> +		vma_index++;
> +		return;
> +	}
> +	mm = delay_task->mm;
> +	end_tlb_tasks();
> +
> +	delay_task = search_free_tlb_tasks_slot();
> +	init_delay_task(mm);
> +	goto fill_it;
> +}
> +
> +static void shrink_pvma_scan_ptes(struct scan_control* sc,
> +		struct mm_struct* mm, struct vm_area_struct* vma, pmd_t* pmd,
> +		unsigned long addr, unsigned long end)
> +{
> +	int i;
> +	spinlock_t* ptl = pte_lockptr(mm, pmd);
> +	pte_t* pte = pte_offset_map(pmd, addr);
> +	int anon_rss = 0;
> +	struct pagevec freed_pvec;
> +	int may_enter_fs = (sc->gfp_mask & (__GFP_FS | __GFP_IO));
> +	struct address_space* mapping = &swapper_space;
> +
> +	pagevec_init(&freed_pvec, 1);
> +	do {
> +		memset(&series, 0, sizeof(struct series_t));
> +		find_series(&pte, &addr, end);
> +		switch (series.series_stage) {
> +			case 1: // PTE -- untouched PTE.
> +				for (i = 0; i < series.series_length; i++) {
> +					struct page* page = series.pages[i];
> +					lock_page(page);
> +					spin_lock(ptl);
> +					if 
> (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
> +						if 
> (pte_dirty(*series.ptes[i]))
> +							set_page_dirty(page);
> +						set_pte_at(mm, addr + i * 
> PAGE_SIZE, series.ptes[i],
> +							 
> pte_mkold(pte_mkclean(*series.ptes[i])));
> +					}
> +					spin_unlock(ptl);
> +					unlock_page(page);
> +				}
> +				fill_in_tlb_tasks(vma, addr, addr + 
> (PAGE_SIZE * series.series_length));
> +				break;
> +			case 2: // untouched PTE -- UnmappedPTE.
> +				/*
> +				 * Note in stage 1, we've flushed TLB in 
> fill_in_tlb_tasks, so
> +				 * if it's still clear here, we can shift it 
> to Unmapped type.
> +				 *
> +				 * If some architecture doesn't support 
> atomic cmpxchg
> +				 * instruction or can't atomically set the 
> access bit after
> +				 * they touch a pte at first, combine stage 
> 1 with stage 2, and
> +				 * send IPI immediately in fill_in_tlb_tasks.
> +				 */
> +				spin_lock(ptl);
> +				for (i = 0; i < series.series_length; i++) {
> +					if 
> (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
> +						pte_t pte_unmapped = 
> series.orig_ptes[i];
> +						pte_unmapped.pte_low &= 
> ~_PAGE_PRESENT;
> +						pte_unmapped.pte_low |= 
> _PAGE_UNMAPPED;
> +						if 
> (cmpxchg(&series.ptes[i]->pte_low,
> +								 
> series.orig_ptes[i].pte_low,
> +								 
> pte_unmapped.pte_low) !=
> +							 
> series.orig_ptes[i].pte_low)
> +							continue;
> +					 page_remove_rmap(series.pages[i]);
> +						anon_rss--;
> +					}
> +				}
> +				spin_unlock(ptl);
> +				break;
> +			case 3: // Attach SwapPage to PrivatePage.
> +				/*
> +				 * A better arithmetic should be applied to 
> Linux SwapDevice to
> +				 * allocate fake continual SwapPages which 
> are close to each
> +				 * other, the offset between two close 
> SwapPages is less than 8.
> +				 */
> +				if (sc->may_swap) {
> +					for (i = 0; i < 
> series.series_length; i++) {
> +						lock_page(series.pages[i]);
> +						if 
> (!PageSwapCache(series.pages[i])) {
> +							if 
> (!add_to_swap(series.pages[i], GFP_ATOMIC)) {
> +							 
> unlock_page(series.pages[i]);
> +								break;
> +							}
> +						}
> +						unlock_page(series.pages[i]);
> +					}
> +				}
> +				break;
> +			case 4: // SwapPage isn't consistent with 
> PrivatePage.
> +				/*
> +				 * A mini version pageout().
> +				 *
> +				 * Current swap space can't commit multiple 
> pages together:(
> +				 */
> +				if (sc->may_writepage && may_enter_fs) {
> +					for (i = 0; i < 
> series.series_length; i++) {
> +						struct page* page = 
> series.pages[i];
> +						int res;
> +
> +						if 
> (!may_write_to_queue(mapping->backing_dev_info))
> +							break;
> +						lock_page(page);
> +						if (!PageDirty(page) || 
> PageWriteback(page)) {
> +							unlock_page(page);
> +							continue;
> +						}
> +					 clear_page_dirty_for_io(page);
> +						struct writeback_control wbc 
> = {
> +							.sync_mode = 
> WB_SYNC_NONE,
> +							.nr_to_write = 
> SWAP_CLUSTER_MAX,
> +							.nonblocking = 1,
> +							.for_reclaim = 1,
> +						};
> +						page_cache_get(page);
> +						SetPageReclaim(page);
> +						res = swap_writepage(page, 
> &wbc);
> +						if (res < 0) {
> +						 handle_write_error(mapping, 
> page, res);
> +						 ClearPageReclaim(page);
> +						 page_cache_release(page);
> +							break;
> +						}
> +						if (!PageWriteback(page))
> +						 ClearPageReclaim(page);
> +						page_cache_release(page);
> +					}
> +				}
> +				break;
> +			case 5: // UnmappedPTE -- SwappedPTE, reclaim 
> PrivatePage.
> +				for (i = 0; i < series.series_length; i++) {
> +					struct page* page = series.pages[i];
> +					lock_page(page);
> +					spin_lock(ptl);
> +					if 
> (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
> +						spin_unlock(ptl);
> +						unlock_page(page);
> +						continue;
> +					}
> +					swp_entry_t entry = { .val = 
> page_private(page) };
> +					swap_duplicate(entry);
> +					pte_t pte_swp = 
> swp_entry_to_pte(entry);
> +					set_pte_at(mm, addr + i * PAGE_SIZE, 
> series.ptes[i], pte_swp);
> +					spin_unlock(ptl);
> +					if (PageSwapCache(page) && 
> !PageWriteback(page))
> +						delete_from_swap_cache(page);
> +					unlock_page(page);
> +
> +					if (!pagevec_add(&freed_pvec, page))
> +					 
> __pagevec_release_nonlru(&freed_pvec);
> +					sc->nr_reclaimed++;
> +				}
> +				break;
> +			case 6:
> +				// NULL operation!
> +				break;
> +		}
> +	} while (addr < end);
> +	add_mm_counter(mm, anon_rss, anon_rss);
> +	if (pagevec_count(&freed_pvec))
> +		__pagevec_release_nonlru(&freed_pvec);
> +}
> +
> +static void shrink_pvma_pmd_range(struct scan_control* sc, struct
> mm_struct* mm,
> +		struct vm_area_struct* vma, pud_t* pud,
> +		unsigned long addr, unsigned long end)
> +{
> +	unsigned long next;
> +	pmd_t* pmd = pmd_offset(pud, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (pmd_none_or_clear_bad(pmd))
> +			continue;
> +		shrink_pvma_scan_ptes(sc, mm, vma, pmd, addr, next);
> +	} while (pmd++, addr = next, addr != end);
> +}
> +
> +static void shrink_pvma_pud_range(struct scan_control* sc, struct
> mm_struct* mm,
> +		struct vm_area_struct* vma, pgd_t* pgd,
> +		unsigned long addr, unsigned long end)
> +{
> +	unsigned long next;
> +	pud_t* pud = pud_offset(pgd, addr);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (pud_none_or_clear_bad(pud))
> +			continue;
> +		shrink_pvma_pmd_range(sc, mm, vma, pud, addr, next);
> +	} while (pud++, addr = next, addr != end);
> +}
> +
> +static void shrink_pvma_pgd_range(struct scan_control* sc, struct
> mm_struct* mm,
> +		struct vm_area_struct* vma)
> +{
> +	unsigned long next;
> +	unsigned long addr = vma->vm_start;
> +	unsigned long end = vma->vm_end;
> +	pgd_t* pgd = pgd_offset(mm, addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none_or_clear_bad(pgd))
> +			continue;
> +		shrink_pvma_pud_range(sc, mm, vma, pgd, addr, next);
> +	} while (pgd++, addr = next, addr != end);
> +}
> +
> +static void shrink_private_vma(struct scan_control* sc)
> +{
> +	struct mm_struct* mm;
> +	struct vm_area_struct* vma;
> +	struct list_head *pos, *lhtemp;
> +
> +	spin_lock(&mmlist_lock);
> +	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
> +		mm = list_entry(pos, struct mm_struct, mmlist);
> +		if (atomic_inc_return(&mm->mm_users) == 1) {
> +			atomic_dec(&mm->mm_users);
> +			continue;
> +		}
> +		spin_unlock(&mmlist_lock);
> +		start_tlb_tasks(mm);
> +		if (down_read_trylock(&mm->mmap_sem)) {
> +			for (vma = mm->mmap; vma != NULL; vma = 
> vma->vm_next) {
> +				if (!(vma->vm_flags & VM_PURE_PRIVATE))
> +					continue;
> +				if (vma->vm_flags & VM_LOCKED)
> +					continue;
> +				shrink_pvma_pgd_range(sc, mm, vma);
> +			}
> +			up_read(&mm->mmap_sem);
> +		}
> +		end_tlb_tasks();
> +		mmput(mm);
> +		spin_lock(&mmlist_lock);
> +	}
> +	spin_unlock(&mmlist_lock);
> +}
> +
> /*
>  * For kswapd, balance_pgdat() will work across all this node's zones until
>  * they are all at pages_high.
> @@ -1557,6 +1949,7 @@
> 	sc.may_swap = 1;
> 	sc.nr_mapped = read_page_state(nr_mapped);
> 
> +	shrink_private_vma(&sc);
> 	inc_page_state(pageoutrun);
> 
> 	for (i = 0; i < pgdat->nr_zones; i++) {

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-29  9:15     ` Pavel Machek
@ 2006-12-29 15:50       ` Randy Dunlap
  0 siblings, 0 replies; 14+ messages in thread
From: Randy Dunlap @ 2006-12-29 15:50 UTC (permalink / raw)
  To: Pavel Machek; +Cc: yunfeng zhang, linux-kernel, torvalds

On Fri, 29 Dec 2006 10:15:51 +0100 Pavel Machek wrote:

> On Fri 2006-12-29 14:45:33, yunfeng zhang wrote:
> > I've re-published my work on quilt, sorry.
> 
> Your patch is still wordwrapped.
> 
> Do not cc linus on non-final version of the patch.
> 
> Patch should be against latest kernel.
> 
> Patch should have changelog and signed off by.
> 
> Why the change? Do you gain 5% on kernel compile on 20MB box?

+ Don't leave the entire email inline if you are not going to
  comment on it inline.

---
~Randy

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-27  3:33   ` yunfeng zhang
@ 2006-12-30  5:50     ` Zhou Yingchao
  2007-01-05  7:35       ` yunfeng zhang
  0 siblings, 1 reply; 14+ messages in thread
From: Zhou Yingchao @ 2006-12-30  5:50 UTC (permalink / raw)
  To: yunfeng zhang; +Cc: linux-kernel

2006/12/27, yunfeng zhang <zyf.zeroos@gmail.com>:
> To multiple address space, multiple memory inode architecture, we can introduce
> a new core object -- section which has several features
Do you mean "in-memory inode"  or "memory node(pglist_data)" by "memory inode" ?
> The idea issued by me is whether swap subsystem should be deployed on layer 2 or
> layer 3 which is described in Documentation/vm_pps.txt of my patch. To multiple
> memory inode architecture, the special memory model should be encapsulated on
> layer 3 (architecture-dependent), I think.
I guess that you are  wanting to do something to remove arch-dependent
code in swap subsystem.  Just like the pud introduced in the
page-table related codes. Is it right?
However, you should verify that your changes will not deteriorate
system performance. Also, you need to maintain it for a long time with
the evolution of mainline kernel before it is accepted.

Best regards
-- 
Yingchao Zhou
***********************************************
 Institute Of Computing Technology
 Chinese Academy of Sciences
***********************************************

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2006-12-30  5:50     ` Zhou Yingchao
@ 2007-01-05  7:35       ` yunfeng zhang
  2007-01-05  7:45         ` yunfeng zhang
  0 siblings, 1 reply; 14+ messages in thread
From: yunfeng zhang @ 2007-01-05  7:35 UTC (permalink / raw)
  To: Zhou Yingchao; +Cc: linux-kernel

No, a new idea to re-write swap subsystem at all. In fact, it's an
impossible task to me, so I provide a compromising solution -- pps
(pure private page system).

2006/12/30, Zhou Yingchao <yingchao.zhou@gmail.com>:
> 2006/12/27, yunfeng zhang <zyf.zeroos@gmail.com>:
> > To multiple address space, multiple memory inode architecture, we can introduce
> > a new core object -- section which has several features
> Do you mean "in-memory inode"  or "memory node(pglist_data)" by "memory inode" ?
> > The idea issued by me is whether swap subsystem should be deployed on layer 2 or
> > layer 3 which is described in Documentation/vm_pps.txt of my patch. To multiple
> > memory inode architecture, the special memory model should be encapsulated on
> > layer 3 (architecture-dependent), I think.
> I guess that you are  wanting to do something to remove arch-dependent
> code in swap subsystem.  Just like the pud introduced in the
> page-table related codes. Is it right?
> However, you should verify that your changes will not deteriorate
> system performance. Also, you need to maintain it for a long time with
> the evolution of mainline kernel before it is accepted.
>
> Best regards
> --
> Yingchao Zhou
> ***********************************************
>  Institute Of Computing Technology
>  Chinese Academy of Sciences
> ***********************************************
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2007-01-05  7:35       ` yunfeng zhang
@ 2007-01-05  7:45         ` yunfeng zhang
  0 siblings, 0 replies; 14+ messages in thread
From: yunfeng zhang @ 2007-01-05  7:45 UTC (permalink / raw)
  To: linux-kernel; +Cc: pavel, rdunlap

A new patch has been done by me, based on the previous quilt patch
(2.6.16.29). Here is
changelog

--------------------------
>> NEW
New kernel thread kppsd is added to execute background scanning task
periodically (mm/vmscan.c).

PPS statistic is added into /proc/meminfo, its prototype is in
include/linux/mm.h.

Documentation/vm_pps.txt is also updated to show the aboved two new features,
some sections is re-written for comprehension.

>> BUG
New loop code is introduced in shrink_private_vma (mm/vmscan.c) and
pps_swapoff (mm/swapfile.c), contrast with old code, even lhtemp is freed
during loop, it's also safe.

A bug is catched in mm/memory.c:zap_pte_range -- if a PrivatePage is
being written back, it will be migrated back to Linux legacy page system.

A fault done by me in previous patch is remedied in stage 5, now stage 5 can
work.

>> MISCELLANEOUS
UP code has been separated from SMP code in dftlb.
--------------------------

Index: linux-2.6.16.29/Documentation/vm_pps.txt
===================================================================
--- linux-2.6.16.29.orig/Documentation/vm_pps.txt	2007-01-04
14:47:35.000000000 +0800
+++ linux-2.6.16.29/Documentation/vm_pps.txt	2007-01-04 14:49:36.000000000 +0800
@@ -6,11 +6,11 @@
 // Purpose <([{
 The file is used to document the idea which is published firstly at
 http://www.ussg.iu.edu/hypermail/linux/kernel/0607.2/0451.html, as a part of my
-OS -- main page http://blog.chinaunix.net/u/21764/index.php. In brief, a patch
-of the document to enchance the performance of Linux swap subsystem. You can
-find the overview of the idea in section <How to Reclaim Pages more
-Efficiently> and how I patch it into Linux 2.6.16.29 in section <Pure Private
-Page System -- pps>.
+OS -- main page http://blog.chinaunix.net/u/21764/index.php. In brief, the
+patch of the document is for enchancing the performance of Linux swap
+subsystem. You can find the overview of the idea in section <How to Reclaim
+Pages more Efficiently> and how I patch it into Linux 2.6.16.29 in section
+<Pure Private Page System -- pps>.
 // }])>

 // How to Reclaim Pages more Efficiently <([{
@@ -21,7 +21,9 @@
 OK! to modern OS, its memory subsystem can be divided into three layers
 1) Space layer (InodeSpace, UserSpace and CoreSpace).
 2) VMA layer (PrivateVMA and SharedVMA, memory architecture-independent layer).
-3) PTE and page layer (architecture-dependent).
+3) PTE, zone/memory inode layer (architecture-dependent).
+4) Maybe it makes you sense that Page should be placed on the 3rd layer, but
+   here, it's placed on the 2nd layer since it's the basic unit of VMA.

 Since the 2nd layer assembles the much statistic of page-acess information, so
 it's nature that swap subsystem should be deployed and implemented on the 2nd
@@ -41,7 +43,8 @@
 Unfortunately, Linux 2.6.16.29 swap subsystem is based on the 3rd layer -- a
 system on zone::active_list/inactive_list.

-I've finished a patch, see section <Pure Private Page System -- pps>.
Note, it ISN'T perfect.
+I've finished a patch, see section <Pure Private Page System -- pps>. Note, it
+ISN'T perfect.
 // }])>

 // Pure Private Page System -- pps  <([{
@@ -70,7 +73,18 @@
 3) <Private Page Lifecycle of pps> -- how private pages enter in/go off pps.
 4) <VMA Lifecycle of pps> which VMA is belonging to pps.

-PPS uses init_mm.mm_list list to enumerate all swappable UserSpace.
+PPS uses init_mm.mm_list list to enumerate all swappable UserSpace
+(shrink_private_vma).
+
+A new kernel thread -- kppsd is introduced in mm/vmscan.c, its task is to
+execute the stages of pps periodically, note an appropriate timeout ticks is
+necessary so we can give application a chance to re-map back its PrivatePage
+from UnmappedPTE to PTE, that is, show their conglomeration affinity.
+scan_control::pps_cmd field is used to control the behavior of kppsd, = 1 for
+accelerating scanning process and reclaiming pages, it's used in balance_pgdat.
+
+PPS statistic data is appended to /proc/meminfo entry, its prototype is in
+include/linux/mm.h.

 I'm also glad to highlight my a new idea -- dftlb which is described in
 section <Delay to Flush TLB>.
@@ -97,15 +111,19 @@
    gone when a CPU starts to execute the task in timer interrupt, so don't use
    dftlb.
 combine stage 1 with stage 2, and send IPI immediately in fill_in_tlb_tasks.
+
+dftlb increases mm_struct::mm_users to prevent the mm from being freed when
+other CPU works on it.
 // }])>

 // Stage Definition <([{
 The whole process of private page page-out is divided into six stages, as
-showed in shrink_pvma_scan_ptes of mm/vmscan.c
+showed in shrink_pvma_scan_ptes of mm/vmscan.c, the code groups the similar
+pages to a series.
 1) PTE to untouched PTE (access bit is cleared), append flushing
tasks to dftlb.
 2) Convert untouched PTE to UnmappedPTE.
 3) Link SwapEntry to every UnmappedPTE.
-4) Synchronize the page of a UnmappedPTE with its physical swap page.
+4) Flush PrivatePage of UnmappedPTE to its disk SwapPage.
 5) Reclaimed the page and shift UnmappedPTE to SwappedPTE.
 6) SwappedPTE stage.
 // }])>
@@ -114,7 +132,15 @@
 New VMA flag (VM_PURE_PRIVATE) is appended into VMA in include/linux/mm.h.

 New PTE type (UnmappedPTE) is appended into PTE system in
-include/asm-i386/pgtable.h.
+include/asm-i386/pgtable.h. Its prototype is
+struct UnmappedPTE {
+    int present : 1; // must be 0.
+    ...
+    int pageNum : 20;
+};
+The new PTE has a feature, it keeps a link to its PrivatePage and prevent the
+page from being visited by CPU, so you can use it in <Stage Definition> as a
+middleware.
 // }])>

 // Concurrent Racers of Shrinking pps <([{
@@ -125,10 +151,14 @@
 1) mm/swapfile.c    pps_swapoff (swapoff API).
 2) mm/memory.c  do_wp_page, handle_pte_fault::unmapped_pte, do_anonymous_page
    (page-fault).
+
+The VMAs of pps can coexist with madvise, mlock, mprotect, mmap and munmap,
+that is why new VMA created from mmap.c:split_vma can re-enter into pps.
 // }])>

 // Private Page Lifecycle of pps <([{
-All pages belonging to pps are called as pure private page.
+All pages belonging to pps are called as pure private page, its PTE type is PTE
+or UnmappedPTE.

 IN (NOTE, when a pure private page enters into pps, it's also trimmed from
 Linux legacy page system by commeting lru_cache_add_active clause)
@@ -147,9 +177,10 @@
 // }])>

 // VMA Lifecycle of pps <([{
-When a PrivateVMA enters into pps, it's or-ed a new flag -- VM_PURE_PRIVATE,
-the flag is used in the shrink_private_vma of mm/vmscan.c.  Other fields are
-left untouched.
+When a PrivateVMA enters into pps, it's or-ed a new flag -- VM_PURE_PRIVATE in
+memory.c:enter_pps, you can also find which VMA is fit with pps in it, the flag
+is used in the shrink_private_vma of mm/vmscan.c.  Other fields are left
+untouched.

 IN.
 1) fs/exec.c	setup_arg_pages	(StackVMA).
@@ -173,18 +204,9 @@
    UnmappedPTE) and PrivatePage (SwapPage) which is described in my OS and the
    aboved hyperlink of Linux kernel mail list. So it's a compromise to use
    Linux legacy SwapCache in my pps.
-2) SwapCache should provide more flexible interfaces, shrink_pvma_scan_ptes
+2) SwapSpace should provide more flexible interfaces, shrink_pvma_scan_ptes
    need allocate swap entries in batch, exactly, allocate a batch of fake
    continual swap entries, see mm/pps_swapin_readahead.
-3) pps statistic entry in /proc/meminfo.
-4) a better arithmetic to pick mm out to scan and shrink in shrink_private_vma.
-5) It's better to execute the first 2 stages when system is idle, current
-   SwapDaemon only is activated when free pages are low.
-6) A scanning count should be added into mm_struct, so when the count is
-   becoming enough old to open stage 3 and 4.
-
-I'm still working on improvement 4, 5 and 6 to find out how to maximum the
-performance of swap subsystem.

 If Linux kernel group can't make a schedule to re-write their memory code,
 however, pps maybe is the best solution until now.
Index: linux-2.6.16.29/fs/exec.c
===================================================================
--- linux-2.6.16.29.orig/fs/exec.c	2007-01-04 14:47:35.000000000 +0800
+++ linux-2.6.16.29/fs/exec.c	2007-01-04 14:49:36.000000000 +0800
@@ -320,6 +320,8 @@
 		pte_unmap_unlock(pte, ptl);
 		goto out;
 	}
+	atomic_inc(&pps_info.total);
+	atomic_inc(&pps_info.pte_count);
 	inc_mm_counter(mm, anon_rss);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
Index: linux-2.6.16.29/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.16.29.orig/fs/proc/proc_misc.c	2007-01-04 14:47:35.000000000 +0800
+++ linux-2.6.16.29/fs/proc/proc_misc.c	2007-01-04 14:49:36.000000000 +0800
@@ -174,7 +174,11 @@
 		"PageTables:   %8lu kB\n"
 		"VmallocTotal: %8lu kB\n"
 		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n",
+		"VmallocChunk: %8lu kB\n"
+		"PPS Total:    %8d kB\n"
+		"PPS PTE:      %8d kB\n"
+		"PPS Unmapped: %8d kB\n"
+		"PPS Swapped:  %8d kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
@@ -197,7 +201,11 @@
 		K(ps.nr_page_table_pages),
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
-		vmi.largest_chunk >> 10
+		vmi.largest_chunk >> 10,
+		K(pps_info.total.counter),
+		K(pps_info.pte_count.counter),
+		K(pps_info.unmapped_count.counter),
+		K(pps_info.swapped_count.counter)
 		);

 		len += hugetlb_report_meminfo(page + len);
Index: linux-2.6.16.29/include/asm-i386/mmu_context.h
===================================================================
--- linux-2.6.16.29.orig/include/asm-i386/mmu_context.h	2007-01-04
14:47:35.000000000 +0800
+++ linux-2.6.16.29/include/asm-i386/mmu_context.h	2007-01-04
14:49:36.000000000 +0800
@@ -33,6 +33,9 @@
 		/* stop flush ipis for the previous mm */
 		cpu_clear(cpu, prev->cpu_vm_mask);
 #ifdef CONFIG_SMP
+		// vmscan.c::end_tlb_tasks maybe had copied cpu_vm_mask before we leave
+		// prev, so let's flush the trace of prev of delay_tlb_tasks.
+		timer_flush_tlb_tasks(NULL);
 		per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
 		per_cpu(cpu_tlbstate, cpu).active_mm = next;
 #endif
Index: linux-2.6.16.29/include/linux/mm.h
===================================================================
--- linux-2.6.16.29.orig/include/linux/mm.h	2007-01-04 14:47:36.000000000 +0800
+++ linux-2.6.16.29/include/linux/mm.h	2007-01-04 14:49:37.000000000 +0800
@@ -1058,8 +1058,16 @@
 extern int randomize_va_space;
 #endif

+struct pps_info {
+	atomic_t total;
+	atomic_t pte_count; // stage 1 and 2.
+	atomic_t unmapped_count; // stage 3 and 4.
+	atomic_t swapped_count; // stage 6.
+};
+extern struct pps_info pps_info;
+
 /* vmscan.c::delay flush TLB */
-struct delay_tlb_task_t
+struct delay_tlb_task
 {
 	struct mm_struct* mm;
 	cpumask_t cpu_mask;
@@ -1067,7 +1075,7 @@
 	unsigned long start[32];
 	unsigned long end[32];
 };
-extern struct delay_tlb_task_t delay_tlb_tasks[32];
+extern struct delay_tlb_task delay_tlb_tasks[32];

 // The prototype of the function is fit with the "func" of "int
 // smp_call_function (void (*func) (void *info), void *info, int retry, int
Index: linux-2.6.16.29/kernel/timer.c
===================================================================
--- linux-2.6.16.29.orig/kernel/timer.c	2007-01-04 14:47:37.000000000 +0800
+++ linux-2.6.16.29/kernel/timer.c	2007-01-04 14:49:37.000000000 +0800
@@ -843,7 +843,9 @@
 	scheduler_tick();
  	run_posix_cpu_timers(p);

+#ifdef SMP
 	timer_flush_tlb_tasks(NULL);
+#endif
 }

 /*
Index: linux-2.6.16.29/mm/memory.c
===================================================================
--- linux-2.6.16.29.orig/mm/memory.c	2007-01-04 14:47:37.000000000 +0800
+++ linux-2.6.16.29/mm/memory.c	2007-01-04 14:49:37.000000000 +0800
@@ -615,6 +615,9 @@
 	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
+	int pps_pte = 0;
+	int pps_unmapped = 0;
+	int pps_swapped = 0;

 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
@@ -658,8 +661,13 @@
 						addr) != page->index)
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
-			// if (vma->vm_flags & VM_PURE_PRIVATE && page != ZERO_PAGE(addr))
-			// 	lru_cache_add_active(page);
+			if (vma->vm_flags & VM_PURE_PRIVATE) {
+				if (page != ZERO_PAGE(addr)) {
+					if (PageWriteback(page))
+						lru_cache_add_active(page);
+					pps_pte++;
+				}
+			}
 			if (PageAnon(page))
 				anon_rss--;
 			else {
@@ -682,18 +690,28 @@
 		if (pte_unmapped(ptent)) {
 			struct page *page;
 			page = pfn_to_page(pte_pfn(ptent));
+			BUG_ON(page == ZERO_PAGE(addr));
+			if (PageWriteback(page))
+				lru_cache_add_active(page);
+			pps_unmapped++;
 			pte_clear_full(mm, addr, pte, tlb->fullmm);
-			// lru_cache_add_active(page);
 			tlb_remove_page(tlb, page);
 			anon_rss--;
 			continue;
 		}
-		if (pte_swapped(ptent))
+		if (pte_swapped(ptent)) {
+			if (vma->vm_flags & VM_PURE_PRIVATE)
+				pps_swapped++;
 			free_swap_and_cache(pte_to_swp_entry(ptent));
+		}
 		pte_clear_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

 	add_mm_rss(mm, file_rss, anon_rss);
+	atomic_sub(pps_pte + pps_unmapped, &pps_info.total);
+	atomic_sub(pps_pte, &pps_info.pte_count);
+	atomic_sub(pps_unmapped, &pps_info.unmapped_count);
+	atomic_sub(pps_swapped, &pps_info.swapped_count);
 	pte_unmap_unlock(pte - 1, ptl);

 	return addr;
@@ -1521,6 +1539,10 @@
 		lazy_mmu_prot_update(entry);
 		if (!(vma->vm_flags & VM_PURE_PRIVATE))
 			lru_cache_add_active(new_page);
+		else {
+			atomic_inc(&pps_info.total);
+			atomic_inc(&pps_info.pte_count);
+		}
 		page_add_new_anon_rmap(new_page, vma, address);

 		/* Free the old page.. */
@@ -2041,6 +2063,11 @@
 	if (vm_swap_full())
 		remove_exclusive_swap_page(page);
 	unlock_page(page);
+	if (vma->vm_flags & VM_PURE_PRIVATE) {
+		atomic_dec(&pps_info.swapped_count);
+		atomic_inc(&pps_info.total);
+		atomic_inc(&pps_info.pte_count);
+	}

 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
@@ -2094,6 +2121,10 @@
 			goto release;
 		if (!(vma->vm_flags & VM_PURE_PRIVATE))
 			lru_cache_add_active(page);
+		else {
+			atomic_inc(&pps_info.total);
+			atomic_inc(&pps_info.pte_count);
+		}
 		inc_mm_counter(mm, anon_rss);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
@@ -2311,6 +2342,8 @@
 	if (!pte_present(entry)) {
 		if (pte_unmapped(entry)) {
 			BUG_ON(!(vma->vm_flags & VM_PURE_PRIVATE));
+			atomic_dec(&pps_info.unmapped_count);
+			atomic_inc(&pps_info.pte_count);
 			struct page* page = pte_page(entry);
 			pte_t temp_pte = mk_pte(page, vma->vm_page_prot);
 			pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2565,8 +2598,11 @@
 {
 	struct page* page;
 	pte_t entry;
-	pte_t* pte;
+	pte_t *pte;
 	spinlock_t* ptl;
+	int pps_pte = 0;
+	int pps_unmapped = 0;
+	int pps_swapped = 0;

 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
@@ -2577,16 +2613,23 @@
 			set_pte_at(mm, addr, pte, entry);
 			BUG_ON(page == ZERO_PAGE(addr));
 			page_add_new_anon_rmap(page, vma, addr);
-		}
-		if (pte_present(*pte)) {
+			lru_cache_add_active(page);
+			pps_unmapped++;
+		} else if (pte_present(*pte)) {
 			page = pte_page(*pte);
 			if (page == ZERO_PAGE(addr))
 				continue;
 			lru_cache_add_active(page);
-		}
+			pps_pte++;
+		} else if (!pte_present(*pte) && pte_swapped(*pte))
+			pps_swapped++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
 	lru_add_drain();
+	atomic_sub(pps_pte + pps_unmapped, &pps_info.total);
+	atomic_sub(pps_pte, &pps_info.pte_count);
+	atomic_sub(pps_unmapped, &pps_info.unmapped_count);
+	atomic_sub(pps_swapped, &pps_info.swapped_count);
 }

 static void migrate_back_pmd_range(struct mm_struct* mm, pud_t *pud, struct
@@ -2636,17 +2679,13 @@
 	} while (pgd++, addr = next, addr != end);
 }

-LIST_HEAD(pps_head);
-LIST_HEAD(pps_head_buddy);
-
-DEFINE_SPINLOCK(pps_lock);
-
 void enter_pps(struct mm_struct* mm, struct vm_area_struct* vma)
 {
 	int condition = VM_READ | VM_WRITE | VM_EXEC | \
 		 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \
 		 VM_GROWSDOWN | VM_GROWSUP | \
-		 VM_LOCKED | VM_SEQ_READ | VM_RAND_READ | VM_DONTCOPY | VM_ACCOUNT;
+		 VM_LOCKED | VM_SEQ_READ | VM_RAND_READ | VM_DONTCOPY | VM_ACCOUNT | \
+		 VM_PURE_PRIVATE;
 	if (!(vma->vm_flags & ~condition) && vma->vm_file == NULL) {
 		vma->vm_flags |= VM_PURE_PRIVATE;
 		if (list_empty(&mm->mmlist)) {
Index: linux-2.6.16.29/mm/swapfile.c
===================================================================
--- linux-2.6.16.29.orig/mm/swapfile.c	2007-01-04 14:47:37.000000000 +0800
+++ linux-2.6.16.29/mm/swapfile.c	2007-01-04 14:49:37.000000000 +0800
@@ -544,19 +544,22 @@

 static int pps_swapoff(int type)
 {
-	struct mm_struct* mm;
 	struct vm_area_struct* vma;
-	struct list_head *pos, *lhtemp;
+	struct list_head *pos;
+	struct mm_struct *prev, *mm;
 	int ret = 0;

+	prev = mm = &init_mm;
+	pos = &init_mm.mmlist;
+	atomic_inc(&prev->mm_users);
 	spin_lock(&mmlist_lock);
-	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
+	while ((pos = pos->next) != &init_mm.mmlist) {
 		mm = list_entry(pos, struct mm_struct, mmlist);
-		if (atomic_inc_return(&mm->mm_users) == 1) {
-			atomic_dec(&mm->mm_users);
+		if (!atomic_add_unless(&mm->mm_users, 1, 0))
 			continue;
-		}
 		spin_unlock(&mmlist_lock);
+		mmput(prev);
+		prev = mm;
 		down_read(&mm->mmap_sem);
 		for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
 			if (!(vma->vm_flags & VM_PURE_PRIVATE))
@@ -568,10 +571,10 @@
 				break;
 		}
 		up_read(&mm->mmap_sem);
-		mmput(mm);
 		spin_lock(&mmlist_lock);
 	}
 	spin_unlock(&mmlist_lock);
+	mmput(prev);
 	return ret;
 }

Index: linux-2.6.16.29/mm/vmscan.c
===================================================================
--- linux-2.6.16.29.orig/mm/vmscan.c	2007-01-04 14:47:38.000000000 +0800
+++ linux-2.6.16.29/mm/vmscan.c	2007-01-05 11:42:05.795353536 +0800
@@ -79,6 +79,9 @@
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	/* pps control command, 0: do stage 1-4, kppsd only; 1: full stages. */
+	int pps_cmd;
 };

 /*
@@ -1514,6 +1517,17 @@
 	return ret;
 }

+// pps fields.
+static wait_queue_head_t kppsd_wait;
+static struct scan_control wakeup_sc;
+struct pps_info pps_info = {
+	.total = ATOMIC_INIT(0),
+	.pte_count = ATOMIC_INIT(0), // stage 1 and 2.
+	.unmapped_count = ATOMIC_INIT(0), // stage 3 and 4.
+	.swapped_count = ATOMIC_INIT(0) // stage 6.
+};
+// pps end.
+
 struct series_t {
 	pte_t orig_ptes[MAX_SERIES_LENGTH];
 	pte_t* ptes[MAX_SERIES_LENGTH];
@@ -1564,11 +1578,10 @@
 	series.series_length = i;
 }

-struct delay_tlb_task_t delay_tlb_tasks[32] = { [0 ... 31] = {0} };
+struct delay_tlb_task delay_tlb_tasks[32] = { [0 ... 31] = {0} };

 void timer_flush_tlb_tasks(void* data)
 {
-	// To x86, if we found there were some flushing tasks, we should do
it all together, that is, flush it once.
 	int i;
 #ifdef CONFIG_X86
 	int flag = 0;
@@ -1591,12 +1604,12 @@
 #endif
 }

-static struct delay_tlb_task_t* delay_task = NULL;
+static struct delay_tlb_task* delay_task = NULL;
 static int vma_index = 0;

-static struct delay_tlb_task_t* search_free_tlb_tasks_slot(void)
+static struct delay_tlb_task* search_free_tlb_tasks_slot(void)
 {
-	struct delay_tlb_task_t* ret = NULL;
+	struct delay_tlb_task* ret = NULL;
 	int i;
 again:
 	for (i = 0; i < 32; i++) {
@@ -1650,18 +1663,24 @@

 static void end_tlb_tasks(void)
 {
-	if (!cpus_empty(delay_task->cpu_mask)) {
-		atomic_inc(&delay_task->mm->mm_users);
-		delay_task->cpu_mask = delay_task->mm->cpu_vm_mask;
-	} else
-		delay_task->mm = NULL;
+	atomic_inc(&delay_task->mm->mm_users);
+	delay_task->cpu_mask = delay_task->mm->cpu_vm_mask;
 	delay_task = NULL;
+#ifndef CONFIG_SMP
+	timer_flush_tlb_tasks(NULL);
+#endif
 }

 static void fill_in_tlb_tasks(struct vm_area_struct* vma, unsigned long addr,
 		unsigned long end)
 {
 	struct mm_struct* mm;
+	// First, try to combine the task with the previous.
+	if (vma_index != 0 && delay_task->vma[vma_index - 1] == vma &&
+			delay_task->end[vma_index - 1] == addr) {
+		delay_task->end[vma_index - 1] = end;
+		return;
+	}
 fill_it:
 	if (vma_index != 32) {
 		delay_task->vma[vma_index] = vma;
@@ -1678,11 +1697,11 @@
 	goto fill_it;
 }

-static void shrink_pvma_scan_ptes(struct scan_control* sc,
-		struct mm_struct* mm, struct vm_area_struct* vma, pmd_t* pmd,
-		unsigned long addr, unsigned long end)
+static void shrink_pvma_scan_ptes(struct scan_control* sc, struct mm_struct*
+		mm, struct vm_area_struct* vma, pmd_t* pmd, unsigned long addr,
+		unsigned long end)
 {
-	int i;
+	int i, statistic;
 	spinlock_t* ptl = pte_lockptr(mm, pmd);
 	pte_t* pte = pte_offset_map(pmd, addr);
 	int anon_rss = 0;
@@ -1694,6 +1713,8 @@
 	do {
 		memset(&series, 0, sizeof(struct series_t));
 		find_series(&pte, &addr, end);
+		if (sc->pps_cmd == 0 && series.series_stage == 5)
+			continue;
 		switch (series.series_stage) {
 			case 1: // PTE -- untouched PTE.
 				for (i = 0; i < series.series_length; i++) {
@@ -1722,6 +1743,7 @@
 				 * send IPI immediately in fill_in_tlb_tasks.
 				 */
 				spin_lock(ptl);
+				statistic = 0;
 				for (i = 0; i < series.series_length; i++) {
 					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
 						pte_t pte_unmapped = series.orig_ptes[i];
@@ -1734,8 +1756,11 @@
 							continue;
 						page_remove_rmap(series.pages[i]);
 						anon_rss--;
+						statistic++;
 					}
 				}
+				atomic_add(statistic, &pps_info.unmapped_count);
+				atomic_sub(statistic, &pps_info.pte_count);
 				spin_unlock(ptl);
 				break;
 			case 3: // Attach SwapPage to PrivatePage.
@@ -1798,15 +1823,17 @@
 				}
 				break;
 			case 5: // UnmappedPTE -- SwappedPTE, reclaim PrivatePage.
+				statistic = 0;
 				for (i = 0; i < series.series_length; i++) {
 					struct page* page = series.pages[i];
 					lock_page(page);
 					spin_lock(ptl);
-					if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+					if (unlikely(!pte_same(*series.ptes[i], series.orig_ptes[i]))) {
 						spin_unlock(ptl);
 						unlock_page(page);
 						continue;
 					}
+					statistic++;
 					swp_entry_t entry = { .val = page_private(page) };
 					swap_duplicate(entry);
 					pte_t pte_swp = swp_entry_to_pte(entry);
@@ -1820,6 +1847,9 @@
 						__pagevec_release_nonlru(&freed_pvec);
 					sc->nr_reclaimed++;
 				}
+				atomic_add(statistic, &pps_info.swapped_count);
+				atomic_sub(statistic, &pps_info.unmapped_count);
+				atomic_sub(statistic, &pps_info.total);
 				break;
 			case 6:
 				// NULL operation!
@@ -1831,9 +1861,9 @@
 		__pagevec_release_nonlru(&freed_pvec);
 }

-static void shrink_pvma_pmd_range(struct scan_control* sc, struct
mm_struct* mm,
-		struct vm_area_struct* vma, pud_t* pud,
-		unsigned long addr, unsigned long end)
+static void shrink_pvma_pmd_range(struct scan_control* sc, struct mm_struct*
+		mm, struct vm_area_struct* vma, pud_t* pud, unsigned long addr,
+		unsigned long end)
 {
 	unsigned long next;
 	pmd_t* pmd = pmd_offset(pud, addr);
@@ -1845,9 +1875,9 @@
 	} while (pmd++, addr = next, addr != end);
 }

-static void shrink_pvma_pud_range(struct scan_control* sc, struct
mm_struct* mm,
-		struct vm_area_struct* vma, pgd_t* pgd,
-		unsigned long addr, unsigned long end)
+static void shrink_pvma_pud_range(struct scan_control* sc, struct mm_struct*
+		mm, struct vm_area_struct* vma, pgd_t* pgd, unsigned long addr,
+		unsigned long end)
 {
 	unsigned long next;
 	pud_t* pud = pud_offset(pgd, addr);
@@ -1859,8 +1889,8 @@
 	} while (pud++, addr = next, addr != end);
 }

-static void shrink_pvma_pgd_range(struct scan_control* sc, struct
mm_struct* mm,
-		struct vm_area_struct* vma)
+static void shrink_pvma_pgd_range(struct scan_control* sc, struct mm_struct*
+		mm, struct vm_area_struct* vma)
 {
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
@@ -1876,18 +1906,21 @@

 static void shrink_private_vma(struct scan_control* sc)
 {
-	struct mm_struct* mm;
 	struct vm_area_struct* vma;
-	struct list_head *pos, *lhtemp;
+	struct list_head *pos;
+	struct mm_struct *prev, *mm;

+	prev = mm = &init_mm;
+	pos = &init_mm.mmlist;
+	atomic_inc(&prev->mm_users);
 	spin_lock(&mmlist_lock);
-	list_for_each_safe(pos, lhtemp, &init_mm.mmlist) {
+	while ((pos = pos->next) != &init_mm.mmlist) {
 		mm = list_entry(pos, struct mm_struct, mmlist);
-		if (atomic_inc_return(&mm->mm_users) == 1) {
-			atomic_dec(&mm->mm_users);
+		if (!atomic_add_unless(&mm->mm_users, 1, 0))
 			continue;
-		}
 		spin_unlock(&mmlist_lock);
+		mmput(prev);
+		prev = mm;
 		start_tlb_tasks(mm);
 		if (down_read_trylock(&mm->mmap_sem)) {
 			for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
@@ -1900,10 +1933,10 @@
 			up_read(&mm->mmap_sem);
 		}
 		end_tlb_tasks();
-		mmput(mm);
 		spin_lock(&mmlist_lock);
 	}
 	spin_unlock(&mmlist_lock);
+	mmput(prev);
 }

 /*
@@ -1949,7 +1982,10 @@
 	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);

-	shrink_private_vma(&sc);
+	wakeup_sc = sc;
+	wakeup_sc.pps_cmd = 1;
+	wake_up_interruptible(&kppsd_wait);
+
 	inc_page_state(pageoutrun);

 	for (i = 0; i < pgdat->nr_zones; i++) {
@@ -2086,6 +2122,33 @@
 	return total_reclaimed;
 }

+static int kppsd(void* p)
+{
+	struct task_struct *tsk = current;
+	int timeout;
+	DEFINE_WAIT(wait);
+	daemonize("kppsd");
+	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+	struct scan_control default_sc;
+	default_sc.gfp_mask = GFP_KERNEL;
+	default_sc.may_writepage = 1;
+	default_sc.may_swap = 1;
+	default_sc.pps_cmd = 0;
+
+	while (1) {
+		try_to_freeze();
+		prepare_to_wait(&kppsd_wait, &wait, TASK_INTERRUPTIBLE);
+		timeout = schedule_timeout(2000);
+		finish_wait(&kppsd_wait, &wait);
+
+		if (timeout)
+			shrink_private_vma(&wakeup_sc);
+		else
+			shrink_private_vma(&default_sc);
+	}
+	return 0;
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -2230,6 +2293,15 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */

+static int __init kppsd_init(void)
+{
+	init_waitqueue_head(&kppsd_wait);
+	kernel_thread(kppsd, NULL, CLONE_KERNEL);
+	return 0;
+}
+
+module_init(kppsd_init)
+
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
       [not found] <200701092047.43248.a1426z@gawab.com>
@ 2007-01-10  2:51 ` yunfeng zhang
  0 siblings, 0 replies; 14+ messages in thread
From: yunfeng zhang @ 2007-01-10  2:51 UTC (permalink / raw)
  To: Al Boldi; +Cc: linux-kernel

Sorry, I can't be online regularly, that is, can't synchronize Linux CVS, so
only work on a fixed kernel version. Documentation/vm_pps.txt isn't only a patch
overview but also a changelog.

>
> Great!
>
> Do you have patch against 2.6.19?
>
>
> Thanks!
>
> --
> Al
>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
  2007-01-05 23:10 zyf.zeroos
@ 2007-01-09  8:27 ` yunfeng zhang
  0 siblings, 0 replies; 14+ messages in thread
From: yunfeng zhang @ 2007-01-09  8:27 UTC (permalink / raw)
  To: linux-kernel; +Cc: pavel, rdunlap, akpm

Maybe, there should be a memory maintainer in linux kernel group.

Here, I show some content from my patch (Documentation/vm_pps.txt). In brief, I
make a revolution about Linux swap subsystem, the idea is described that
SwapDaemon should scan and reclaim pages on UserSpace::vmalist other than
current zone::active/inactive. The change will conspicuously enhance swap
subsystem performance by

1) SwapDaemon can collect the statistic of process acessing pages and by it
   unmaps ptes, SMP specially benefits from it for we can use flush_tlb_range
   to unmap ptes batchly rather than frequently TLB IPI interrupt per a page in
   current Linux legacy swap subsystem. In fact, in some cases, we can even
   flush TLB without sending IPI.
2) Page-fault can issue better readahead requests since history data shows all
   related pages have conglomerating affinity. In contrast, Linux page-fault
   readaheads the pages relative to the SwapSpace position of current
   page-fault page.
3) It's conformable to POSIX madvise API family.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem
@ 2007-01-05 23:10 zyf.zeroos
  2007-01-09  8:27 ` yunfeng zhang
  0 siblings, 1 reply; 14+ messages in thread
From: zyf.zeroos @ 2007-01-05 23:10 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 25044 bytes --]

Test mail with my signature, mail content is based on the second quilt patch (Linux 2.6.16.29), only two key files are re-sent 1) Documentation/vm_pps.txt 2) mm/vmscan.c

Index: test.signature/Documentation/vm_pps.txt
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ test.signature/Documentation/vm_pps.txt 2007-01-06 07:00:18.146480584 +0800
@@ -0,0 +1,214 @@
+                         Pure Private Page System (pps)
+                     Copyright by Yunfeng Zhang on GFDL 1.2
+                              zyf.zeroos@gmail.com
+                              December 24-26, 2006
+
+// Purpose <([{
+The file is used to document the idea which is published firstly at
+http://www.ussg.iu.edu/hypermail/linux/kernel/0607.2/0451.html, as a part of my
+OS -- main page http://blog.chinaunix.net/u/21764/index.php. In brief, the
+patch of the document is for enchancing the performance of Linux swap
+subsystem. You can find the overview of the idea in section <How to Reclaim
+Pages more Efficiently> and how I patch it into Linux 2.6.16.29 in section
+<Pure Private Page System -- pps>.
+// }])>
+
+// How to Reclaim Pages more Efficiently <([{
+Good idea originates from overall design and management ability, when you look
+down from a manager view, you will relief yourself from disordered code and
+find some problem immediately.
+
+OK! to modern OS, its memory subsystem can be divided into three layers
+1) Space layer (InodeSpace, UserSpace and CoreSpace).
+2) VMA layer (PrivateVMA and SharedVMA, memory architecture-independent layer).
+3) PTE, zone/memory inode layer (architecture-dependent).
+4) Maybe it makes you sense that Page should be placed on the 3rd layer, but
+   here, it's placed on the 2nd layer since it's the basic unit of VMA.
+
+Since the 2nd layer assembles the much statistic of page-acess information, so
+it's nature that swap subsystem should be deployed and implemented on the 2nd
+layer.
+
+Undoubtedly, there are some virtues about it
+1) SwapDaemon can collect the statistic of process acessing pages and by it
+   unmaps ptes, SMP specially benefits from it for we can use flush_tlb_range
+   to unmap ptes batchly rather than frequently TLB IPI interrupt per a page in
+   current Linux legacy swap subsystem.
+2) Page-fault can issue better readahead requests since history data shows all
+   related pages have conglomerating affinity. In contrast, Linux page-fault
+   readaheads the pages relative to the SwapSpace position of current
+   page-fault page.
+3) It's conformable to POSIX madvise API family.
+
+Unfortunately, Linux 2.6.16.29 swap subsystem is based on the 3rd layer -- a
+system on zone::active_list/inactive_list.
+
+I've finished a patch, see section <Pure Private Page System -- pps>. Note, it
+ISN'T perfect.
+// }])>
+
+// Pure Private Page System -- pps  <([{
+As I've referred in previous section, perfectly applying my idea need to unroot
+page-surrounging swap subsystem to migrate it on VMA, but a huge gap has
+defeated me -- active_list and inactive_list. In fact, you can find
+lru_add_active code anywhere ... It's IMPOSSIBLE to me to complete it only by
+myself. It's also the difference between my design and Linux, in my OS, page is
+the charge of its new owner totally, however, to Linux, page management system
+is still tracing it by PG_active flag.
+
+So I conceive another solution:) That is, set up an independent page-recycle
+system rooted on Linux legacy page system -- pps, intercept all private pages
+belonging to PrivateVMA to pps, then use my pps to cycle them.  By the way, the
+whole job should be consist of two parts, here is the first --
+PrivateVMA-oriented (PPS), other is SharedVMA-oriented (should be called SPS)
+scheduled in future. Of course, if all are done, it will empty Linux legacy
+page system.
+
+In fact, pps is centered on how to better collect and unmap process private
+pages in SwapDaemon mm/vmscan.c:shrink_private_vma, the whole process is
+divided into six stages -- <Stage Definition>. Other sections show the remain
+aspects of pps
+1) <Data Definition> is basic data definition.
+2) <Concurrent racers of Shrinking pps> is focused on synchronization.
+3) <Private Page Lifecycle of pps> -- how private pages enter in/go off pps.
+4) <VMA Lifecycle of pps> which VMA is belonging to pps.
+
+PPS uses init_mm.mm_list list to enumerate all swappable UserSpace
+(shrink_private_vma).
+
+A new kernel thread -- kppsd is introduced in mm/vmscan.c, its task is to
+execute the stages of pps periodically, note an appropriate timeout ticks is
+necessary so we can give application a chance to re-map back its PrivatePage
+from UnmappedPTE to PTE, that is, show their conglomeration affinity.
+scan_control::pps_cmd field is used to control the behavior of kppsd, = 1 for
+accelerating scanning process and reclaiming pages, it's used in balance_pgdat.
+
+PPS statistic data is appended to /proc/meminfo entry, its prototype is in
+include/linux/mm.h.
+
+I'm also glad to highlight my a new idea -- dftlb which is described in
+section <Delay to Flush TLB>.
+// }])>
+
+// Delay to Flush TLB (dftlb) <([{
+Delay to flush TLB is instroduced by me to enhance flushing TLB efficiency, in
+brief, when we want to unmap a page from the page table of a process, why we
+send TLB IPI to other CPUs immediately, since every CPU has timer interrupt, we
+can insert flushing tasks into timer interrupt route to implement a
+free-charged TLB flushing.
+
+The trick is implemented in
+1) TLB flushing task is added in fill_in_tlb_task of mm/vmscan.c.
+2) timer_flush_tlb_tasks of kernel/timer.c is used by other CPUs to execute
+   flushing tasks.
+3) all data are defined in include/linux/mm.h.
+
+The restriction of dftlb. Following conditions must be met
+1) atomic cmpxchg instruction.
+2) atomically set the access bit after they touch a pte firstly.
+3) To some architectures, vma parameter of flush_tlb_range is maybe important,
+   if it's true, since it's possible that the vma of a TLB flushing task has
+   gone when a CPU starts to execute the task in timer interrupt, so don't use
+   dftlb.
+combine stage 1 with stage 2, and send IPI immediately in fill_in_tlb_tasks.
+
+dftlb increases mm_struct::mm_users to prevent the mm from being freed when
+other CPU works on it.
+// }])>
+
+// Stage Definition <([{
+The whole process of private page page-out is divided into six stages, as
+showed in shrink_pvma_scan_ptes of mm/vmscan.c, the code groups the similar
+pages to a series.
+1) PTE to untouched PTE (access bit is cleared), append flushing tasks to dftlb.
+2) Convert untouched PTE to UnmappedPTE.
+3) Link SwapEntry to every UnmappedPTE.
+4) Flush PrivatePage of UnmappedPTE to its disk SwapPage.
+5) Reclaimed the page and shift UnmappedPTE to SwappedPTE.
+6) SwappedPTE stage.
+// }])>
+
+// Data Definition <([{
+New VMA flag (VM_PURE_PRIVATE) is appended into VMA in include/linux/mm.h.
+
+New PTE type (UnmappedPTE) is appended into PTE system in
+include/asm-i386/pgtable.h. Its prototype is
+struct UnmappedPTE {
+    int present : 1; // must be 0.
+    ...
+    int pageNum : 20;
+};
+The new PTE has a feature, it keeps a link to its PrivatePage and prevent the
+page from being visited by CPU, so you can use it in <Stage Definition> as a
+middleware.
+// }])>
+
+// Concurrent Racers of Shrinking pps <([{
+shrink_private_vma of mm/vmscan.c uses init_mm.mmlist to scan all swappable
+mm_struct instances, during the process of scaning and reclaiming process, it
+readlockes every mm_struct object, which brings some potential concurrent
+racers
+1) mm/swapfile.c    pps_swapoff (swapoff API).
+2) mm/memory.c  do_wp_page, handle_pte_fault::unmapped_pte, do_anonymous_page
+   (page-fault).
+
+The VMAs of pps can coexist with madvise, mlock, mprotect, mmap and munmap,
+that is why new VMA created from mmap.c:split_vma can re-enter into pps.
+// }])>
+
+// Private Page Lifecycle of pps <([{
+All pages belonging to pps are called as pure private page, its PTE type is PTE
+or UnmappedPTE.
+
+IN (NOTE, when a pure private page enters into pps, it's also trimmed from
+Linux legacy page system by commeting lru_cache_add_active clause)
+1) fs/exec.c install_arg_pages (argument pages).
+2) mm/memory do_anonymous_page, do_wp_page, do_swap_page (page fault).
+3) mm/swap_state.c read_swap_cache_async (swap pages).
+
+OUT
+1) mm/vmscan.c  shrink_pvma_scan_ptes   (stage 6, reclaim a private page).
+2) mm/memory    zap_pte_range   (free a page).
+3) kernel/fork.c dup_mmap (if someone uses fork, migrate all pps pages
+   back to let Linux legacy page system manage them).
+
+When a pure private page is in pps, it can be visited simultaneously by
+page-fault and SwapDaemon.
+// }])>
+
+// VMA Lifecycle of pps <([{
+When a PrivateVMA enters into pps, it's or-ed a new flag -- VM_PURE_PRIVATE in
+memory.c:enter_pps, you can also find which VMA is fit with pps in it, the flag
+is used in the shrink_private_vma of mm/vmscan.c.  Other fields are left
+untouched.
+
+IN.
+1) fs/exec.c setup_arg_pages (StackVMA).
+2) mm/mmap.c do_mmap_pgoff, do_brk (DataVMA).
+3) mm/mmap.c split_vma, copy_vma (in some cases, we need copy a VMA from an
+   exist VMA).
+
+OUT.
+1) kernel/fork.c dup_mmap (if someone uses fork, return the vma back to
+   Linux legacy system).
+2) mm/mmap.c remove_vma, vma_adjust (destroy VMA).
+3) mm/mmap.c do_mmap_pgoff (delete VMA when some errors occur).
+// }])>
+
+// Postscript <([{
+Note, some circumstances aren't tested due to hardware restriction e.g. SMP
+dftlb.
+
+Here are some improvements about pps
+1) In fact, I recommend one-to-one private model -- PrivateVMA, (PTE,
+   UnmappedPTE) and PrivatePage (SwapPage) which is described in my OS and the
+   aboved hyperlink of Linux kernel mail list. So it's a compromise to use
+   Linux legacy SwapCache in my pps.
+2) SwapSpace should provide more flexible interfaces, shrink_pvma_scan_ptes
+   need allocate swap entries in batch, exactly, allocate a batch of fake
+   continual swap entries, see mm/pps_swapin_readahead.
+
+If Linux kernel group can't make a schedule to re-write their memory code,
+however, pps maybe is the best solution until now.
+// }])>
+// vim: foldmarker=<([{,}])> foldmethod=marker et
Index: test.signature/mm/vmscan.c
===================================================================
--- test.signature.orig/mm/vmscan.c 2007-01-06 07:00:11.799445480 +0800
+++ test.signature/mm/vmscan.c 2007-01-06 07:00:23.326693072 +0800
@@ -79,6 +79,9 @@
   * In this context, it doesn't matter that we scan the
   * whole list at once. */
  int swap_cluster_max;
+
+ /* pps control command, 0: do stage 1-4, kppsd only; 1: full stages. */
+ int pps_cmd;
 };
 
 /*
@@ -1514,6 +1517,428 @@
  return ret;
 }
 
+// pps fields.
+static wait_queue_head_t kppsd_wait;
+static struct scan_control wakeup_sc;
+struct pps_info pps_info = {
+ .total = ATOMIC_INIT(0),
+ .pte_count = ATOMIC_INIT(0), // stage 1 and 2.
+ .unmapped_count = ATOMIC_INIT(0), // stage 3 and 4.
+ .swapped_count = ATOMIC_INIT(0) // stage 6.
+};
+// pps end.
+
+struct series_t {
+ pte_t orig_ptes[MAX_SERIES_LENGTH];
+ pte_t* ptes[MAX_SERIES_LENGTH];
+ struct page* pages[MAX_SERIES_LENGTH];
+ int series_length;
+ int series_stage;
+} series;
+
+static int get_series_stage(pte_t* pte, int index)
+{
+ series.orig_ptes[index] = *pte;
+ series.ptes[index] = pte;
+ if (pte_present(series.orig_ptes[index])) {
+  struct page* page = pfn_to_page(pte_pfn(series.orig_ptes[index]));
+  series.pages[index] = page;
+  if (page == ZERO_PAGE(addr)) // reserved page is exclusive from us.
+   return 7;
+  if (pte_young(series.orig_ptes[index])) {
+   return 1;
+  } else
+   return 2;
+ } else if (pte_unmapped(series.orig_ptes[index])) {
+  struct page* page = pfn_to_page(pte_pfn(series.orig_ptes[index]));
+  series.pages[index] = page;
+  if (!PageSwapCache(page))
+   return 3;
+  else {
+   if (PageWriteback(page) || PageDirty(page))
+    return 4;
+   else
+    return 5;
+  }
+ } else // pte_swapped -- SwappedPTE
+  return 6;
+}
+
+static void find_series(pte_t** start, unsigned long* addr, unsigned long end)
+{
+ int i;
+ int series_stage = get_series_stage((*start)++, 0);
+ *addr += PAGE_SIZE;
+
+ for (i = 1; i < MAX_SERIES_LENGTH && *addr < end; i++, (*start)++, *addr += PAGE_SIZE) {
+  if (series_stage != get_series_stage(*start, i))
+   break;
+ }
+ series.series_stage = series_stage;
+ series.series_length = i;
+}
+
+struct delay_tlb_task delay_tlb_tasks[32] = { [0 ... 31] = {0} };
+
+void timer_flush_tlb_tasks(void* data)
+{
+ int i;
+#ifdef CONFIG_X86
+ int flag = 0;
+#endif
+ for (i = 0; i < 32; i++) {
+  if (delay_tlb_tasks[i].mm != NULL &&
+    cpu_isset(smp_processor_id(), delay_tlb_tasks[i].mm->cpu_vm_mask) &&
+    cpu_isset(smp_processor_id(), delay_tlb_tasks[i].cpu_mask)) {
+#ifdef CONFIG_X86
+   flag = 1;
+#elif
+   // smp::local_flush_tlb_range(delay_tlb_tasks[i]);
+#endif
+   cpu_clear(smp_processor_id(), delay_tlb_tasks[i].cpu_mask);
+  }
+ }
+#ifdef CONFIG_X86
+ if (flag)
+  local_flush_tlb();
+#endif
+}
+
+static struct delay_tlb_task* delay_task = NULL;
+static int vma_index = 0;
+
+static struct delay_tlb_task* search_free_tlb_tasks_slot(void)
+{
+ struct delay_tlb_task* ret = NULL;
+ int i;
+again:
+ for (i = 0; i < 32; i++) {
+  if (delay_tlb_tasks[i].mm != NULL) {
+   if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
+    mmput(delay_tlb_tasks[i].mm);
+    delay_tlb_tasks[i].mm = NULL;
+    ret = &delay_tlb_tasks[i];
+   }
+  } else
+   ret = &delay_tlb_tasks[i];
+ }
+ if (!ret) { // Force flush TLBs.
+  on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
+  goto again;
+ }
+ return ret;
+}
+
+static void init_delay_task(struct mm_struct* mm)
+{
+ cpus_clear(delay_task->cpu_mask);
+ vma_index = 0;
+ delay_task->mm = mm;
+}
+
+/*
+ * We will be working on the mm, so let's force to flush it if necessary.
+ */
+static void start_tlb_tasks(struct mm_struct* mm)
+{
+ int i, flag = 0;
+again:
+ for (i = 0; i < 32; i++) {
+  if (delay_tlb_tasks[i].mm == mm) {
+   if (cpus_empty(delay_tlb_tasks[i].cpu_mask)) {
+    mmput(delay_tlb_tasks[i].mm);
+    delay_tlb_tasks[i].mm = NULL;
+   } else
+    flag = 1;
+  }
+ }
+ if (flag) { // Force flush TLBs.
+  on_each_cpu(timer_flush_tlb_tasks, NULL, 0, 1);
+  goto again;
+ }
+ BUG_ON(delay_task != NULL);
+ delay_task = search_free_tlb_tasks_slot();
+ init_delay_task(mm);
+}
+
+static void end_tlb_tasks(void)
+{
+ atomic_inc(&delay_task->mm->mm_users);
+ delay_task->cpu_mask = delay_task->mm->cpu_vm_mask;
+ delay_task = NULL;
+#ifndef CONFIG_SMP
+ timer_flush_tlb_tasks(NULL);
+#endif
+}
+
+static void fill_in_tlb_tasks(struct vm_area_struct* vma, unsigned long addr,
+  unsigned long end)
+{
+ struct mm_struct* mm;
+ // First, try to combine the task with the previous.
+ if (vma_index != 0 && delay_task->vma[vma_index - 1] == vma &&
+   delay_task->end[vma_index - 1] == addr) {
+  delay_task->end[vma_index - 1] = end;
+  return;
+ }
+fill_it:
+ if (vma_index != 32) {
+  delay_task->vma[vma_index] = vma;
+  delay_task->start[vma_index] = addr;
+  delay_task->end[vma_index] = end;
+  vma_index++;
+  return;
+ }
+ mm = delay_task->mm;
+ end_tlb_tasks();
+
+ delay_task = search_free_tlb_tasks_slot();
+ init_delay_task(mm);
+ goto fill_it;
+}
+
+static void shrink_pvma_scan_ptes(struct scan_control* sc, struct mm_struct*
+  mm, struct vm_area_struct* vma, pmd_t* pmd, unsigned long addr,
+  unsigned long end)
+{
+ int i, statistic;
+ spinlock_t* ptl = pte_lockptr(mm, pmd);
+ pte_t* pte = pte_offset_map(pmd, addr);
+ int anon_rss = 0;
+ struct pagevec freed_pvec;
+ int may_enter_fs = (sc->gfp_mask & (__GFP_FS | __GFP_IO));
+ struct address_space* mapping = &swapper_space;
+
+ pagevec_init(&freed_pvec, 1);
+ do {
+  memset(&series, 0, sizeof(struct series_t));
+  find_series(&pte, &addr, end);
+  if (sc->pps_cmd == 0 && series.series_stage == 5)
+   continue;
+  switch (series.series_stage) {
+   case 1: // PTE -- untouched PTE.
+    for (i = 0; i < series.series_length; i++) {
+     struct page* page = series.pages[i];
+     lock_page(page);
+     spin_lock(ptl);
+     if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+      if (pte_dirty(*series.ptes[i]))
+       set_page_dirty(page);
+      set_pte_at(mm, addr + i * PAGE_SIZE, series.ptes[i],
+        pte_mkold(pte_mkclean(*series.ptes[i])));
+     }
+     spin_unlock(ptl);
+     unlock_page(page);
+    }
+    fill_in_tlb_tasks(vma, addr, addr + (PAGE_SIZE * series.series_length));
+    break;
+   case 2: // untouched PTE -- UnmappedPTE.
+    /*
+     * Note in stage 1, we've flushed TLB in fill_in_tlb_tasks, so
+     * if it's still clear here, we can shift it to Unmapped type.
+     *
+     * If some architecture doesn't support atomic cmpxchg
+     * instruction or can't atomically set the access bit after
+     * they touch a pte at first, combine stage 1 with stage 2, and
+     * send IPI immediately in fill_in_tlb_tasks.
+     */
+    spin_lock(ptl);
+    statistic = 0;
+    for (i = 0; i < series.series_length; i++) {
+     if (unlikely(pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+      pte_t pte_unmapped = series.orig_ptes[i];
+      pte_unmapped.pte_low &= ~_PAGE_PRESENT;
+      pte_unmapped.pte_low |= _PAGE_UNMAPPED;
+      if (cmpxchg(&series.ptes[i]->pte_low,
+         series.orig_ptes[i].pte_low,
+         pte_unmapped.pte_low) !=
+        series.orig_ptes[i].pte_low)
+       continue;
+      page_remove_rmap(series.pages[i]);
+      anon_rss--;
+      statistic++;
+     }
+    }
+    atomic_add(statistic, &pps_info.unmapped_count);
+    atomic_sub(statistic, &pps_info.pte_count);
+    spin_unlock(ptl);
+    break;
+   case 3: // Attach SwapPage to PrivatePage.
+    /*
+     * A better arithmetic should be applied to Linux SwapDevice to
+     * allocate fake continual SwapPages which are close to each
+     * other, the offset between two close SwapPages is less than 8.
+     */
+    if (sc->may_swap) {
+     for (i = 0; i < series.series_length; i++) {
+      lock_page(series.pages[i]);
+      if (!PageSwapCache(series.pages[i])) {
+       if (!add_to_swap(series.pages[i], GFP_ATOMIC)) {
+        unlock_page(series.pages[i]);
+        break;
+       }
+      }
+      unlock_page(series.pages[i]);
+     }
+    }
+    break;
+   case 4: // SwapPage isn't consistent with PrivatePage.
+    /*
+     * A mini version pageout().
+     *
+     * Current swap space can't commit multiple pages together:(
+     */
+    if (sc->may_writepage && may_enter_fs) {
+     for (i = 0; i < series.series_length; i++) {
+      struct page* page = series.pages[i];
+      int res;
+
+      if (!may_write_to_queue(mapping->backing_dev_info))
+       break;
+      lock_page(page);
+      if (!PageDirty(page) || PageWriteback(page)) {
+       unlock_page(page);
+       continue;
+      }
+      clear_page_dirty_for_io(page);
+      struct writeback_control wbc = {
+       .sync_mode = WB_SYNC_NONE,
+       .nr_to_write = SWAP_CLUSTER_MAX,
+       .nonblocking = 1,
+       .for_reclaim = 1,
+      };
+      page_cache_get(page);
+      SetPageReclaim(page);
+      res = swap_writepage(page, &wbc);
+      if (res < 0) {
+       handle_write_error(mapping, page, res);
+       ClearPageReclaim(page);
+       page_cache_release(page);
+       break;
+      }
+      if (!PageWriteback(page))
+       ClearPageReclaim(page);
+      page_cache_release(page);
+     }
+    }
+    break;
+   case 5: // UnmappedPTE -- SwappedPTE, reclaim PrivatePage.
+    statistic = 0;
+    for (i = 0; i < series.series_length; i++) {
+     struct page* page = series.pages[i];
+     lock_page(page);
+     spin_lock(ptl);
+     if (unlikely(!pte_same(*series.ptes[i], series.orig_ptes[i]))) {
+      spin_unlock(ptl);
+      unlock_page(page);
+      continue;
+     }
+     statistic++;
+     swp_entry_t entry = { .val = page_private(page) };
+     swap_duplicate(entry);
+     pte_t pte_swp = swp_entry_to_pte(entry);
+     set_pte_at(mm, addr + i * PAGE_SIZE, series.ptes[i], pte_swp);
+     spin_unlock(ptl);
+     if (PageSwapCache(page) && !PageWriteback(page))
+      delete_from_swap_cache(page);
+     unlock_page(page);
+
+     if (!pagevec_add(&freed_pvec, page))
+      __pagevec_release_nonlru(&freed_pvec);
+     sc->nr_reclaimed++;
+    }
+    atomic_add(statistic, &pps_info.swapped_count);
+    atomic_sub(statistic, &pps_info.unmapped_count);
+    atomic_sub(statistic, &pps_info.total);
+    break;
+   case 6:
+    // NULL operation!
+    break;
+  }
+ } while (addr < end);
+ add_mm_counter(mm, anon_rss, anon_rss);
+ if (pagevec_count(&freed_pvec))
+  __pagevec_release_nonlru(&freed_pvec);
+}
+
+static void shrink_pvma_pmd_range(struct scan_control* sc, struct mm_struct*
+  mm, struct vm_area_struct* vma, pud_t* pud, unsigned long addr,
+  unsigned long end)
+{
+ unsigned long next;
+ pmd_t* pmd = pmd_offset(pud, addr);
+ do {
+  next = pmd_addr_end(addr, end);
+  if (pmd_none_or_clear_bad(pmd))
+   continue;
+  shrink_pvma_scan_ptes(sc, mm, vma, pmd, addr, next);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void shrink_pvma_pud_range(struct scan_control* sc, struct mm_struct*
+  mm, struct vm_area_struct* vma, pgd_t* pgd, unsigned long addr,
+  unsigned long end)
+{
+ unsigned long next;
+ pud_t* pud = pud_offset(pgd, addr);
+ do {
+  next = pud_addr_end(addr, end);
+  if (pud_none_or_clear_bad(pud))
+   continue;
+  shrink_pvma_pmd_range(sc, mm, vma, pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void shrink_pvma_pgd_range(struct scan_control* sc, struct mm_struct*
+  mm, struct vm_area_struct* vma)
+{
+ unsigned long next;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ pgd_t* pgd = pgd_offset(mm, addr);
+ do {
+  next = pgd_addr_end(addr, end);
+  if (pgd_none_or_clear_bad(pgd))
+   continue;
+  shrink_pvma_pud_range(sc, mm, vma, pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void shrink_private_vma(struct scan_control* sc)
+{
+ struct vm_area_struct* vma;
+ struct list_head *pos;
+ struct mm_struct *prev, *mm;
+
+ prev = mm = &init_mm;
+ pos = &init_mm.mmlist;
+ atomic_inc(&prev->mm_users);
+ spin_lock(&mmlist_lock);
+ while ((pos = pos->next) != &init_mm.mmlist) {
+  mm = list_entry(pos, struct mm_struct, mmlist);
+  if (!atomic_add_unless(&mm->mm_users, 1, 0))
+   continue;
+  spin_unlock(&mmlist_lock);
+  mmput(prev);
+  prev = mm;
+  start_tlb_tasks(mm);
+  if (down_read_trylock(&mm->mmap_sem)) {
+   for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+    if (!(vma->vm_flags & VM_PURE_PRIVATE))
+     continue;
+    if (vma->vm_flags & VM_LOCKED)
+     continue;
+    shrink_pvma_pgd_range(sc, mm, vma);
+   }
+   up_read(&mm->mmap_sem);
+  }
+  end_tlb_tasks();
+  spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+ mmput(prev);
+}
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1557,6 +1982,10 @@
  sc.may_swap = 1;
  sc.nr_mapped = read_page_state(nr_mapped);
 
+ wakeup_sc = sc;
+ wakeup_sc.pps_cmd = 1;
+ wake_up_interruptible(&kppsd_wait);
+
  inc_page_state(pageoutrun);
 
  for (i = 0; i < pgdat->nr_zones; i++) {
@@ -1693,6 +2122,33 @@
  return total_reclaimed;
 }
 
+static int kppsd(void* p)
+{
+ struct task_struct *tsk = current;
+ int timeout;
+ DEFINE_WAIT(wait);
+ daemonize("kppsd");
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ struct scan_control default_sc;
+ default_sc.gfp_mask = GFP_KERNEL;
+ default_sc.may_writepage = 1;
+ default_sc.may_swap = 1;
+ default_sc.pps_cmd = 0;
+
+ while (1) {
+  try_to_freeze();
+  prepare_to_wait(&kppsd_wait, &wait, TASK_INTERRUPTIBLE);
+  timeout = schedule_timeout(2000);
+  finish_wait(&kppsd_wait, &wait);
+
+  if (timeout)
+   shrink_private_vma(&wakeup_sc);
+  else
+   shrink_private_vma(&default_sc);
+ }
+ return 0;
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process. 
@@ -1837,6 +2293,15 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+static int __init kppsd_init(void)
+{
+ init_waitqueue_head(&kppsd_wait);
+ kernel_thread(kppsd, NULL, CLONE_KERNEL);
+ return 0;
+}
+
+module_init(kppsd_init)
+
 static int __init kswapd_init(void)
 {
  pg_data_t *pgdat;

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 2884 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2007-01-10  2:51 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-12-26  8:18 [PATCH 2.6.16.29 1/1] memory: enhance Linux swap subsystem yunfeng zhang
2006-12-26  9:03 ` Zhou Yingchao
2006-12-27  3:33   ` yunfeng zhang
2006-12-30  5:50     ` Zhou Yingchao
2007-01-05  7:35       ` yunfeng zhang
2007-01-05  7:45         ` yunfeng zhang
2006-12-27  3:38 ` yunfeng zhang
2006-12-27 18:44 ` Pavel Machek
2006-12-29  6:45   ` yunfeng zhang
2006-12-29  9:15     ` Pavel Machek
2006-12-29 15:50       ` Randy Dunlap
2007-01-05 23:10 zyf.zeroos
2007-01-09  8:27 ` yunfeng zhang
     [not found] <200701092047.43248.a1426z@gawab.com>
2007-01-10  2:51 ` yunfeng zhang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.