* [PATCH] Snapshot of shared page tables
@ 2002-10-02 14:57 Dave McCracken
  2002-10-02 16:51 ` Daniel Phillips
  2002-10-02 22:39 ` Paul Mackerras
  0 siblings, 2 replies; 6+ messages in thread
From: Dave McCracken @ 2002-10-02 14:57 UTC (permalink / raw)
  To: Linux Memory Management, Linux Kernel

[-- Attachment #1: Type: text/plain, Size: 499 bytes --]


Ok, here it is.  This patch works for my simple tests, both under UP and
SMP, including under memory pressure.  I'd appreciate anyone who'd like to
take it and beat on it.  Please let me know of any problems you find.

The patch is against this morning's 2.5 BK tree.

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

[-- Attachment #2: shpte-2.5.40-1.diff --]
[-- Type: text/plain, Size: 38601 bytes --]

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.674   -> 1.675  
#	  include/linux/mm.h	1.83.1.2 -> 1.86   
#	include/asm-i386/pgalloc.h	1.16.1.2 -> 1.19   
#	       kernel/fork.c	1.70.1.15 -> 1.76   
#	            Makefile	1.293.1.22 -> 1.302  
#	include/linux/rmap-locking.h	1.1     -> 1.2    
#	         init/main.c	1.64.1.6 -> 1.69   
#	       mm/swapfile.c	1.54.1.2 -> 1.58   
#	           fs/exec.c	1.45.1.4 -> 1.49   
#	         mm/memory.c	1.84.1.6 -> 1.92   
#	include/asm-generic/rmap.h	1.2.1.1 -> 1.6    
#	include/linux/page-flags.h	1.24.1.1 -> 1.27   
#	include/asm-i386/pgtable.h	1.17.1.3 -> 1.20   
#	           mm/rmap.c	1.12.1.4 -> 1.20   
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/10/02	dmc@baldur.austin.ibm.com	1.663.2.2
# Check in working snapshot
# --------------------------------------------
# 02/10/02	dmc@baldur.austin.ibm.com	1.675
# Merge baldur.austin.ibm.com:/home/dmc/linux/bk/linux-2.5
# into baldur.austin.ibm.com:/home/dmc/linux/bk/linux-2.5-shpte
# --------------------------------------------
#
diff -Nru a/Makefile b/Makefile
--- a/Makefile	Wed Oct  2 09:47:17 2002
+++ b/Makefile	Wed Oct  2 09:47:17 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 5
 SUBLEVEL = 40
-EXTRAVERSION =
+EXTRAVERSION =-shpte
 
 # *DOCUMENTATION*
 # Too see a list of typical targets execute "make help"
diff -Nru a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c	Wed Oct  2 09:47:17 2002
+++ b/fs/exec.c	Wed Oct  2 09:47:17 2002
@@ -308,8 +308,8 @@
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
 	page_add_rmap(page, pte);
+	increment_rss(virt_to_page(pte));
 	pte_unmap(pte);
-	tsk->mm->rss++;
 	spin_unlock(&tsk->mm->page_table_lock);
 
 	/* no need for flush_tlb */
diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h
--- a/include/asm-generic/rmap.h	Wed Oct  2 09:47:17 2002
+++ b/include/asm-generic/rmap.h	Wed Oct  2 09:47:17 2002
@@ -26,33 +26,6 @@
  */
 #include <linux/mm.h>
 
-static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address)
-{
-#ifdef BROKEN_PPC_PTE_ALLOC_ONE
-	/* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
-	extern int mem_init_done;
-
-	if (!mem_init_done)
-		return;
-#endif
-	page->mapping = (void *)mm;
-	page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
-	inc_page_state(nr_page_table_pages);
-}
-
-static inline void pgtable_remove_rmap(struct page * page)
-{
-	page->mapping = NULL;
-	page->index = 0;
-	dec_page_state(nr_page_table_pages);
-}
-
-static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
-{
-	struct page * page = kmap_atomic_to_page(ptep);
-	return (struct mm_struct *) page->mapping;
-}
-
 static inline unsigned long ptep_to_address(pte_t * ptep)
 {
 	struct page * page = kmap_atomic_to_page(ptep);
@@ -86,5 +59,11 @@
 	return;
 }
 #endif
+
+extern void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address);
+extern void pgtable_add_rmap_locked(struct page * page, struct mm_struct * mm, unsigned long address);
+extern void pgtable_remove_rmap(struct page * page, struct mm_struct *mm);
+extern void pgtable_remove_rmap_locked(struct page * page, struct mm_struct *mm);
+extern void increment_rss(struct page *ptepage);
 
 #endif /* _GENERIC_RMAP_H */
diff -Nru a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
--- a/include/asm-i386/pgalloc.h	Wed Oct  2 09:47:17 2002
+++ b/include/asm-i386/pgalloc.h	Wed Oct  2 09:47:17 2002
@@ -16,6 +16,13 @@
 		((unsigned long long)page_to_pfn(pte) <<
 			(unsigned long long) PAGE_SHIFT)));
 }
+
+static inline void pmd_populate_rdonly(struct mm_struct *mm, pmd_t *pmd, struct page *page)
+{
+	set_pmd(pmd, __pmd(_PAGE_TABLE_RDONLY +
+		((unsigned long long)page_to_pfn(page) <<
+			(unsigned long long) PAGE_SHIFT)));
+}
 /*
  * Allocate and free page tables.
  */
diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
--- a/include/asm-i386/pgtable.h	Wed Oct  2 09:47:17 2002
+++ b/include/asm-i386/pgtable.h	Wed Oct  2 09:47:17 2002
@@ -124,6 +124,7 @@
 #define _PAGE_PROTNONE	0x080	/* If not present */
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_TABLE_RDONLY	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 
@@ -184,8 +185,8 @@
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
-
+#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_RW)) != \
+			(_KERNPG_TABLE & ~_PAGE_RW))
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
 
@@ -209,6 +210,8 @@
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
+static inline int pmd_write(pmd_t pmd)		{ return (pmd).pmd & _PAGE_RW; }
+static inline pmd_t pmd_wrprotect(pmd_t pmd)	{ (pmd).pmd &= ~_PAGE_RW; return pmd; }
 
 static inline  int ptep_test_and_clear_dirty(pte_t *ptep)	{ return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); }
 static inline  int ptep_test_and_clear_young(pte_t *ptep)	{ return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); }
@@ -263,6 +266,10 @@
 	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address))
 #define pte_offset_map_nested(dir, address) \
 	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address))
+#define pte_page_map(__page, address) \
+	((pte_t *)kmap_atomic(__page,KM_PTE0) + __pte_offset(address))
+#define pte_page_map_nested(__page, address) \
+	((pte_t *)kmap_atomic(__page,KM_PTE1) + __pte_offset(address))
 #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
 #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
 
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h	Wed Oct  2 09:47:17 2002
+++ b/include/linux/mm.h	Wed Oct  2 09:47:17 2002
@@ -163,6 +163,8 @@
 		struct pte_chain *chain;/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock */
 		pte_addr_t direct;
+		struct mm_chain *mmchain;/* Reverse mm_struct mapping pointer */
+		struct mm_struct *mmdirect;
 	} pte;
 	unsigned long private;		/* mapping-private opaque data */
 
diff -Nru a/include/linux/rmap-locking.h b/include/linux/rmap-locking.h
--- a/include/linux/rmap-locking.h	Wed Oct  2 09:47:17 2002
+++ b/include/linux/rmap-locking.h	Wed Oct  2 09:47:17 2002
@@ -31,3 +31,6 @@
 #endif
 	preempt_enable();
 }
+
+#define	pte_page_lock	pte_chain_lock
+#define	pte_page_unlock	pte_chain_unlock
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c	Wed Oct  2 09:47:17 2002
+++ b/init/main.c	Wed Oct  2 09:47:17 2002
@@ -511,9 +511,7 @@
 static void do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
-#ifdef CONFIG_SMP
-	extern int migration_init(void);
-
+#if CONFIG_SMP
 	migration_init();
 #endif
 	spawn_ksoftirqd();
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c	Wed Oct  2 09:47:17 2002
+++ b/kernel/fork.c	Wed Oct  2 09:47:17 2002
@@ -205,6 +205,7 @@
 	struct vm_area_struct * mpnt, *tmp, **pprev;
 	int retval;
 	unsigned long charge = 0;
+	pmd_t *prev_pmd = 0;
 
 	flush_cache_mm(current->mm);
 	mm->locked_vm = 0;
@@ -267,7 +268,7 @@
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 		mm->map_count++;
-		retval = copy_page_range(mm, current->mm, tmp);
+		retval = share_page_range(mm, current->mm, tmp, &prev_pmd);
 		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
diff -Nru a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c	Wed Oct  2 09:47:17 2002
+++ b/mm/memory.c	Wed Oct  2 09:47:17 2002
@@ -36,6 +36,18 @@
  *		(Gerhard.Wichert@pdb.siemens.de)
  */
 
+/*
+ * A note on locking of the page table structure:
+ *
+ *  The top level lock that protects the page table is the mm->page_table_lock.
+ *  This lock protects the pgd and pmd layer.  However, with the advent of shared
+ *  pte pages, this lock is not sufficient.  The pte layer is now protected by the
+ *  pte_page_lock, set in the struct page of the pte page.  Note that with this
+ *  locking scheme, once the pgd and pmd layers have been set in the page fault
+ *  path and the pte_page_lock has been taken, the page_table_lock can be released.
+ * 
+ */
+
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -44,6 +56,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/vcache.h>
+#include <linux/rmap-locking.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -83,7 +96,7 @@
  */
 static inline void free_one_pmd(mmu_gather_t *tlb, pmd_t * dir)
 {
-	struct page *page;
+	struct page *ptepage;
 
 	if (pmd_none(*dir))
 		return;
@@ -92,10 +105,13 @@
 		pmd_clear(dir);
 		return;
 	}
-	page = pmd_page(*dir);
+	ptepage = pmd_page(*dir);
 	pmd_clear(dir);
-	pgtable_remove_rmap(page);
-	pte_free_tlb(tlb, page);
+	pgtable_remove_rmap(ptepage, tlb->mm);
+	if (page_count(ptepage) == 1) {
+		dec_page_state(nr_page_table_pages);
+	}
+	pte_free_tlb(tlb, ptepage);
 }
 
 static inline void free_one_pgd(mmu_gather_t *tlb, pgd_t * dir)
@@ -136,6 +152,190 @@
 	} while (--nr);
 }
 
+/*
+ * This function makes the decision whether a pte page needs to be unshared
+ * or not.  Note that page_count() == 1 isn't even tested here.  The assumption
+ * is that if the pmd entry is marked writeable, then the page is either already
+ * unshared or doesn't need to be unshared.  This catches the situation where
+ * task B unshares the pte page, then task A faults and needs to unprotect the
+ * pmd entry.  This is actually done in pte_unshare.
+ *
+ * This function should be called with the page_table_lock held.
+ */
+static inline int pte_needs_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+				    pmd_t *pmd, unsigned long address, int write_access)
+{
+	struct page *ptepage;
+
+	/* It's not even there */
+	if (!pmd_present(*pmd))
+		return 0;
+
+	/* If it's already writable, then it doesn't need to be unshared. */
+	if (pmd_write(*pmd))
+		return 0;
+
+	/* If this isn't a write fault we don't need to unshare. */
+	if (!write_access)
+		return 0;
+
+	/*
+	 * If this page fits entirely inside a shared region, don't unshare it.
+	 */
+	ptepage = pmd_page(*pmd);
+	pte_page_lock(ptepage);
+	if (((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
+	    && (vma->vm_start <= ptepage->index)
+	    && (vma->vm_end >= (ptepage->index + PGDIR_SIZE))) {
+		pte_page_unlock(ptepage);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Here is where a pte page is actually unshared.  It actually covers a couple of
+ * possible conditions.  If the page_count() is already 1, then that means it just
+ * needs to be set writeable.  Otherwise, a new page needs to be allocated.
+ *
+ * When each pte entry is copied, it is evaluated for COW protection, as well as
+ * checking whether the swap count needs to be incremented.
+ *
+ * This function must be called with the page_table_lock held.  It will release
+ * the lock when it allocates a new page.
+ *
+ * When it returns, it has taken the pte_page_lock if there was no error.
+ */
+
+static pte_t *pte_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	pte_t	*src_ptb, *dst_ptb;
+	struct page *oldpage, *newpage;
+	struct vm_area_struct *vma;
+	int	base, addr;
+	int	end, page_end;
+	int	src_unshare;
+
+	oldpage = pmd_page(*pmd);
+
+	/* If it's already unshared, we just need to set it writeable */
+	if (page_count(oldpage) == 1) {
+		pmd_populate(mm, pmd, oldpage);
+		flush_tlb_mm(mm);
+		goto out_map;
+	}
+
+	pte_page_unlock(oldpage);
+	spin_unlock(&mm->page_table_lock);
+	newpage = pte_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!newpage))
+		return NULL;
+
+	pte_page_lock(oldpage);
+
+	/* See if it got unshared while we dropped the lock */
+	if (page_count(oldpage) == 1) {
+		pte_free(newpage);
+		goto out_map;
+	}
+
+	pte_page_lock(newpage);
+
+	base = addr = oldpage->index;
+	page_end = base + PGDIR_SIZE;
+	vma = find_vma(mm, base);
+	if (!vma || (page_end <= vma->vm_start))
+		BUG(); 		/* No valid pages in this pte page */
+
+	src_unshare = page_count(oldpage) == 2;
+	dst_ptb = pte_page_map(newpage, base);
+	src_ptb = pte_page_map_nested(oldpage, base);
+
+	if (vma->vm_start > addr)
+		addr = vma->vm_start;
+
+	if (vma->vm_end < page_end)
+		end = vma->vm_end;
+	else
+		end = page_end;
+
+	do {
+		unsigned int cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+		pte_t *src_pte = src_ptb + __pte_offset(addr);
+		pte_t *dst_pte = dst_ptb + __pte_offset(addr);
+
+		do {
+			pte_t pte = *src_pte;
+			struct page *page;
+
+			if (pte_none(pte))
+				goto unshare_skip_set;
+
+			if (!pte_present(pte)) {
+				swap_duplicate(pte_to_swp_entry(pte));
+				set_pte(dst_pte, pte);
+				goto unshare_skip_set;
+			}
+			page = pte_page(pte);
+			if (!PageReserved(page)) {
+				/* COW mappings require write protecting both sides */
+				if (cow) {
+					pte = pte_wrprotect(pte);
+					if (src_unshare)
+						set_pte(src_pte, pte);
+				}
+				/* If it's a shared mapping,
+				 *  mark it clean in the new mapping
+				 */
+				if (vma->vm_flags & VM_SHARED)
+					pte = pte_mkclean(pte);
+				pte = pte_mkold(pte);
+				get_page(page);
+			}
+			set_pte(dst_pte, pte);
+			page_add_rmap(page, dst_pte);
+unshare_skip_set:
+			src_pte++;
+			dst_pte++;
+			addr += PAGE_SIZE;
+		} while (addr < end);
+
+		if (addr >= page_end)
+			break;
+
+		vma = vma->vm_next;
+		if (!vma)
+			break;
+
+		if (page_end <= vma->vm_start)
+			break;
+
+		addr = vma->vm_start;
+		if (vma->vm_end < page_end)
+			end = vma->vm_end;
+		else
+			end = page_end;
+	} while (1);
+
+	pte_unmap_nested(src_ptb);
+
+	pgtable_remove_rmap_locked(oldpage, mm);
+	pgtable_add_rmap_locked(newpage, mm, base);
+	pmd_populate(mm, pmd, newpage);
+	inc_page_state(nr_page_table_pages);
+
+	flush_tlb_mm(mm);
+
+	put_page(oldpage);
+	pte_page_unlock(oldpage);
+
+	return dst_ptb + __pte_offset(address);
+
+out_map:
+	return pte_offset_map(pmd, address);
+}
+
 pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	if (!pmd_present(*pmd)) {
@@ -157,11 +357,10 @@
 		}
 		pgtable_add_rmap(new, mm, address);
 		pmd_populate(mm, pmd, new);
+		inc_page_state(nr_page_table_pages);
 	}
 out:
-	if (pmd_present(*pmd))
-		return pte_offset_map(pmd, address);
-	return NULL;
+	return pte_offset_map(pmd, address);
 }
 
 pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
@@ -183,7 +382,6 @@
 			pte_free_kernel(new);
 			goto out;
 		}
-		pgtable_add_rmap(virt_to_page(new), mm, address);
 		pmd_populate_kernel(mm, pmd, new);
 	}
 out:
@@ -192,28 +390,14 @@
 #define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
 #define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
 
-/*
- * copy one vm_area from one task to the other. Assumes the page tables
- * already present in the new task to be cleared in the whole range
- * covered by this vma.
- *
- * 08Jan98 Merged into one routine from several inline routines to reduce
- *         variable count and make things faster. -jj
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within pmd_alloc() and pte_alloc_map().
- */
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma)
+int share_page_range(struct mm_struct *dst, struct mm_struct *src,
+	struct vm_area_struct *vma, pmd_t **prev_pmd)
 {
-	pgd_t * src_pgd, * dst_pgd;
+	pgd_t *src_pgd, *dst_pgd;
 	unsigned long address = vma->vm_start;
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-
-	if (is_vm_hugetlb_page(vma))
-		return copy_hugetlb_page_range(dst, src, vma);
-
+
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
 
@@ -222,14 +406,12 @@
 
 		src_pgd++; dst_pgd++;

-		/* copy_pmd_range */
-
 		if (pgd_none(*src_pgd))
-			goto skip_copy_pmd_range;
+			goto skip_share_pmd_range;
 		if (pgd_bad(*src_pgd)) {
 			pgd_ERROR(*src_pgd);
 			pgd_clear(src_pgd);
-skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
+skip_share_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
 			if (!address || (address >= end))
 				goto out;
 			continue;
@@ -240,84 +422,47 @@
 		if (!dst_pmd)
 			goto nomem;
 
+		spin_lock(&src->page_table_lock);
+
 		do {
-			pte_t * src_pte, * dst_pte;
-
-			/* copy_pte_range */
-
-			if (pmd_none(*src_pmd))
-				goto skip_copy_pte_range;
-			if (pmd_bad(*src_pmd)) {
+			pmd_t	pmdval = *src_pmd;
+			struct page *ptepage = pmd_page(pmdval);
+
+			if (pmd_none(pmdval))
+				goto skip_share_pte_range;
+			if (pmd_bad(pmdval)) {
 				pmd_ERROR(*src_pmd);
 				pmd_clear(src_pmd);
-skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
-				if (address >= end)
-					goto out;
-				goto cont_copy_pmd_range;
+				goto skip_share_pte_range;
 			}
 
-			dst_pte = pte_alloc_map(dst, dst_pmd, address);
-			if (!dst_pte)
-				goto nomem;
-			spin_lock(&src->page_table_lock);
-			src_pte = pte_offset_map_nested(src_pmd, address);
-			do {
-				pte_t pte = *src_pte;
-				struct page *ptepage;
-				unsigned long pfn;
-
-				/* copy_one_pte */
-
-				if (pte_none(pte))
-					goto cont_copy_pte_range_noset;
-				/* pte contains position in swap, so copy. */
-				if (!pte_present(pte)) {
-					swap_duplicate(pte_to_swp_entry(pte));
-					set_pte(dst_pte, pte);
-					goto cont_copy_pte_range_noset;
-				}
-				ptepage = pte_page(pte);
-				pfn = pte_pfn(pte);
-				if (!pfn_valid(pfn))
-					goto cont_copy_pte_range;
-				ptepage = pfn_to_page(pfn);
-				if (PageReserved(ptepage))
-					goto cont_copy_pte_range;
-
-				/* If it's a COW mapping, write protect it both in the parent and the child */
-				if (cow) {
-					ptep_set_wrprotect(src_pte);
-					pte = *src_pte;
-				}
+			if (cow) {
+				pmdval = pmd_wrprotect(pmdval);
+				set_pmd(src_pmd, pmdval);
+			}
+			set_pmd(dst_pmd, pmdval);
 
-				/* If it's a shared mapping, mark it clean in the child */
-				if (vma->vm_flags & VM_SHARED)
-					pte = pte_mkclean(pte);
-				pte = pte_mkold(pte);
+			/* Only do this if we haven't seen this pte page before */
+			if (src_pmd != *prev_pmd) {
 				get_page(ptepage);
-				dst->rss++;
+				pgtable_add_rmap(ptepage, dst, address);
+				*prev_pmd = src_pmd;
+				dst->rss += ptepage->private;
+			}
 
-cont_copy_pte_range:		set_pte(dst_pte, pte);
-				page_add_rmap(ptepage, dst_pte);
-cont_copy_pte_range_noset:	address += PAGE_SIZE;
-				if (address >= end) {
-					pte_unmap_nested(src_pte);
-					pte_unmap(dst_pte);
-					goto out_unlock;
-				}
-				src_pte++;
-				dst_pte++;
-			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
-			pte_unmap_nested(src_pte-1);
-			pte_unmap(dst_pte-1);
-			spin_unlock(&src->page_table_lock);
-
-cont_copy_pmd_range:	src_pmd++;
+skip_share_pte_range:	address = (address + PMD_SIZE) & PMD_MASK;
+			if (address >= end)
+				goto out_unlock;
+
+			src_pmd++;
 			dst_pmd++;
 		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
+		spin_unlock(&src->page_table_lock);
 	}
+
 out_unlock:
 	spin_unlock(&src->page_table_lock);
+
 out:
 	return 0;
 nomem:
@@ -326,6 +471,7 @@
 
 static void zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
 {
+	struct page *ptepage;
 	unsigned long offset;
 	pte_t *ptep;
 
@@ -336,11 +482,34 @@
 		pmd_clear(pmd);
 		return;
 	}
-	ptep = pte_offset_map(pmd, address);
+
 	offset = address & ~PMD_MASK;
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
 	size &= PAGE_MASK;
+
+	/*
+	 * Check to see if the pte page is shared.  If it is and we're unmapping
+	 * the entire page, just decrement the reference count and we're done.
+	 * If we're only unmapping part of the page we'll have to unshare it the
+	 * slow way.
+	 */
+	ptepage = pmd_page(*pmd);
+	pte_page_lock(ptepage);
+	if (page_count(ptepage) > 1) {
+		if ((offset == 0) && (size == PMD_SIZE)) {
+			pmd_clear(pmd);
+			pgtable_remove_rmap_locked(ptepage, tlb->mm);
+			tlb->mm->rss -= ptepage->private;
+			put_page(ptepage);
+			pte_page_unlock(ptepage);
+			return;
+		}
+		ptep = pte_unshare(tlb->mm, pmd, address);
+		ptepage = pmd_page(*pmd);
+	} else {
+		ptep = pte_offset_map(pmd, address);
+	}
 	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
 		pte_t pte = *ptep;
 		if (pte_none(pte))
@@ -365,6 +534,7 @@
 			pte_clear(ptep);
 		}
 	}
+	pte_page_unlock(ptepage);
 	pte_unmap(ptep-1);
 }
 
@@ -460,6 +630,19 @@
 	spin_unlock(&mm->page_table_lock);
 }
 
+void unmap_all_pages(mmu_gather_t *tlb, struct mm_struct *mm, unsigned long address, unsigned long end)
+{
+	pgd_t * dir;
+
+	if (address >= end)
+		BUG();
+	dir = pgd_offset(mm, address);
+	do {
+		zap_pmd_range(tlb, dir, address, end - address);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		dir++;
+	} while (address && (address < end));
+}
 /*
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
@@ -1005,6 +1188,7 @@
 	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
 	struct page *old_page, *new_page;
+	struct page *ptepage = pmd_page(*pmd);
 	unsigned long pfn = pte_pfn(pte);
 
 	if (!pfn_valid(pfn))
@@ -1018,7 +1202,7 @@
 			flush_cache_page(vma, address);
 			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
+			pte_page_unlock(ptepage);
 			return VM_FAULT_MINOR;
 		}
 	}
@@ -1028,7 +1212,7 @@
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 
 	new_page = alloc_page(GFP_HIGHUSER);
 	if (!new_page)
@@ -1038,11 +1222,11 @@
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	spin_lock(&mm->page_table_lock);
+	pte_page_lock(ptepage);
 	page_table = pte_offset_map(pmd, address);
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
-			++mm->rss;
+			increment_rss(ptepage);
 		page_remove_rmap(old_page, page_table);
 		break_cow(vma, new_page, address, page_table);
 		page_add_rmap(new_page, page_table);
@@ -1052,14 +1236,14 @@
 		new_page = old_page;
 	}
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return VM_FAULT_MINOR;
 
 bad_wp_page:
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address);
 	/*
 	 * This should really halt the system so it can be debugged or
@@ -1188,12 +1372,13 @@
 	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
 {
 	struct page *page;
+	struct page *ptepage = pmd_page(*pmd);
 	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	page = lookup_swap_cache(entry);
 	if (!page) {
 		swapin_readahead(entry);
@@ -1203,14 +1388,14 @@
 			 * Back out if somebody else faulted in this pte while
 			 * we released the page table lock.
 			 */
-			spin_lock(&mm->page_table_lock);
+			pte_page_lock(ptepage);
 			page_table = pte_offset_map(pmd, address);
 			if (pte_same(*page_table, orig_pte))
 				ret = VM_FAULT_OOM;
 			else
 				ret = VM_FAULT_MINOR;
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
+			pte_page_unlock(ptepage);
 			return ret;
 		}
 
@@ -1226,11 +1411,11 @@
 	 * Back out if somebody else faulted in this pte while we
 	 * released the page table lock.
 	 */
-	spin_lock(&mm->page_table_lock);
+	pte_page_lock(ptepage);
 	page_table = pte_offset_map(pmd, address);
 	if (!pte_same(*page_table, orig_pte)) {
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
+		pte_page_unlock(ptepage);
 		unlock_page(page);
 		page_cache_release(page);
 		return VM_FAULT_MINOR;
@@ -1242,7 +1427,7 @@
 	if (vm_swap_full())
 		remove_exclusive_swap_page(page);
 
-	mm->rss++;
+	increment_rss(ptepage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page))
 		pte = pte_mkdirty(pte_mkwrite(pte));
@@ -1256,7 +1441,7 @@
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	return ret;
 }
 
@@ -1269,6 +1454,7 @@
 {
 	pte_t entry;
 	struct page * page = ZERO_PAGE(addr);
+	struct page *ptepage = pmd_page(*pmd);
 
 	/* Read-only mapping of ZERO_PAGE. */
 	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
@@ -1277,23 +1463,23 @@
 	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
+		pte_page_unlock(ptepage);
 
 		page = alloc_page(GFP_HIGHUSER);
 		if (!page)
 			goto no_mem;
 		clear_user_highpage(page, addr);
 
-		spin_lock(&mm->page_table_lock);
+		pte_page_lock(ptepage);
 		page_table = pte_offset_map(pmd, addr);
 
 		if (!pte_none(*page_table)) {
 			pte_unmap(page_table);
 			page_cache_release(page);
-			spin_unlock(&mm->page_table_lock);
+			pte_page_unlock(ptepage);
 			return VM_FAULT_MINOR;
 		}
-		mm->rss++;
+		increment_rss(ptepage);
 		flush_page_to_ram(page);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add(page);
@@ -1306,7 +1492,7 @@
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, entry);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	return VM_FAULT_MINOR;
 
 no_mem:
@@ -1329,12 +1515,13 @@
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct page *ptepage = pmd_page(*pmd);
 	pte_t entry;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
 		return do_anonymous_page(mm, vma, page_table, pmd, write_access, address);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
@@ -1359,7 +1546,7 @@
 		new_page = page;
 	}
 
-	spin_lock(&mm->page_table_lock);
+	pte_page_lock(ptepage);
 	page_table = pte_offset_map(pmd, address);
 
 	/*
@@ -1374,7 +1561,7 @@
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
-		++mm->rss;
+		increment_rss(ptepage);
 		flush_page_to_ram(new_page);
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1387,13 +1574,13 @@
 		/* One of our sibling threads was faster, back out. */
 		pte_unmap(page_table);
 		page_cache_release(new_page);
-		spin_unlock(&mm->page_table_lock);
+		pte_page_unlock(ptepage);
 		return VM_FAULT_MINOR;
 	}
 
 	/* no need to invalidate: a not-present page shouldn't be cached */
 	update_mmu_cache(vma, address, entry);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	return VM_FAULT_MAJOR;
 }
 
@@ -1445,7 +1632,7 @@
 	entry = pte_mkyoung(entry);
 	establish_pte(vma, address, pte, entry);
 	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(pmd_page(*pmd));
 	return VM_FAULT_MINOR;
 }
 
@@ -1470,9 +1657,19 @@
 	pmd = pmd_alloc(mm, pgd, address);
 
 	if (pmd) {
-		pte_t * pte = pte_alloc_map(mm, pmd, address);
-		if (pte)
+		pte_t * pte;
+
+		if (pte_needs_unshare(mm, vma, pmd, address, write_access))
+			pte = pte_unshare(mm, pmd, address);
+		else {
+			pte = pte_alloc_map(mm, pmd, address);
+			pte_page_lock(pmd_page(*pmd));
+		}
+
+		if (pte) {
+			spin_unlock(&mm->page_table_lock);
 			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	return VM_FAULT_OOM;
diff -Nru a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c	Wed Oct  2 09:47:17 2002
+++ b/mm/rmap.c	Wed Oct  2 09:47:17 2002
@@ -45,11 +45,17 @@
  */
 #define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t))
 
+struct mm_chain {
+	struct mm_chain *next;
+	struct mm_struct *mm;
+};
+
 struct pte_chain {
 	struct pte_chain *next;
 	pte_addr_t ptes[NRPTE];
 };
 
+static kmem_cache_t	*mm_chain_cache;
 static kmem_cache_t	*pte_chain_cache;
 
 /*
@@ -102,6 +108,25 @@
 	kmem_cache_free(pte_chain_cache, pte_chain);
 }
 
+static inline struct mm_chain *mm_chain_alloc(void)
+{
+	struct mm_chain *ret;
+
+	ret = kmem_cache_alloc(mm_chain_cache, GFP_ATOMIC);
+	return ret;
+}
+
+static void mm_chain_free(struct mm_chain *mc,
+		struct mm_chain *prev_mc, struct page *page)
+{
+	if (prev_mc)
+		prev_mc->next = mc->next;
+	else if (page)
+		page->pte.mmchain = mc->next;
+
+	kmem_cache_free(mm_chain_cache, mc);
+}
+
 /**
  ** VM stuff below this comment
  **/
@@ -161,6 +186,94 @@
 	return referenced;
 }
 
+void pgtable_add_rmap_locked(struct page * page, struct mm_struct * mm,
+			     unsigned long address)
+{
+	struct mm_chain *mc;
+
+#ifdef BROKEN_PPC_PTE_ALLOC_ONE
+	/* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
+	extern int mem_init_done;
+
+	if (!mem_init_done)
+		return;
+#endif
+#ifdef RMAP_DEBUG
+	BUG_ON(mm == NULL);
+#endif
+
+	if (PageDirect(page)) {
+		mc = mm_chain_alloc();
+		mc->mm = page->pte.mmdirect;
+		mc->next = NULL;
+		page->pte.mmchain = mc;
+		ClearPageDirect(page);
+	}
+	if (page->pte.mmchain) {
+		/* Hook up the mm_chain to the page. */
+		mc = mm_chain_alloc();
+		mc->mm = mm;
+		mc->next = page->pte.mmchain;
+		page->pte.mmchain = mc;
+	} else {
+		page->pte.mmdirect = mm;
+		SetPageDirect(page);
+		page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+	}
+}
+
+void pgtable_remove_rmap_locked(struct page * page, struct mm_struct *mm)
+{
+	struct mm_chain * mc, * prev_mc = NULL;
+
+#ifdef DEBUG_RMAP
+	BUG_ON(mm == NULL);
+#endif
+
+	if (PageDirect(page)) {
+		if (page->pte.mmdirect == mm) {
+			page->pte.mmdirect = NULL;
+			ClearPageDirect(page);
+			page->index = 0;
+			goto out;
+		}
+	} else {
+#ifdef DEBUG_RMAP
+		BUG_ON(page->pte.mmchain->next == NULL);
+#endif
+		for (mc = page->pte.mmchain; mc; prev_mc = mc, mc = mc->next) {
+			if (mc->mm == mm) {
+				mm_chain_free(mc, prev_mc, page);
+				/* Check whether we can convert to direct */
+				mc = page->pte.mmchain;
+				if (!mc->next) {
+					page->pte.mmdirect = mc->mm;
+					SetPageDirect(page);
+					mm_chain_free(mc, NULL, NULL);
+				}
+				goto out;
+			}
+		}
+	}
+	BUG();
+out:
+}
+
+void pgtable_add_rmap(struct page * page, struct mm_struct * mm,
+			     unsigned long address)
+{
+	pte_page_lock(page);
+	pgtable_add_rmap_locked(page, mm, address);
+	pte_page_unlock(page);
+}
+
+void pgtable_remove_rmap(struct page * page, struct mm_struct *mm)
+{
+	pte_page_lock(page);
+	pgtable_remove_rmap_locked(page, mm);
+	pte_page_unlock(page);
+}
+
 /**
  * page_add_rmap - add reverse mapping entry to a page
  * @page: the page to add the mapping to
@@ -180,8 +293,6 @@
 		BUG();
 	if (!pte_present(*ptep))
 		BUG();
-	if (!ptep_to_mm(ptep))
-		BUG();
 #endif
 
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
@@ -199,12 +310,15 @@
 			if (page->pte.direct == pte_paddr)
 				BUG();
 		} else {
+			int count = 0;
 			for (pc = page->pte.chain; pc; pc = pc->next) {
-				for (i = 0; i < NRPTE; i++) {
+				for (i = 0; i < NRPTE; i++, count++) {
 					pte_addr_t p = pc->ptes[i];
 
-					if (p && p == pte_paddr)
+					if (p && p == pte_paddr) {
+						printk(KERN_ERR "page_add_rmap: page %08lx (count %d), ptep %08lx, rmap count %d\n", page, page_count(page), ptep, count);
 						BUG();
+					}
 				}
 			}
 		}
@@ -342,6 +456,98 @@
 	return;
 }
 
+static inline int pgtable_check_mlocked_mm(struct mm_struct *mm, unsigned long address)
+{
+	struct vm_area_struct *vma;
+	int ret = SWAP_SUCCESS;
+
+	/* During mremap, it's possible pages are not in a VMA. */
+	vma = find_vma(mm, address);
+	if (!vma) {
+		ret = SWAP_FAIL;
+		goto out;
+	}
+
+	/* The page is mlock()d, we cannot swap it out. */
+	if (vma->vm_flags & VM_LOCKED) {
+		ret = SWAP_FAIL;
+	}
+out:
+	return ret;
+}
+
+static inline int pgtable_check_mlocked(struct page *ptepage, unsigned long address)
+{
+	struct mm_chain *mc;
+	int ret = SWAP_SUCCESS;
+
+	if (PageDirect(ptepage)) {
+		ret = pgtable_check_mlocked_mm(ptepage->pte.mmdirect, address);
+		goto out;
+	}
+
+	for (mc = ptepage->pte.mmchain; mc; mc = mc->next) {
+#ifdef DEBUG_RMAP
+		BUG_ON(mc->mm == NULL);
+#endif
+		ret = pgtable_check_mlocked_mm(mc->mm, address);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	}
+out:
+	return ret;
+}
+
+static inline int pgtable_unmap_one_mm(struct mm_struct *mm, unsigned long address)
+{
+	struct vm_area_struct *vma;
+	int ret = SWAP_SUCCESS;
+
+	/* During mremap, it's possible pages are not in a VMA. */
+	vma = find_vma(mm, address);
+	if (!vma) {
+		ret = SWAP_FAIL;
+		goto out;
+	}
+	flush_tlb_page(vma, address);
+	flush_cache_page(vma, address);
+	mm->rss--;
+
+out:
+	return ret;
+}
+
+static inline int pgtable_unmap_one(struct page *ptepage, unsigned long address)
+{
+	struct mm_chain *mc;
+	int ret = SWAP_SUCCESS;
+
+	if (PageDirect(ptepage)) {
+		ret = pgtable_unmap_one_mm(ptepage->pte.mmdirect, address);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	} else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) {
+		ret = pgtable_unmap_one_mm(mc->mm, address);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	}
+	ptepage->private--;
+out:
+	return ret;
+}
+
+void increment_rss(struct page *ptepage)
+{
+	struct mm_chain *mc;
+
+	if (PageDirect(ptepage))
+		ptepage->pte.mmdirect->rss++;
+	else for (mc = ptepage->pte.mmchain; mc; mc = mc->next)
+		mc->mm->rss++;
+
+	ptepage->private++;
+}
+
 /**
  * try_to_unmap_one - worker function for try_to_unmap
  * @page: page to unmap
@@ -360,42 +566,24 @@
 static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
 {
 	pte_t *ptep = rmap_ptep_map(paddr);
-	unsigned long address = ptep_to_address(ptep);
-	struct mm_struct * mm = ptep_to_mm(ptep);
-	struct vm_area_struct * vma;
 	pte_t pte;
+	struct page *ptepage = kmap_atomic_to_page(ptep);
+	unsigned long address = ptep_to_address(ptep);
 	int ret;
 
-	if (!mm)
-		BUG();
+	pte_page_lock(ptepage);
 
-	/*
-	 * We need the page_table_lock to protect us from page faults,
-	 * munmap, fork, etc...
-	 */
-	if (!spin_trylock(&mm->page_table_lock)) {
-		rmap_ptep_unmap(ptep);
-		return SWAP_AGAIN;
-	}
-
-
-	/* During mremap, it's possible pages are not in a VMA. */
-	vma = find_vma(mm, address);
-	if (!vma) {
-		ret = SWAP_FAIL;
+	ret = pgtable_check_mlocked(ptepage, address);
+	if (ret != SWAP_SUCCESS)
 		goto out_unlock;
-	}
+	pte = ptep_get_and_clear(ptep);
 
-	/* The page is mlock()d, we cannot swap it out. */
-	if (vma->vm_flags & VM_LOCKED) {
-		ret = SWAP_FAIL;
+	ret = pgtable_unmap_one(ptepage, address);
+	if (ret != SWAP_SUCCESS) {
+		set_pte(ptep, pte);
 		goto out_unlock;
 	}
-
-	/* Nuke the page table entry. */
-	pte = ptep_get_and_clear(ptep);
-	flush_tlb_page(vma, address);
-	flush_cache_page(vma, address);
+	pte_page_unlock(ptepage);
 
 	/* Store the swap location in the pte. See handle_pte_fault() ... */
 	if (PageSwapCache(page)) {
@@ -408,13 +596,15 @@
 	if (pte_dirty(pte))
 		set_page_dirty(page);
 
-	mm->rss--;
 	page_cache_release(page);
 	ret = SWAP_SUCCESS;
+	goto out;
 
 out_unlock:
+	pte_page_unlock(ptepage);
+
+out:
 	rmap_ptep_unmap(ptep);
-	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
 
@@ -523,6 +713,17 @@
 
 void __init pte_chain_init(void)
 {
+
+	mm_chain_cache = kmem_cache_create(	"mm_chain",
+						sizeof(struct mm_chain),
+						0,
+						0,
+						NULL,
+						NULL);
+
+	if (!mm_chain_cache)
+		panic("failed to create mm_chain cache!\n");
+
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
 						sizeof(struct pte_chain),
 						0,
diff -Nru a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c	Wed Oct  2 09:47:17 2002
+++ b/mm/swapfile.c	Wed Oct  2 09:47:17 2002
@@ -371,7 +371,7 @@
  */
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
-	pte_t *dir, swp_entry_t entry, struct page* page)
+	pte_t *dir, swp_entry_t entry, struct page* page, pmd_t *pmd)
 {
 	pte_t pte = *dir;
 
@@ -383,7 +383,7 @@
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_rmap(page, dir);
 	swap_free(entry);
-	++vma->vm_mm->rss;
+	increment_rss(pmd_page(*pmd));
 }
 
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
@@ -408,7 +408,7 @@
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
+		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page, dir);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));


* Re: [PATCH] Snapshot of shared page tables
  2002-10-02 14:57 [PATCH] Snapshot of shared page tables Dave McCracken
@ 2002-10-02 16:51 ` Daniel Phillips
  2002-10-02 16:57   ` Dave McCracken
  2002-10-02 17:00   ` Daniel Phillips
  2002-10-02 22:39 ` Paul Mackerras
  1 sibling, 2 replies; 6+ messages in thread
From: Daniel Phillips @ 2002-10-02 16:51 UTC (permalink / raw)
  To: Dave McCracken, Linux Memory Management, Linux Kernel

On Wednesday 02 October 2002 16:57, Dave McCracken wrote:
> 
> Ok, here it is.  This patch works for my simple tests, both under UP and
> SMP, including under memory pressure.  I'd appreciate anyone who'd like to
> take it and beat on it.  Please let me know of any problems you find.
> 
> The patch is against this morning's 2.5 BK tree.

Interesting, you substituted pte_page_lock(ptepage) for mm->page_table_lock.
Could you wax poetic about that, please?

-- 
Daniel


* Re: [PATCH] Snapshot of shared page tables
  2002-10-02 16:51 ` Daniel Phillips
@ 2002-10-02 16:57   ` Dave McCracken
  2002-10-02 17:00   ` Daniel Phillips
  1 sibling, 0 replies; 6+ messages in thread
From: Dave McCracken @ 2002-10-02 16:57 UTC (permalink / raw)
  To: Daniel Phillips, Linux Memory Management, Linux Kernel


--On Wednesday, October 02, 2002 18:51:41 +0200 Daniel Phillips
<phillips@arcor.de> wrote:

> Interesting, you substituted pte_page_lock(ptepage) for
> mm->page_table_lock. Could you wax poetic about that, please?

Sure.  If a pte page is shared, the mm->page_table_lock is not sufficient
to protect the rest of the page fault.  Therefore we need a lock at the pte
page level.  The mm->page_table_lock is held during the page fault until we
have a valid and locked pte page we're working on, then it's dropped for
the rest of the fault.
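
A condensed sketch of that hand-off, pulled from the patch's changes to
handle_mm_fault() (simplified, error handling trimmed):

	pgd = pgd_offset(mm, address);
	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc(mm, pgd, address);
	if (pmd) {
		pte_t *pte;

		if (pte_needs_unshare(mm, vma, pmd, address, write_access))
			pte = pte_unshare(mm, pmd, address);	/* returns with pte_page_lock held */
		else {
			pte = pte_alloc_map(mm, pmd, address);
			pte_page_lock(pmd_page(*pmd));
		}
		if (pte) {
			/* pte page is locked; the mm-wide lock can be dropped */
			spin_unlock(&mm->page_table_lock);
			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
		}
	}
	spin_unlock(&mm->page_table_lock);
	return VM_FAULT_OOM;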

Feel free to poke holes in my logic, but I think it's the right locking
model for shared pte pages.

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059



* Re: [PATCH] Snapshot of shared page tables
  2002-10-02 16:51 ` Daniel Phillips
  2002-10-02 16:57   ` Dave McCracken
@ 2002-10-02 17:00   ` Daniel Phillips
  1 sibling, 0 replies; 6+ messages in thread
From: Daniel Phillips @ 2002-10-02 17:00 UTC (permalink / raw)
  To: Dave McCracken, Linux Memory Management, Linux Kernel

On Wednesday 02 October 2002 18:51, Daniel Phillips wrote:
> On Wednesday 02 October 2002 16:57, Dave McCracken wrote:
> > 
> > Ok, here it is.  This patch works for my simple tests, both under UP and
> > SMP, including under memory pressure.  I'd appreciate anyone who'd like to
> > take it and beat on it.  Please let me know of any problems you find.
> > 
> > The patch is against this morning's 2.5 BK tree.
> 
> Interesting, you substituted pte_page_lock(ptepage) for mm->page_table_lock.
> Could you wax poetic about that, please?

Never mind, I see the logic.  This reflects the fact that page_table_lock
is insufficient protection when pte pages are shared.  So you solved that
problem and at the same time improved the scalability for the general case
immensely, without adding any new overhead.  Very nice!

-- 
Daniel


* Re: [PATCH] Snapshot of shared page tables
  2002-10-02 14:57 [PATCH] Snapshot of shared page tables Dave McCracken
  2002-10-02 16:51 ` Daniel Phillips
@ 2002-10-02 22:39 ` Paul Mackerras
  2002-10-02 22:48   ` Dave McCracken
  1 sibling, 1 reply; 6+ messages in thread
From: Paul Mackerras @ 2002-10-02 22:39 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Linux Memory Management, Linux Kernel

Dave McCracken writes:

> Ok, here it is.  This patch works for my simple tests, both under UP and
> SMP, including under memory pressure.  I'd appreciate anyone who'd like to
> take it and beat on it.  Please let me know of any problems you find.

Interesting.  I notice that you are using the _PAGE_RW bit in the
PMDs.  Are you relying on the hardware to do anything with that bit,
or is it only used by software?

(If you are relying on the hardware to do something different when
_PAGE_RW is clear in the PMD, then your approach isn't portable.)

Paul.


* Re: [PATCH] Snapshot of shared page tables
  2002-10-02 22:39 ` Paul Mackerras
@ 2002-10-02 22:48   ` Dave McCracken
  0 siblings, 0 replies; 6+ messages in thread
From: Dave McCracken @ 2002-10-02 22:48 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linux Memory Management, Linux Kernel


--On Thursday, October 03, 2002 08:39:20 +1000 Paul Mackerras
<paulus@samba.org> wrote:

> Interesting.  I notice that you are using the _PAGE_RW bit in the
> PMDs.  Are you relying on the hardware to do anything with that bit,
> or is it only used by software?
> 
> (If you are relying on the hardware to do something different when
> _PAGE_RW is clear in the PMD, then your approach isn't portable.)

Yes, I am relying on the hardware.  I was under the impression that it was
pretty much universal that making the pmd read-only would make the hardware
treat all ptes under it as read-only.  This came out of a discussion on
lkml last winter where this assertion was made.
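
For reference, these are the i386 pieces of the patch that touch the bit,
condensed from asm-i386/pgtable.h and share_page_range(); whether other
architectures fault writes through a read-only pmd is exactly the open
question:

	/* asm-i386/pgtable.h: pmd-level helpers added by the patch */
	#define _PAGE_TABLE_RDONLY	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
	static inline int   pmd_write(pmd_t pmd)	{ return (pmd).pmd & _PAGE_RW; }
	static inline pmd_t pmd_wrprotect(pmd_t pmd)	{ (pmd).pmd &= ~_PAGE_RW; return pmd; }

	/* share_page_range(): a COW vma write-protects the shared pte page
	 * in both parent and child, so the first write faults and unshares. */
	if (cow) {
		pmdval = pmd_wrprotect(pmdval);
		set_pmd(src_pmd, pmdval);
	}
	set_pmd(dst_pmd, pmdval);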

Do you know of a page table-based architecture that doesn't have and honor
read-only protections at the pmd level?

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059


