linux-kernel.vger.kernel.org archive mirror
* [PATCH 1/1] Implement shared page tables
@ 2005-08-30 22:13 Dave McCracken
  2005-08-31 11:44 ` Hugh Dickins
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Dave McCracken @ 2005-08-30 22:13 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Kernel, Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 814 bytes --]


This patch implements page table sharing for all shared memory regions that
span an entire page table page.  It supports sharing at multiple page
levels, depending on the architecture.
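
As a rough illustration (not part of the patch), the kind of mapping this
targets is a MAP_SHARED file mapping placed and sized so that it covers at
least one full page table page.  The sketch below makes two assumptions:
2 MB for PMD_SIZE (x86_64) and a fixed, hypothetical attach address, since
sharing only happens between vmas with identical start, end, offset and
flags.

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define SHARE_ADDR ((void *)0x600000000000UL)	/* hypothetical, 2 MB aligned */
	#define SHARE_SIZE (2UL * 1024 * 1024)		/* assumes PMD_SIZE == 2 MB */

	/* Each cooperating process maps the same file with the same flags,
	 * offset and virtual address, so their vmas compare equal in the
	 * kernel's shareability test and the PTE page covering this 2 MB
	 * range can be shared instead of replicated per process. */
	static void *map_shared_region(const char *path)
	{
		int fd = open(path, O_RDWR);
		void *p;

		if (fd < 0)
			return MAP_FAILED;
		p = mmap(SHARE_ADDR, SHARE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_FIXED, fd, 0);
		close(fd);
		return p;
	}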

Performance testing has shown no degradation with this patch for tests with
small processes.  Preliminary tests with large benchmarks have shown as
much as 3% improvement in overall results.

For those familiar with the shared page table patch I did a couple of years
ago, this patch does not implement copy-on-write page tables for private
mappings.  Analysis showed the cost and complexity far outweighed any
potential benefit.

This version of the patch supports i386 and x86_64.  I have additional
patches to support ppc64, but they are not quite ready for public
consumption.

The patch is against 2.6.13.

Dave McCracken

[-- Attachment #2: shpt-generic-2.6.13-3.diff --]
[-- Type: text/plain, Size: 33107 bytes --]

--- 2.6.13/./arch/i386/Kconfig	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./arch/i386/Kconfig	2005-08-29 10:02:47.000000000 -0500
@@ -748,6 +748,18 @@ config X86_PAE
 	depends on HIGHMEM64G
 	default y
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+config PTSHARE_PTE
+	bool
+	depends on PTSHARE
+	default y
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
--- 2.6.13/./arch/x86_64/Kconfig	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./arch/x86_64/Kconfig	2005-08-29 10:02:47.000000000 -0500
@@ -240,6 +240,38 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+menu "Page table levels to share"
+	depends on PTSHARE
+
+config PTSHARE_PTE
+	bool "Bottom level table (PTE)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PMD
+	bool "Middle level table (PMD)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PUD
+	bool "Upper level table (PUD)"
+	depends on PTSHARE
+	default n
+
+endmenu
+
+config PTSHARE_HUGEPAGE
+	bool
+	depends on PTSHARE && PTSHARE_PMD
+	default y
+
 config ARCH_DISCONTIGMEM_ENABLE
        bool
        depends on NUMA
--- 2.6.13/./arch/x86_64/mm/fault.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./arch/x86_64/mm/fault.c	2005-08-29 10:02:47.000000000 -0500
@@ -153,7 +153,7 @@ void dump_pagetable(unsigned long addres
 	if (bad_address(pgd)) goto bad;
 	if (!pgd_present(*pgd)) goto ret; 
 
-	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+	pud = __pud_offset_k((pud_t *)pgd_page_kernel(*pgd), address);
 	if (bad_address(pud)) goto bad;
 	printk("PUD %lx ", pud_val(*pud));
 	if (!pud_present(*pud))	goto ret;
@@ -259,7 +259,7 @@ static int vmalloc_fault(unsigned long a
 	pud_ref = pud_offset(pgd_ref, address);
 	if (pud_none(*pud_ref))
 		return -1;
-	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+	if (pud_none(*pud) || pud_page_kernel(*pud) != pud_page_kernel(*pud_ref))
 		BUG();
 	pmd = pmd_offset(pud, address);
 	pmd_ref = pmd_offset(pud_ref, address);
--- 2.6.13/./include/asm-x86_64/pgtable.h	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./include/asm-x86_64/pgtable.h	2005-08-29 10:02:47.000000000 -0500
@@ -100,9 +100,6 @@ extern inline void pgd_clear (pgd_t * pg
 	set_pgd(pgd, __pgd(0));
 }
 
-#define pud_page(pud) \
-((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
-
 #define ptep_get_and_clear(mm,addr,xp)	__pte(xchg(&(xp)->pte, 0))
 #define pte_same(a, b)		((a).pte == (b).pte)
 
@@ -309,7 +306,8 @@ static inline int pmd_large(pmd_t pte) {
 /*
  * Level 4 access.
  */
-#define pgd_page(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
+#define pgd_page_kernel(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
+#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
@@ -317,9 +315,11 @@ static inline int pmd_large(pmd_t pte) {
 #define mk_kernel_pgd(address) ((pgd_t){ (address) | _KERNPG_TABLE })
 
 /* PUD - Level3 access */
+#define pud_page_kernel(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud)		(pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
 /* to find an entry in a page-table-directory. */
 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
+#define pud_offset(pgd, address) ((pud_t *) pgd_page_kernel(*(pgd)) + pud_index(address))
 #define pud_offset_k(pgd, addr) pud_offset(pgd, addr)
 #define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
 
@@ -333,7 +333,7 @@ static inline pud_t *__pud_offset_k(pud_
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
+#define pmd_offset(dir, address) ((pmd_t *) pud_page_kernel(*(dir)) + \
 			pmd_index(address))
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
--- 2.6.13/./include/linux/ptshare.h	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.13-shpt/./include/linux/ptshare.h	2005-08-29 10:02:47.000000000 -0500
@@ -0,0 +1,274 @@
+#ifndef _LINUX_PTSHARE_H
+#define _LINUX_PTSHARE_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <dmccr@us.ibm.com>
+ */
+
+#ifdef CONFIG_PTSHARE
+static inline int pt_is_shared(struct page *page)
+{
+	return (page_mapcount(page) > 1);
+}
+
+static inline void pt_increment_share(struct page *page)
+{
+	atomic_inc(&page->_mapcount);
+}
+
+static inline void pt_decrement_share(struct page *page)
+{
+	atomic_dec(&page->_mapcount);
+}
+
+static inline int
+pt_shareable_vma(struct vm_area_struct *vma)
+{
+	/* We can't share anonymous memory */
+	if (!vma->vm_file)
+		return 0;
+
+	/* No sharing of nonlinear areas */
+	if (vma->vm_flags & VM_NONLINEAR)
+		return 0;
+
+	/* Only share shared mappings or read-only mappings */
+	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) == VM_WRITE)
+		return 0;
+
+	/* If it's smaller than the smallest shareable unit, don't bother
+	   calling it shareable */
+	if ((vma->vm_end - vma->vm_start) < PMD_SIZE)
+		return 0;
+
+	return 1;
+}
+extern void pt_unshare_range(struct mm_struct *mm,
+			     unsigned long address,
+			     unsigned long end);
+#else /* CONFIG_PTSHARE */
+#define	pt_is_shared(page)	(0)
+#define	pt_increment_share(page)
+#define	pt_decrement_share(page)
+#define pt_shareable_vma(vma)	(0)
+#define	pt_unshare_range(mm, address, end)
+#endif /* CONFIG_PTSHARE */
+
+#ifdef CONFIG_PTSHARE_PTE
+static inline int
+pt_is_shared_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	return pt_is_shared(page);
+}
+
+static inline void
+pt_increment_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	pt_increment_share(page);
+	return;
+}
+
+static inline void
+pt_decrement_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	pt_decrement_share(page);
+	return;
+}
+
+static inline int
+pt_shareable_pte(struct vm_area_struct *vma,
+		 unsigned long address)
+{
+	unsigned long base = address & PMD_MASK;
+	unsigned long end = base + (PMD_SIZE-1);
+
+		if ((vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+extern pte_t * pt_share_pte(struct vm_area_struct *vma,
+			    unsigned long address,
+			    pmd_t *pmd,
+			    struct address_space *mapping);
+extern void pt_copy_pte(struct mm_struct *mm,
+			pmd_t *dst_pmd,
+			pmd_t *src_pmd);
+#else /* CONFIG_PTSHARE_PTE */
+static inline int
+pt_is_shared_pte(pmd_t pmdval)
+{
+	return 0;
+}
+#define	pt_increment_pte(pmdval)
+#define	pt_decrement_pte(pmdval)
+#define	pt_copy_pte(mm, dst_pmd, src_pmd)
+#define	pt_shareable_pte(vma, address) (0)
+#define	pt_share_pte(vma, address, pmd, mapping) pte_alloc_map(vma->vm_mm, pmd, address)
+#endif /* CONFIG_PTSHARE_PTE */
+
+#ifdef CONFIG_PTSHARE_PMD
+static inline int
+pt_is_shared_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	return pt_is_shared(page);
+}
+
+static inline void
+pt_increment_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	pt_increment_share(page);
+	return;
+}
+
+static inline void
+pt_decrement_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	pt_decrement_share(page);
+	return;
+}
+
+static inline int
+pt_shareable_pmd(struct vm_area_struct *vma,
+		 unsigned long address)
+{
+	unsigned long base = address & PUD_MASK;
+	unsigned long end = base + (PUD_SIZE-1);
+
+		if ((vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+extern pmd_t * pt_share_pmd(struct vm_area_struct *vma,
+			    unsigned long address,
+			    pud_t *pud,
+			    struct address_space *mapping);
+extern void pt_copy_pmd(struct mm_struct *mm,
+			pud_t *dst_pud,
+			pud_t *src_pud);
+#else /* CONFIG_PTSHARE_PMD */
+static inline int
+pt_is_shared_pmd(pud_t pudval)
+{
+	return 0;
+}
+#define	pt_increment_pmd(pudval)
+#define	pt_decrement_pmd(pudval)
+#define	pt_copy_pmd(mm, dst_pud, src_pud)
+#define	pt_shareable_pmd(vma, address) (0)
+#define	pt_share_pmd(vma, address, pud, mapping) pmd_alloc(vma->vm_mm, pud, address)
+#endif /* CONFIG_PTSHARE_PMD */
+
+#ifdef CONFIG_PTSHARE_PUD
+static inline int
+pt_is_shared_pud(pgd_t pgdval)
+{
+	struct page *page;
+
+	page = pgd_page(pgdval);
+	return pt_is_shared(page);
+}
+
+static inline void
+pt_increment_pud(pgd_t pgdval)
+{
+	struct page *page;
+
+	page = pgd_page(pgdval);
+	pt_increment_share(page);
+	return;
+}
+
+static inline void
+pt_decrement_pud(pgd_t pgdval)
+{
+	struct page *page;
+
+	page = pgd_page(pgdval);
+	pt_decrement_share(page);
+	return;
+}
+
+static inline int
+pt_shareable_pud(struct vm_area_struct *vma,
+		 unsigned long address)
+{
+	unsigned long base = address & PGDIR_MASK;
+	unsigned long end = base + (PGDIR_SIZE-1);
+
+		if ((vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+extern pud_t * pt_share_pud(struct vm_area_struct *vma,
+			    unsigned long address,
+			    pgd_t *pgd,
+			    struct address_space *mapping);
+extern void pt_copy_pud(struct mm_struct *mm,
+			pgd_t *dst_pgd,
+			pgd_t *src_pgd);
+#else /* CONFIG_PTSHARE_PUD */
+static inline int
+pt_is_shared_pud(pgd_t pgdval)
+{
+	return 0;
+}
+#define	pt_increment_pud(pgdval)
+#define	pt_decrement_pud(pgdval)
+#define	pt_copy_pud(mm, dst_pgd, src_pgd)
+#define	pt_shareable_pud(vma, address) (0)
+#define	pt_share_pud(vma, address, pgd, mapping) pud_alloc(vma->vm_mm, pgd, address)
+#endif /* CONFIG_PTSHARE_PUD */
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+extern pte_t *pt_share_hugepage(struct mm_struct *mm,
+			       struct vm_area_struct *vma,
+			       unsigned long address);
+extern void pt_unshare_huge_range(struct mm_struct *mm,
+				  unsigned long address,
+				  unsigned long end);
+#else
+#define	pt_share_hugepage(mm, vma, address)	huge_pte_alloc(mm, address)
+#define	pt_unshare_huge_range(mm, address, end)
+#endif	/* CONFIG_PTSHARE_HUGEPAGE */
+
+#endif /* _LINUX_PTSHARE_H */
--- 2.6.13/./mm/Makefile	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/Makefile	2005-08-29 10:02:47.000000000 -0500
@@ -18,5 +18,6 @@ obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_PTSHARE) += ptshare.o
 
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
--- 2.6.13/./mm/fremap.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/fremap.c	2005-08-29 10:02:47.000000000 -0500
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -226,6 +227,9 @@ asmlinkage long sys_remap_file_pages(uns
 				has_write_lock = 1;
 				goto retry;
 			}
+			if (pt_shareable_vma(vma))
+				pt_unshare_range(vma->vm_mm, vma->vm_start, vma->vm_end);
+
 			mapping = vma->vm_file->f_mapping;
 			spin_lock(&mapping->i_mmap_lock);
 			flush_dcache_mmap_lock(mapping);
--- 2.6.13/./mm/hugetlb.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/hugetlb.c	2005-08-29 10:02:47.000000000 -0500
@@ -11,6 +11,7 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/ptshare.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -278,7 +279,7 @@ int copy_hugetlb_page_range(struct mm_st
 	unsigned long end = vma->vm_end;
 
 	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
+		dst_pte = pt_share_hugepage(dst, vma, addr);
 		if (!dst_pte)
 			goto nomem;
 		src_pte = huge_pte_offset(src, addr);
@@ -309,6 +310,7 @@ void unmap_hugepage_range(struct vm_area
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	pt_unshare_huge_range(mm, start, end);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (! ptep)
@@ -353,7 +355,7 @@ int hugetlb_prefault(struct address_spac
 	spin_lock(&mm->page_table_lock);
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
+		pte_t *pte = pt_share_hugepage(mm, vma, addr);
 		struct page *page;
 
 		if (!pte) {
--- 2.6.13/./mm/memory.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/memory.c	2005-08-30 13:14:46.000000000 -0500
@@ -48,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ptshare.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -113,10 +114,16 @@ void pmd_clear_bad(pmd_t *pmd)
 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = pmd_page(*pmd);
-	pmd_clear(pmd);
-	pte_free_tlb(tlb, page);
-	dec_page_state(nr_page_table_pages);
-	tlb->mm->nr_ptes--;
+	pmd_t pmdval= *pmd;
+	int share;
+
+	share = pt_is_shared_pte(pmdval);
+  	pmd_clear(pmd);
+	pt_decrement_pte(pmdval);
+	if (!share) {
+		pte_free_tlb(tlb, page);
+		dec_page_state(nr_page_table_pages);
+	}
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -124,17 +131,22 @@ static inline void free_pmd_range(struct
 				unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
-	unsigned long next;
-	unsigned long start;
-
-	start = addr;
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		free_pte_range(tlb, pmd);
-	} while (pmd++, addr = next, addr != end);
+	pud_t pudval = *pud;
+  	unsigned long next;
+  	unsigned long start;
+	int share;
+  
+	share = pt_is_shared_pmd(pudval);
+  	start = addr;
+  	pmd = pmd_offset(pud, addr);
+	if (!share) {
+		do {
+			next = pmd_addr_end(addr, end);
+			if (pmd_none_or_clear_bad(pmd))
+				continue;
+			free_pte_range(tlb, pmd);
+		} while (pmd++, addr = next, addr != end);
+	}
 
 	start &= PUD_MASK;
 	if (start < floor)
@@ -149,7 +161,10 @@ static inline void free_pmd_range(struct
 
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd);
+	pt_decrement_pmd(pudval);
+	if (!share)
+		pmd_free_tlb(tlb, pmd);
+
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -157,17 +172,22 @@ static inline void free_pud_range(struct
 				unsigned long floor, unsigned long ceiling)
 {
 	pud_t *pud;
+	pgd_t pgdval = *pgd;
 	unsigned long next;
 	unsigned long start;
+	int share;
 
+	share = pt_is_shared_pud(pgdval);
 	start = addr;
 	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
-	} while (pud++, addr = next, addr != end);
+	if (!share) {
+		do {
+			next = pud_addr_end(addr, end);
+			if (pud_none_or_clear_bad(pud))
+				continue;
+			free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		} while (pud++, addr = next, addr != end);
+	}
 
 	start &= PGDIR_MASK;
 	if (start < floor)
@@ -182,7 +202,10 @@ static inline void free_pud_range(struct
 
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
-	pud_free_tlb(tlb, pud);
+	pt_decrement_pud(pgdval);
+	if (!share)
+		pud_free_tlb(tlb, pud);
+
 }
 
 /*
@@ -299,9 +322,13 @@ pte_t fastcall *pte_alloc_map(struct mm_
 			pte_free(new);
 			goto out;
 		}
+#if 0
 		mm->nr_ptes++;
+#endif
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
+
+		pt_increment_pte(*pmd);
 	}
 out:
 	return pte_offset_map(pmd, address);
@@ -327,6 +354,8 @@ pte_t fastcall * pte_alloc_kernel(struct
 			goto out;
 		}
 		pmd_populate_kernel(mm, pmd, new);
+
+		pt_increment_pte(*pmd);
 	}
 out:
 	return pte_offset_kernel(pmd, address);
@@ -448,7 +477,7 @@ again:
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end)
+		unsigned long addr, unsigned long end, int shareable)
 {
 	pmd_t *src_pmd, *dst_pmd;
 	unsigned long next;
@@ -461,16 +490,20 @@ static inline int copy_pmd_range(struct 
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
-		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-						vma, addr, next))
-			return -ENOMEM;
+		if (shareable && pt_shareable_pte(vma, addr)) {
+			pt_copy_pte(dst_mm, dst_pmd, src_pmd);
+		} else {
+			if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+					   vma, addr, next))
+				return -ENOMEM;
+		}
 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end)
+		unsigned long addr, unsigned long end, int shareable)
 {
 	pud_t *src_pud, *dst_pud;
 	unsigned long next;
@@ -483,9 +516,13 @@ static inline int copy_pud_range(struct 
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
-		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
-						vma, addr, next))
-			return -ENOMEM;
+		if (shareable && pt_shareable_pmd(vma, addr)) {
+			pt_copy_pmd(dst_mm, dst_pud, src_pud);
+		} else {
+			if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+					   vma, addr, next, shareable))
+				return -ENOMEM;
+		}
 	} while (dst_pud++, src_pud++, addr = next, addr != end);
 	return 0;
 }
@@ -497,19 +534,26 @@ int copy_page_range(struct mm_struct *ds
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int shareable;
 
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	shareable = pt_shareable_vma(vma);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
-		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-						vma, addr, next))
-			return -ENOMEM;
+		if (shareable && pt_shareable_pud(vma, addr)) {
+			pt_copy_pud(dst_mm, dst_pgd, src_pgd);
+		} else {
+			if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+					   vma, addr, next, shareable))
+				return -ENOMEM;
+		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -520,6 +564,12 @@ static void zap_pte_range(struct mmu_gat
 {
 	pte_t *pte;
 
+	if (pt_is_shared_pte(*pmd)) {
+		pt_decrement_pte(*pmd);
+		pmd_clear(pmd);
+		return;
+	}
+
 	pte = pte_offset_map(pmd, addr);
 	do {
 		pte_t ptent = *pte;
@@ -591,6 +641,12 @@ static inline void zap_pmd_range(struct 
 	pmd_t *pmd;
 	unsigned long next;
 
+	if (pt_is_shared_pmd(*pud)) {
+		pt_decrement_pmd(*pud);
+		pud_clear(pud);
+		return;
+	}
+
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
@@ -607,6 +663,12 @@ static inline void zap_pud_range(struct 
 	pud_t *pud;
 	unsigned long next;
 
+	if (pt_is_shared_pud(*pgd)) {
+		pt_decrement_pud(*pgd);
+		pgd_clear(pgd);
+		return;
+	}
+
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
@@ -2028,6 +2090,7 @@ int __handle_mm_fault(struct mm_struct *
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	struct address_space *mapping = NULL;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -2036,6 +2099,9 @@ int __handle_mm_fault(struct mm_struct *
 	if (is_vm_hugetlb_page(vma))
 		return VM_FAULT_SIGBUS;	/* mapping truncation does this. */
 
+	if (pt_shareable_vma(vma))
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+
 	/*
 	 * We need the page table lock to synchronize with kswapd
 	 * and the SMP-safe atomic PTE updates.
@@ -2043,18 +2109,18 @@ int __handle_mm_fault(struct mm_struct *
 	pgd = pgd_offset(mm, address);
 	spin_lock(&mm->page_table_lock);
 
-	pud = pud_alloc(mm, pgd, address);
+	pud = pt_share_pud(vma, address, pgd, mapping);
 	if (!pud)
 		goto oom;
 
-	pmd = pmd_alloc(mm, pud, address);
+	pmd = pt_share_pmd(vma, address, pud, mapping);
 	if (!pmd)
 		goto oom;
 
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pt_share_pte(vma, address, pmd, mapping);
 	if (!pte)
 		goto oom;
-
+
 	return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
 
  oom:
@@ -2088,6 +2154,8 @@ pud_t fastcall *__pud_alloc(struct mm_st
 		goto out;
 	}
 	pgd_populate(mm, pgd, new);
+
+	pt_increment_pud(*pgd);
  out:
 	return pud_offset(pgd, address);
 }
@@ -2128,6 +2196,8 @@ pmd_t fastcall *__pmd_alloc(struct mm_st
 	pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 
+	pt_increment_pmd(*pud);
+
  out:
 	return pmd_offset(pud, address);
 }
--- 2.6.13/./mm/mmap.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/mmap.c	2005-08-30 13:33:54.000000000 -0500
@@ -1969,7 +1969,6 @@ void exit_mmap(struct mm_struct *mm)
 		vma = next;
 	}
 
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
--- 2.6.13/./mm/mprotect.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/mprotect.c	2005-08-29 10:02:47.000000000 -0500
@@ -19,6 +19,7 @@
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -116,6 +117,9 @@ mprotect_fixup(struct vm_area_struct *vm
 		return 0;
 	}
 
+	if (pt_shareable_vma(vma))
+		pt_unshare_range(vma->vm_mm, start, end);
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
--- 2.6.13/./mm/mremap.c	2005-08-28 18:41:01.000000000 -0500
+++ 2.6.13-shpt/./mm/mremap.c	2005-08-29 10:02:47.000000000 -0500
@@ -17,6 +17,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -163,6 +164,9 @@ static unsigned long move_page_tables(st
 
 	flush_cache_range(vma, old_addr, old_addr + len);
 
+	if (pt_shareable_vma(vma))
+		pt_unshare_range(vma->vm_mm, old_addr, old_addr + len);
+
 	/*
 	 * This is not the clever way to do this, but we're taking the
 	 * easy way out on the assumption that most remappings will be
--- 2.6.13/./mm/ptshare.c	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.13-shpt/./mm/ptshare.c	2005-08-29 10:02:47.000000000 -0500
@@ -0,0 +1,366 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <dmccr@us.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/prio_tree.h>
+#include <linux/mm.h>
+#include <linux/ptshare.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+#undef	PT_DEBUG
+
+#ifndef __PAGETABLE_PMD_FOLDED
+static void
+pt_unshare_pmd(pud_t *pud,
+	       unsigned long address,
+	       unsigned long end)
+{
+	pmd_t *pmd;
+	struct page *page;
+
+	pmd = pmd_offset(pud, address);
+	end = pud_addr_end(address, end);
+	while (address <= end) {
+		if (pmd_present(*pmd)) {
+			page = pmd_page(*pmd);
+			if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Unsharing pte page at address 0x%lx\n",
+				       address);
+#endif
+				pt_decrement_share(page);
+				pmd_clear(pmd);
+			}
+		}
+		pmd++;
+		address += PMD_SIZE;
+	}
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void
+pt_unshare_pud(pgd_t *pgd,
+	       unsigned long address,
+	       unsigned long end,
+	       int hugepage)
+{
+	pud_t *pud;
+	struct page *page;
+
+	pud = pud_offset(pgd, address);
+	end = pgd_addr_end(address, end);
+	while (address <= end) {
+		if (pud_present(*pud)) {
+			page = pud_page(*pud);
+			if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Unsharing pmd page at address 0x%lx\n",
+				       address);
+#endif
+				pt_decrement_share(page);
+				pud_clear(pud);
+			} else if (!hugepage) {
+				pt_unshare_pmd(pud, address, end);
+			}
+		}
+		pud++;
+		address += PUD_SIZE;
+	}
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+static void
+pt_unshare_pgd(struct mm_struct *mm,
+		 unsigned long address,
+		 unsigned long end,
+		 int hugepage)
+{
+	pgd_t *pgd;
+	struct page *page;
+
+	pgd = pgd_offset(mm, address);
+
+	while (address <= end) {
+		if (pgd_present(*pgd)) {
+			page = pgd_page(*pgd);
+			if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Unsharing pud page at address 0x%lx\n",
+				       address);
+#endif
+				pt_decrement_share(page);
+				pgd_clear(pgd);
+#ifndef __PAGETABLE_PMD_FOLDED
+			} else {
+#ifndef __PAGETABLE_PUD_FOLDED
+				pt_unshare_pud(pgd, address, end, hugepage);
+#else /* __PAGETABLE_PUD_FOLDED */
+				if (!hugepage)
+					pt_unshare_pmd((pud_t *)pgd, address, end);
+#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* __PAGETABLE_PMD_FOLDED */
+			}
+		}
+		pgd++;
+		address += PGDIR_SIZE;
+	}
+}
+
+void
+pt_unshare_range(struct mm_struct *mm,
+		 unsigned long address,
+		 unsigned long end)
+{
+	pt_unshare_pgd(mm, address, end, 0);
+}
+
+static struct vm_area_struct *
+next_shareable_vma(struct vm_area_struct *vma,
+		   struct vm_area_struct *svma,
+		   struct prio_tree_iter *iter)
+{
+	while ((svma = vma_prio_tree_next(svma, iter))) {
+		if ((svma != vma) &&
+		    (vma->vm_flags == svma->vm_flags) &&
+		    (vma->vm_start == svma->vm_start) &&
+		    (vma->vm_end == svma->vm_end) &&
+		    (vma->vm_pgoff == svma->vm_pgoff))
+			break;
+	}
+	return svma;
+}
+
+#ifdef CONFIG_PTSHARE_PTE
+pte_t *
+pt_share_pte(struct vm_area_struct *vma,
+	     unsigned long address,
+	     pmd_t *pmd,
+	     struct address_space *mapping)
+{
+	struct prio_tree_iter iter;
+	struct page *page;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd;
+	pud_t *spud;
+	pmd_t *spmd;
+	pte_t *pte;
+
+	if (pmd_none(*pmd) &&
+	    mapping &&
+	    pt_shareable_pte(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pte page at address 0x%lx\n",
+		       address);
+#endif
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_start, vma->vm_end);
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			if (pgd_none(*spgd))
+				continue;
+
+			spud = pud_offset(spgd, address);
+			if (pud_none(*spud))
+				continue;
+
+			spmd = pmd_offset(spud, address);
+			if (pmd_none(*spmd))
+				continue;
+
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Sharing pte page at address 0x%lx\n",
+			       address);
+#endif
+			page = pmd_page(*spmd);
+			pt_increment_share(page);
+			pmd_populate(vma->vm_mm, pmd, page);
+		}
+	}
+	pte = pte_alloc_map(vma->vm_mm, pmd, address);
+
+	return pte;
+}
+
+void
+pt_copy_pte(struct mm_struct *mm,
+	    pmd_t *dst_pmd,
+	    pmd_t *src_pmd)
+{
+	struct page *page;
+
+	page = pmd_page(*src_pmd);
+	pmd_populate(mm, dst_pmd, page);
+	pt_increment_share(page);
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_PMD
+pmd_t *
+pt_share_pmd(struct vm_area_struct *vma,
+	     unsigned long address,
+	     pud_t *pud,
+	     struct address_space *mapping)
+{
+	struct prio_tree_iter iter;
+	struct page *page;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd;
+	pud_t *spud;
+	pmd_t *pmd;
+
+	if (pud_none(*pud) &&
+	    mapping && 
+	    pt_shareable_pmd(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pmd page at address 0x%lx\n",
+		       address);
+#endif
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_start, vma->vm_end);
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			if (pgd_none(*spgd))
+				continue;
+
+			spud = pud_offset(spgd, address);
+			if (pud_none(*spud))
+				continue;
+
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Sharing pmd page at address 0x%lx\n",
+			       address);
+#endif
+			page = pud_page(*spud);
+			pt_increment_share(page);
+			pud_populate(vma->vm_mm, pud, page);
+		}
+	}
+	pmd = pmd_alloc(vma->vm_mm, pud, address);
+
+	return pmd;
+}
+
+void
+pt_copy_pmd(struct mm_struct *mm,
+	    pud_t *dst_pud,
+	    pud_t *src_pud)
+{
+	struct page *page;
+
+	page = pud_page(*src_pud);
+	pud_populate(mm, dst_pud, page);
+	pt_increment_share(page);
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_PUD
+pud_t *
+pt_share_pud(struct vm_area_struct *vma,
+	     unsigned long address,
+	     pgd_t *pgd,
+	     struct address_space *mapping)
+{
+	struct prio_tree_iter iter;
+	struct page *page;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd;
+	pud_t *pud;
+
+	if (pgd_none(*pgd) &&
+	    mapping && 
+	    pt_shareable_pud(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pud page at address 0x%lx\n",
+		       address);
+#endif
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_start, vma->vm_end);
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			if (pgd_none(*spgd))
+				continue;
+
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Sharing pud page at address 0x%lx\n",
+			       address);
+#endif
+			page = pgd_page(*spgd);
+			pt_increment_share(page);
+			pgd_populate(vma->vm_mm, pgd, page);
+		}
+	}
+	pud = pud_alloc(vma->vm_mm, pgd, address);
+
+	return pud;
+}
+
+void
+pt_copy_pud(struct mm_struct *mm,
+	    pgd_t *dst_pgd,
+	    pgd_t *src_pgd)
+{
+	struct page *page;
+
+	page = pgd_page(*src_pgd);
+	pgd_populate(mm, dst_pgd, page);
+	pt_increment_share(page);
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+
+void
+pt_unshare_huge_range(struct mm_struct *mm,
+		      unsigned long address,
+		      unsigned long end)
+{
+	pt_unshare_pgd(mm, address, end, 1);
+}
+
+pte_t *
+pt_share_hugepage(struct mm_struct *mm,
+		  struct vm_area_struct *vma,
+		  unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pte_t *pte;
+	struct address_space *mapping = NULL;
+
+	if (pt_shareable_vma(vma))
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+
+	pgd = pgd_offset(mm, address);
+
+	pud = pt_share_pud(vma, address, pgd, mapping);
+	if (!pud)
+		return NULL;
+
+	pte = (pte_t *)pt_share_pmd(vma, address, pud, mapping);
+
+	return pte;
+}
+#endif


* Re: [PATCH 1/1] Implement shared page tables
  2005-08-30 22:13 [PATCH 1/1] Implement shared page tables Dave McCracken
@ 2005-08-31 11:44 ` Hugh Dickins
  2005-08-31 11:51   ` Arjan van de Ven
  2005-08-31 16:40   ` Dave McCracken
  2005-09-02  1:58 ` Chen, Kenneth W
  2005-09-02  4:26 ` Chen, Kenneth W
  2 siblings, 2 replies; 12+ messages in thread
From: Hugh Dickins @ 2005-08-31 11:44 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management

On Tue, 30 Aug 2005, Dave McCracken wrote:
> 
> This patch implements page table sharing for all shared memory regions that
> span an entire page table page.  It supports sharing at multiple page
> levels, depending on the architecture.
> 
> Performance testing has shown no degradation with this patch for tests with
> small processes.  Preliminary tests with large benchmarks have shown as
> much as 3% improvement in overall results.

Hmm.  A few points.

> The patch is against 2.6.13.

So you don't have Nick's test at the start of copy_page_range():
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
		if (!vma->anon_vma)
			return 0;
	}
Experimental, yes, but Linus likes it enough to have fast-tracked it into
his tree for 2.6.14.  My guess is that that patch (if its downsides prove
manageable) takes away a lot of the point of shared page tables -
I wonder how much of your "3% improvement" it would take away.

I was going to say, doesn't randomize_va_space take away the rest of
the point?  But no, it appears "randomize_va_space", as it currently
appears in mainline anyway, is somewhat an exaggeration: it just shifts
the stack a little, with no effect on the rest of the va space.
But if it is to do more later, it may conflict with your interest.

The pud sharing and pmd sharing: perhaps they complicate the patch for
negligible benefit?

> +		if ((vma->vm_start <= base) &&
> +	    (vma->vm_end >= end))
> +		return 1;
> 
New Adventures in Coding Style ;)

But most seriously: search the patch for the string "lock" and I find
no change whatever to locking.  You're introducing page tables shared
between different mms yet relying on the old mm->page_table_lock?
You're searching a prio_tree for suitable matches to share, but
taking no lock on that?  You're counting shares in an atomic,
but not detecting when the count falls to 0 atomically?
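
(For that last point, the usual idiom is an atomic_dec_and_test() so that
exactly one task observes the count reaching zero and frees the page table
page.  A sketch only, with "pt_share_count" as a made-up dedicated field
rather than the _mapcount reuse in the patch:

	static inline int pt_decrement_share(struct page *page)
	{
		/* nonzero only for the caller that drops the final share */
		return atomic_dec_and_test(&page->pt_share_count);
	}

	/* e.g. in free_pte_range(): */
	if (pt_decrement_share(page)) {
		pte_free_tlb(tlb, page);
		dec_page_state(nr_page_table_pages);
	}

as opposed to testing page_mapcount() first and decrementing afterwards.)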

And allied with that point on locking mms: there's no change to rmap.c,
so how is its TLB flushing and cache flushing now supposed to work?
page_referenced_one and try_to_unmap_one will visit all the vmas
sharing the page table, yes, but (usually) only the first will
satisfy the conditions and get flushed.

I'm not sure if it's worth pursuing shared page tables again or not.

You certainly need to sort the locking out to do so.  Wait a couple
of weeks and I should have sent all the per-page-table-page locking
in to -mm (to replace the pte xchging currently there): that should
give what you need for locking pts independent of the mm.

Hugh


* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 11:44 ` Hugh Dickins
@ 2005-08-31 11:51   ` Arjan van de Ven
  2005-08-31 13:42     ` Hugh Dickins
  2005-08-31 16:40   ` Dave McCracken
  1 sibling, 1 reply; 12+ messages in thread
From: Arjan van de Ven @ 2005-08-31 11:51 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Dave McCracken, Andrew Morton, Linux Kernel, Linux Memory Management

On Wed, 2005-08-31 at 12:44 +0100, Hugh Dickins wrote:
> I was going to say, doesn't randomize_va_space take away the rest of
> the point?  But no, it appears "randomize_va_space", as it currently
> appears in mainline anyway, is somewhat an exaggeration: it just shifts
> the stack a little, with no effect on the rest of the va space.

it also randomizes mmaps




* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 11:51   ` Arjan van de Ven
@ 2005-08-31 13:42     ` Hugh Dickins
  2005-08-31 14:31       ` Martin J. Bligh
  0 siblings, 1 reply; 12+ messages in thread
From: Hugh Dickins @ 2005-08-31 13:42 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Dave McCracken, Andrew Morton, Linux Kernel, Linux Memory Management

On Wed, 31 Aug 2005, Arjan van de Ven wrote:
> On Wed, 2005-08-31 at 12:44 +0100, Hugh Dickins wrote:
> > I was going to say, doesn't randomize_va_space take away the rest of
> > the point?  But no, it appears "randomize_va_space", as it currently
> > appears in mainline anyway, is somewhat an exaggeration: it just shifts
> > the stack a little, with no effect on the rest of the va space.
> 
> it also randomizes mmaps

Ah, via PF_RANDOMIZE, yes, thanks: so long as certain conditions are
fulfilled - and my RLIM_INFINITY RLIMIT_STACK has been preventing it.

And mmaps include shmats: so unless the process specifies non-NULL
shmaddr to attach at, it'll choose a randomized address for that too
(subject to those various conditions).

Which is indeed a further disincentive against shared page tables.

Hugh


* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 13:42     ` Hugh Dickins
@ 2005-08-31 14:31       ` Martin J. Bligh
  2005-08-31 14:41         ` Arjan van de Ven
  2005-08-31 15:06         ` Hugh Dickins
  0 siblings, 2 replies; 12+ messages in thread
From: Martin J. Bligh @ 2005-08-31 14:31 UTC (permalink / raw)
  To: Hugh Dickins, Arjan van de Ven
  Cc: Dave McCracken, Andrew Morton, Linux Kernel, Linux Memory Management

--Hugh Dickins <hugh@veritas.com> wrote (on Wednesday, August 31, 2005 14:42:38 +0100):

> On Wed, 31 Aug 2005, Arjan van de Ven wrote:
>> On Wed, 2005-08-31 at 12:44 +0100, Hugh Dickins wrote:
>> > I was going to say, doesn't randomize_va_space take away the rest of
>> > the point?  But no, it appears "randomize_va_space", as it currently
>> > appears in mainline anyway, is somewhat an exaggeration: it just shifts
>> > the stack a little, with no effect on the rest of the va space.
>> 
>> it also randomizes mmaps
> 
> Ah, via PF_RANDOMIZE, yes, thanks: so long as certain conditions are
> fulfilled - and my RLIM_INFINITY RLIMIT_STACK has been preventing it.
> 
> And mmaps include shmats: so unless the process specifies non-NULL
> shmaddr to attach at, it'll choose a randomized address for that too
> (subject to those various conditions).
> 
> Which is indeed a further disincentive against shared page tables.

Or shared pagetables a disincentive to randomizing the mmap space ;-)
They're incompatible, but you could be left to choose one or the other
via config option.

3% on "a certain industry-standard database benchmark" (cough) is huge,
and we expect the benefit for PPC64 will be larger as we can share the
underlying hardware PTEs without TLB flushing as well.

M.



* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 14:31       ` Martin J. Bligh
@ 2005-08-31 14:41         ` Arjan van de Ven
  2005-08-31 15:06         ` Hugh Dickins
  1 sibling, 0 replies; 12+ messages in thread
From: Arjan van de Ven @ 2005-08-31 14:41 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Hugh Dickins, Dave McCracken, Andrew Morton, Linux Kernel,
	Linux Memory Management


> > Which is indeed a further disincentive against shared page tables.
> 
> Or shared pagetables a disincentive to randomizing the mmap space ;-)
> They're incompatible, but you could be left to choose one or the other
> via config option.
> 
> 3% on "a certain industry-standard database benchmark" (cough) is huge,
> and we expect the benefit for PPC64 will be larger as we can share the
> underlying hardware PTEs without TLB flushing as well.
> 

surely the benchmark people know that the database in question always
mmaps the shared area at the address where the first process mapped it?
(if not, they could make it so ;)





* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 14:31       ` Martin J. Bligh
  2005-08-31 14:41         ` Arjan van de Ven
@ 2005-08-31 15:06         ` Hugh Dickins
  2005-08-31 15:39           ` Martin J. Bligh
  1 sibling, 1 reply; 12+ messages in thread
From: Hugh Dickins @ 2005-08-31 15:06 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Arjan van de Ven, Dave McCracken, Andrew Morton, Linux Kernel,
	Linux Memory Management

On Wed, 31 Aug 2005, Martin J. Bligh wrote:
> --Hugh Dickins <hugh@veritas.com> wrote (on Wednesday, August 31, 2005 14:42:38 +0100):
> > 
> > Which is indeed a further disincentive against shared page tables.
> 
> Or shared pagetables a disincentive to randomizing the mmap space ;-)

Fair point!

> They're incompatible, but you could be left to choose one or the other
> via config option.

Wouldn't need config option: there's /proc/sys/kernel/randomize_va_space
for the whole running system, compatibility check on the ELFs run, and
the infinite stack rlimit: enough ways to suppress randomization if it
doesn't suit you.

> 3% on "a certain industry-standard database benchmark" (cough) is huge,
> and we expect the benefit for PPC64 will be larger as we can share the
> underlying hardware PTEs without TLB flushing as well.

Okay - and you're implying that 3% comes from _using_ the shared page
tables, rather than from avoiding the fork/exit overhead of setting
them up and tearing them down.  And it can't use huge TLB pages
because...  fragmentation?

Hugh


* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 15:06         ` Hugh Dickins
@ 2005-08-31 15:39           ` Martin J. Bligh
  0 siblings, 0 replies; 12+ messages in thread
From: Martin J. Bligh @ 2005-08-31 15:39 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Arjan van de Ven, Dave McCracken, Andrew Morton, Linux Kernel,
	Linux Memory Management

>> They're incompatible, but you could be left to choose one or the other
>> via config option.
> 
> Wouldn't need config option: there's /proc/sys/kernel/randomize_va_space
> for the whole running system, compatibility check on the ELFs run, and
> the infinite stack rlimit: enough ways to suppress randomization if it
> doesn't suit you.

Even better - much easier to deal with distro stuff if we can do it at
runtime.
 
>> 3% on "a certain industry-standard database benchmark" (cough) is huge,
>> and we expect the benefit for PPC64 will be larger as we can share the
>> underlying hardware PTEs without TLB flushing as well.
> 
> Okay - and you're implying that 3% comes from _using_ the shared page
> tables, rather than from avoiding the fork/exit overhead of setting
> them up and tearing them down.  And it can't use huge TLB pages
> because...  fragmentation?

Yes - as I understand it, that was a straight measurement with/without the
patch, and the shmem segment was already using hugetlb (in both cases). 
Yes, I find it a bit odd as well - they are still trying
to get some detailed profiling to explain it.

M.



* Re: [PATCH 1/1] Implement shared page tables
  2005-08-31 11:44 ` Hugh Dickins
  2005-08-31 11:51   ` Arjan van de Ven
@ 2005-08-31 16:40   ` Dave McCracken
  1 sibling, 0 replies; 12+ messages in thread
From: Dave McCracken @ 2005-08-31 16:40 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management


--On Wednesday, August 31, 2005 12:44:24 +0100 Hugh Dickins
<hugh@veritas.com> wrote:

> So you don't have Nick's test at the start of copy_page_range():
> 	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
> 		if (!vma->anon_vma)
> 			return 0;
> 	}
> Experimental, yes, but Linus likes it enough to have fast-tracked it into
> his tree for 2.6.14.  My guess is that that patch (if its downsides prove
> manageable) takes away a lot of the point of shared page tables -
> I wonder how much of your "3% improvement".

Very little, actually.  The test does not create new processes as part of
the run.  The improvement is due to sharing of existing areas.

> I was going to say, doesn't randomize_va_space take away the rest of
> the point?  But no, it appears "randomize_va_space", as it currently
> appears in mainline anyway, is somewhat an exaggeration: it just shifts
> the stack a little, with no effect on the rest of the va space.
> But if it is to do more later, it may conflict with your interest.

I've been considering a future enhancement to my patch where it could share
page tables of any areas that share alignment, not just the same virtual
address.  That might allow sharing with randomization if the randomization
aligns things properly.
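
Roughly, the extra test would be something like the sketch below (not
implemented, and pt_alignment_compatible is just a name I'm using here):
two mappings of the same file could share a pte page as long as they put
the same file offsets at virtual addresses that are congruent modulo
PMD_SIZE.

	static inline int pt_alignment_compatible(struct vm_area_struct *vma,
						  struct vm_area_struct *svma)
	{
		unsigned long off  = vma->vm_start  - (vma->vm_pgoff  << PAGE_SHIFT);
		unsigned long soff = svma->vm_start - (svma->vm_pgoff << PAGE_SHIFT);

		/* the low PMD_SIZE-1 bits must match for the tables to line up */
		return ((off ^ soff) & ~PMD_MASK) == 0;
	}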

> The pud sharing and pmd sharing: perhaps they complicate the patch for
> negligible benefit?

The pmd sharing is necessary for ppc64 since it has to share at segment
size, plus it will be useful for very large regions.  I did pud for
completeness but you may be right that it's not useful.  It's all
configurable in any event.

>> +		if ((vma->vm_start <= base) &&
>> +	    (vma->vm_end >= end))
>> +		return 1;
>> 
> New Adventures in Coding Style ;)

New Adventures in Typos, actually :)  I'll fix.

> But most seriously: search the patch for the string "lock" and I find
> no change whatever to locking.  You're introducing page tables shared
> between different mms yet relying on the old mm->page_table_lock?
> You're searching a prio_tree for suitable matches to share, but
> taking no lock on that?  You're counting shares in an atomic,
> but not detecting when the count falls to 0 atomically?
> 
> And allied with that point on locking mms: there's no change to rmap.c,
> so how is its TLB flushing and cache flushing now supposed to work?
> page_referenced_one and try_to_unmap_one will visit all the vmas
> sharing the page table, yes, but (usually) only the first will
> satisfy the conditions and get flushed.

I'll go over the locking again.

> I'm not sure if it's worth pursuing shared page tables again or not.

The immediate clear benefits I see are a reduction in the number of page
table pages and a reduction in minor faults.  Keep in mind that faulting a
page into a shared page table makes it available to all other processes
sharing that area, eliminating the need for them to also take faults on it.

> You certainly need to sort the locking out to do so.  Wait a couple
> of weeks and I should have sent all the per-page-table-page locking
> in to -mm (to replace the pte xchging currently there): that should
> give what you need for locking pts independent of the mm.

I'll look things over in more detail.  I thought I had the locking issues
settled, but you raised some points I should revisit.

Dave McCracken



* RE: [PATCH 1/1] Implement shared page tables
  2005-08-30 22:13 [PATCH 1/1] Implement shared page tables Dave McCracken
  2005-08-31 11:44 ` Hugh Dickins
@ 2005-09-02  1:58 ` Chen, Kenneth W
  2005-09-02 16:40   ` Dave McCracken
  2005-09-02  4:26 ` Chen, Kenneth W
  2 siblings, 1 reply; 12+ messages in thread
From: Chen, Kenneth W @ 2005-09-02  1:58 UTC (permalink / raw)
  To: 'Dave McCracken', Andrew Morton
  Cc: Linux Kernel, Linux Memory Management

Dave McCracken wrote on Tuesday, August 30, 2005 3:13 PM
> This patch implements page table sharing for all shared memory regions that
> span an entire page table page.  It supports sharing at multiple page
> levels, depending on the architecture.
> 
> 
> This version of the patch supports i386 and x86_64.  I have additional
> patches to support ppc64, but they are not quite ready for public
> consumption.
> 
>  ....
> +		prio_tree_iter_init(&iter, &mapping->i_mmap,
> +				    vma->vm_start, vma->vm_end);


I think this is a bug.  The radix priority tree for address_space->i_mmap
is keyed on vma->vm_pgoff.  Your patch uses the vma virtual address to find
a shareable range, which will always fail to find a match even when there
is one.  The following is a quick hack I did to make it work.

- Ken

--- linux-2.6.13/mm/ptshare.c.orig	2005-09-01 18:58:12.299321918 -0700
+++ linux-2.6.13/mm/ptshare.c	2005-09-01 18:58:39.846196580 -0700
@@ -26,6 +26,11 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 
+#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
+#define VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define HEAP_INDEX(vma)	  ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
 #undef	PT_DEBUG
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -173,7 +178,7 @@ pt_share_pte(struct vm_area_struct *vma,
 		       address);
 #endif
 		prio_tree_iter_init(&iter, &mapping->i_mmap,
-				    vma->vm_start, vma->vm_end);
+				    RADIX_INDEX(vma), HEAP_INDEX(vma));
 
 		while ((svma = next_shareable_vma(vma, svma, &iter))) {
 			spgd = pgd_offset(svma->vm_mm, address);




* RE: [PATCH 1/1] Implement shared page tables
  2005-08-30 22:13 [PATCH 1/1] Implement shared page tables Dave McCracken
  2005-08-31 11:44 ` Hugh Dickins
  2005-09-02  1:58 ` Chen, Kenneth W
@ 2005-09-02  4:26 ` Chen, Kenneth W
  2 siblings, 0 replies; 12+ messages in thread
From: Chen, Kenneth W @ 2005-09-02  4:26 UTC (permalink / raw)
  To: 'Dave McCracken', Andrew Morton
  Cc: Linux Kernel, Linux Memory Management

Dave McCracken wrote on Tuesday, August 30, 2005 3:13 PM
> This patch implements page table sharing for all shared memory regions that
> span an entire page table page.  It supports sharing at multiple page
> levels, depending on the architecture.

In function pt_share_pte():

> +		while ((svma = next_shareable_vma(vma, svma, &iter))) {
> +			spgd = pgd_offset(svma->vm_mm, address);
> +			if (pgd_none(*spgd))
> +				continue;
> +
> +			spud = pud_offset(spgd, address);
> +			if (pud_none(*spud))
> +				continue;
> +
> +			spmd = pmd_offset(spud, address);
> +			if (pmd_none(*spmd))
> +				continue;
....
> +			page = pmd_page(*spmd);
> +			pt_increment_share(page);
> +			pmd_populate(vma->vm_mm, pmd, page);
> +		}


Do you really have to iterate through all the vmas?  Can't you just break
out of the while loop on the first successful match, after populating the
pmd?  I would think you will find them all to be the same pte page.  Or did
I miss something?


--- ./mm/ptshare.c.orig	2005-09-01 21:16:35.311915518 -0700
+++ ./mm/ptshare.c	2005-09-01 21:18:24.629296992 -0700
@@ -200,6 +200,7 @@ pt_share_pte(struct vm_area_struct *vma,
 			page = pmd_page(*spmd);
 			pt_increment_share(page);
 			pmd_populate(vma->vm_mm, pmd, page);
+			break;
 		}
 	}
 	pte = pte_alloc_map(vma->vm_mm, pmd, address);



* RE: [PATCH 1/1] Implement shared page tables
  2005-09-02  1:58 ` Chen, Kenneth W
@ 2005-09-02 16:40   ` Dave McCracken
  0 siblings, 0 replies; 12+ messages in thread
From: Dave McCracken @ 2005-09-02 16:40 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management


--On Thursday, September 01, 2005 18:58:23 -0700 "Chen, Kenneth W"
<kenneth.w.chen@intel.com> wrote:

>> +		prio_tree_iter_init(&iter, &mapping->i_mmap,
>> +				    vma->vm_start, vma->vm_end);
> 
> 
> I think this is a bug.  The radix priority tree for address_space->
> i_mmap is keyed on vma->vm_pgoff.  Your patch uses the vma virtual
> address to find a shareable range, Which will always fail a match
> even though there is one.
>
> Do you really have to iterate through all the vma?  Can't you just break
> out of the while loop on first successful match and populating the pmd?
> I would think you will find them to be the same pte page. Or did I miss
> some thing?

Man, I spaced that whole search code.  I was sure I'd tested to make sure
it was finding matches.  I'll fix all that up in my next release.

Dave McCracken



