linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH/RFC] Shared page tables
@ 2006-01-05 16:19 Dave McCracken
  2006-01-07 12:25 ` Heiko Carstens
                   ` (4 more replies)
  0 siblings, 5 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-05 16:19 UTC (permalink / raw)
  To: Hugh Dickins, Andrew Morton; +Cc: Linux Kernel, Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 1063 bytes --]


Here's a new version of my shared page tables patch.

The primary purpose of sharing page tables is improved performance for
large applications that share big memory areas between multiple processes.
It eliminates the redundant page tables and significantly reduces the
number of minor page faults.  Tests show significant performance
improvement for large database applications, including those using large
pages.  There is no measurable performance degradation for small processes.

This version of the patch uses Hugh's new locking mechanism, extending it
up the page table tree as far as necessary for proper concurrency control.

The patch also includes the proper locking for following the vma chains.

Hugh, I believe I have all the lock points nailed down.  I'd appreciate
your input on any I might have missed.

The architectures supported are i386 and x86_64.  I'm working on 64 bit
ppc, but there are still some issues around proper segment handling that
need more testing.  This will be available in a separate patch once it's
solid.

Dave McCracken

[-- Attachment #2: shpt-generic-2.6.15-1.diff --]
[-- Type: text/plain, Size: 42192 bytes --]

--- 2.6.15/./arch/i386/Kconfig	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./arch/i386/Kconfig	2006-01-03 10:30:01.000000000 -0600
@@ -458,6 +458,18 @@ config X86_PAE
 	depends on HIGHMEM64G
 	default y
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+config PTSHARE_PTE
+	bool
+	depends on PTSHARE
+	default y
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
--- 2.6.15/./arch/x86_64/Kconfig	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./arch/x86_64/Kconfig	2006-01-03 10:30:01.000000000 -0600
@@ -267,6 +267,38 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+menu "Page table levels to share"
+	depends on PTSHARE
+
+config PTSHARE_PTE
+	bool "Bottom level table (PTE)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PMD
+	bool "Middle level table (PMD)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PUD
+	bool "Upper level table (PUD)"
+	depends on PTSHARE
+	default n
+
+endmenu
+
+config PTSHARE_HUGEPAGE
+	bool
+	depends on PTSHARE && PTSHARE_PMD
+	default y
+
 config ARCH_DISCONTIGMEM_ENABLE
        bool
        depends on NUMA
--- 2.6.15/./arch/x86_64/mm/fault.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./arch/x86_64/mm/fault.c	2006-01-03 10:30:01.000000000 -0600
@@ -154,7 +154,7 @@ void dump_pagetable(unsigned long addres
 	if (bad_address(pgd)) goto bad;
 	if (!pgd_present(*pgd)) goto ret; 
 
-	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+	pud = __pud_offset_k((pud_t *)pgd_page_kernel(*pgd), address);
 	if (bad_address(pud)) goto bad;
 	printk("PUD %lx ", pud_val(*pud));
 	if (!pud_present(*pud))	goto ret;
@@ -261,7 +261,7 @@ static int vmalloc_fault(unsigned long a
 	pud_ref = pud_offset(pgd_ref, address);
 	if (pud_none(*pud_ref))
 		return -1;
-	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+	if (pud_none(*pud) || pud_page_kernel(*pud) != pud_page_kernel(*pud_ref))
 		BUG();
 	pmd = pmd_offset(pud, address);
 	pmd_ref = pmd_offset(pud_ref, address);
--- 2.6.15/./include/asm-generic/pgtable.h	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./include/asm-generic/pgtable.h	2006-01-03 10:30:01.000000000 -0600
@@ -173,6 +173,23 @@ static inline void ptep_set_wrprotect(st
 #endif
 
 /*
+ * Some architectures might need flushes when higher levels of page table
+ * are cleared.
+ */
+
+#ifndef __HAVE_ARCH_PMD_CLEAR_FLUSH
+#define pmd_clear_flush(__mm, __addr, __pmd)  pmd_clear(__pmd)
+#endif
+
+#ifndef __HAVE_ARCH_PUD_CLEAR_FLUSH
+#define pud_clear_flush(__mm, __addr, __pud)  pud_clear(__pud)
+#endif
+
+#ifndef __HAVE_ARCH_PGD_CLEAR_FLUSH
+#define pgd_clear_flush(__mm, __addr, __pgd)  pgd_clear(__pgd)
+#endif
+
+/*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier.  Although no
  * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
--- 2.6.15/./include/asm-x86_64/pgtable.h	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./include/asm-x86_64/pgtable.h	2006-01-03 10:30:01.000000000 -0600
@@ -101,9 +101,6 @@ static inline void pgd_clear (pgd_t * pg
 	set_pgd(pgd, __pgd(0));
 }
 
-#define pud_page(pud) \
-((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
-
 #define ptep_get_and_clear(mm,addr,xp)	__pte(xchg(&(xp)->pte, 0))
 
 struct mm_struct;
@@ -324,7 +321,8 @@ static inline int pmd_large(pmd_t pte) {
 /*
  * Level 4 access.
  */
-#define pgd_page(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
+#define pgd_page_kernel(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
+#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
@@ -332,9 +330,11 @@ static inline int pmd_large(pmd_t pte) {
 #define mk_kernel_pgd(address) ((pgd_t){ (address) | _KERNPG_TABLE })
 
 /* PUD - Level3 access */
+#define pud_page_kernel(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud)		(pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
 /* to find an entry in a page-table-directory. */
 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
+#define pud_offset(pgd, address) ((pud_t *) pgd_page_kernel(*(pgd)) + pud_index(address))
 #define pud_offset_k(pgd, addr) pud_offset(pgd, addr)
 #define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
 
@@ -348,7 +348,7 @@ static inline pud_t *__pud_offset_k(pud_
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
+#define pmd_offset(dir, address) ((pmd_t *) pud_page_kernel(*(dir)) + \
 			pmd_index(address))
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
--- 2.6.15/./include/linux/mm.h	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./include/linux/mm.h	2006-01-03 10:30:01.000000000 -0600
@@ -230,7 +230,7 @@ struct page {
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 		spinlock_t ptl;
 #endif
 	} u;
@@ -767,19 +767,20 @@ static inline pmd_t *pmd_alloc(struct mm
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 /*
  * We tuck a spinlock to guard each pagetable page into its struct page,
  * at page->private, with BUILD_BUG_ON to make sure that this will not
  * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
  * When freeing, reset page->mapping so free_pages_check won't complain.
  */
-#define __pte_lockptr(page)	&((page)->u.ptl)
+#define __pt_lockptr(page)	&((page)->u.ptl)
+
 #define pte_lock_init(_page)	do {					\
-	spin_lock_init(__pte_lockptr(_page));				\
+	spin_lock_init(__pt_lockptr(_page));				\
 } while (0)
 #define pte_lock_deinit(page)	((page)->mapping = NULL)
-#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pt_lockptr(pmd_page(*(pmd)));})
 #else
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
--- 2.6.15/./include/linux/sched.h	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./include/linux/sched.h	2006-01-03 10:30:01.000000000 -0600
@@ -249,7 +249,7 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 /*
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
--- 2.6.15/./include/linux/ptshare.h	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.15-shpt/./include/linux/ptshare.h	2006-01-03 10:30:01.000000000 -0600
@@ -0,0 +1,275 @@
+#ifndef _LINUX_PTSHARE_H
+#define _LINUX_PTSHARE_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <dmccr@us.ibm.com>
+ */
+
+#ifdef CONFIG_PTSHARE
+static inline int pt_is_shared(struct page *page)
+{
+	return (page_mapcount(page) > 1);
+}
+
+static inline void pt_increment_share(struct page *page)
+{
+	atomic_inc(&page->_mapcount);
+}
+
+static inline void pt_decrement_share(struct page *page)
+{
+	atomic_dec(&page->_mapcount);
+}
+
+static inline int pt_shareable_vma(struct vm_area_struct *vma)
+{
+	/* We can't share anonymous memory */
+	if (!vma->vm_file)
+		return 0;
+
+	/* No sharing of nonlinear areas */
+	if (vma->vm_flags & VM_NONLINEAR)
+		return 0;
+
+	/* Only share shared mappings or read-only mappings */
+	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) == VM_WRITE)
+		return 0;
+
+	/* If it's smaller than the smallest shareable unit, don't bother
+	   calling it shareable */
+	if ((vma->vm_end - vma->vm_start) < PMD_SIZE)
+		return 0;
+
+	return 1;
+}
+extern void pt_unshare_range(struct vm_area_struct *vma, unsigned long address,
+			     unsigned long end);
+#else /* CONFIG_PTSHARE */
+#define	pt_is_shared(page)	(0)
+#define	pt_increment_share(page)
+#define	pt_decrement_share(page)
+#define pt_shareable_vma(vma)	(0)
+#define	pt_unshare_range(vma, address, end)
+#endif /* CONFIG_PTSHARE */
+
+#ifdef CONFIG_PTSHARE_PTE
+static inline int pt_is_shared_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	return pt_is_shared(page);
+}
+
+static inline void pt_increment_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	pt_increment_share(page);
+	return;
+}
+
+static inline void pt_decrement_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	pt_decrement_share(page);
+	return;
+}
+
+static inline int pt_shareable_pte(struct vm_area_struct *vma, unsigned long address)
+{
+	unsigned long base = address & PMD_MASK;
+	unsigned long end = base + (PMD_SIZE-1);
+
+	if ((vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+extern pte_t * pt_share_pte(struct vm_area_struct *vma, unsigned long address,
+			    pmd_t *pmd, struct address_space *mapping);
+extern void pt_copy_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd);
+#else /* CONFIG_PTSHARE_PTE */
+static inline int pt_is_shared_pte(pmd_t pmdval)
+{
+	return 0;
+}
+#define	pt_increment_pte(pmdval)
+#define	pt_decrement_pte(pmdval)
+#define	pt_copy_pte(mm, dst_pmd, src_pmd)
+#define	pt_shareable_pte(vma, address) (0)
+#define	pt_share_pte(vma, address, pmd, mapping) pte_alloc_map(vma->vm_mm, pmd, address)
+#endif /* CONFIG_PTSHARE_PTE */
+
+#ifdef CONFIG_PTSHARE_PMD
+static inline int pt_is_shared_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	return pt_is_shared(page);
+}
+
+static inline void pt_increment_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	pt_increment_share(page);
+	return;
+}
+
+static inline void pt_decrement_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	pt_decrement_share(page);
+	return;
+}
+
+static inline int pt_shareable_pmd(struct vm_area_struct *vma,
+		 unsigned long address)
+{
+	unsigned long base = address & PUD_MASK;
+	unsigned long end = base + (PUD_SIZE-1);
+
+	if ((vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+extern pmd_t * pt_share_pmd(struct vm_area_struct *vma, unsigned long address,
+			    pud_t *pud, struct address_space *mapping);
+extern void pt_copy_pmd(struct mm_struct *mm, pud_t *dst_pud, pud_t *src_pud);
+#else /* CONFIG_PTSHARE_PMD */
+static inline int pt_is_shared_pmd(pud_t pudval)
+{
+	return 0;
+}
+#define	pt_increment_pmd(pudval)
+#define	pt_decrement_pmd(pudval)
+#define	pt_copy_pmd(mm, dst_pud, src_pud)
+#define	pt_shareable_pmd(vma, address) (0)
+#define	pt_share_pmd(vma, address, pud, mapping) pmd_alloc(vma->vm_mm, pud, address)
+#endif /* CONFIG_PTSHARE_PMD */
+
+#ifdef CONFIG_PTSHARE_PUD
+static inline int pt_is_shared_pud(pgd_t pgdval)
+{
+	struct page *page;
+
+	page = pgd_page(pgdval);
+	return pt_is_shared(page);
+}
+
+static inline void pt_increment_pud(pgd_t pgdval)
+{
+	struct page *page;
+
+	page = pgd_page(pgdval);
+	pt_increment_share(page);
+	return;
+}
+
+static inline void pt_decrement_pud(pgd_t pgdval)
+{
+	struct page *page;
+
+	page = pgd_page(pgdval);
+	pt_decrement_share(page);
+	return;
+}
+
+static inline int pt_shareable_pud(struct vm_area_struct *vma, unsigned long address)
+{
+	unsigned long base = address & PGDIR_MASK;
+	unsigned long end = base + (PGDIR_SIZE-1);
+
+	if ((vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+extern pud_t *pt_share_pud(struct vm_area_struct *vma, unsigned long address,
+			    pgd_t *pgd, struct address_space *mapping);
+extern void pt_copy_pud(struct mm_struct *mm, pgd_t *dst_pgd, pgd_t *src_pgd);
+#else /* CONFIG_PTSHARE_PUD */
+static inline int pt_is_shared_pud(pgd_t pgdval)
+{
+	return 0;
+}
+#define	pt_increment_pud(pgdval)
+#define	pt_decrement_pud(pgdval)
+#define	pt_copy_pud(mm, dst_pgd, src_pgd)
+#define	pt_shareable_pud(vma, address) (0)
+#define	pt_share_pud(vma, address, pgd, mapping) pud_alloc(vma->vm_mm, pgd, address)
+#endif /* CONFIG_PTSHARE_PUD */
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+extern pte_t *pt_share_hugepage(struct mm_struct *mm, struct vm_area_struct *vma,
+			       unsigned long address);
+extern void pt_unshare_huge_range(struct vm_area_struct *vma, unsigned long address,
+				  unsigned long end);
+#else
+#define	pt_share_hugepage(mm, vma, address)	huge_pte_alloc(mm, address)
+#define	pt_unshare_huge_range(vma, address, end)
+#endif	/* CONFIG_PTSHARE_HUGEPAGE */
+
+/*
+ *  Locking macros...
+ *  All levels of page table at or above the level(s) we share use page_table_lock.
+ *  Each level below the share level uses the pt_lockptr in struct page in the level
+ *  above.
+ */
+
+#ifdef CONFIG_PTSHARE_PUD
+#define pud_lock_init(pud)	do {					\
+	spin_lock_init(__pt_lockptr(virt_to_page(pud)));		\
+} while (0)
+#define pud_lock_deinit(pud)	((virt_to_page(pud))->mapping = NULL)
+#define pud_lockptr(mm, pud)	({(void)(mm); __pt_lockptr(virt_to_page(pud));})
+#else
+#define pud_lock_init(pud)	do {} while (0)
+#define pud_lock_deinit(pud)	do {} while (0)
+#define pud_lockptr(mm, pud)	({(void)(pud); &(mm)->page_table_lock;})
+#endif
+#ifdef CONFIG_PTSHARE_PMD
+#define pmd_lock_init(pmd)	do {					\
+	spin_lock_init(__pt_lockptr(virt_to_page(pmd)));		\
+} while (0)
+#define pmd_lock_deinit(pmd)	((virt_to_page(pmd))->mapping = NULL)
+#define pmd_lockptr(mm, pmd)	({(void)(mm); __pt_lockptr(virt_to_page(pmd));})
+#else
+#define pmd_lock_init(pmd)	do {} while (0)
+#define pmd_lock_deinit(pmd)	do {} while (0)
+#define pmd_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+#define	hugepte_lockptr(mm, pte)	pmd_lockptr(mm, pte)
+#else
+#define hugepte_lockptr(mm, pte)	({(void)(pte); &(mm)->page_table_lock;})
+#endif
+#endif /* _LINUX_PTSHARE_H */
--- 2.6.15/./mm/Makefile	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/Makefile	2006-01-03 10:30:01.000000000 -0600
@@ -20,3 +20,4 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_PTSHARE) += ptshare.o
--- 2.6.15/./mm/fremap.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/fremap.c	2006-01-03 10:30:01.000000000 -0600
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -193,6 +194,9 @@ asmlinkage long sys_remap_file_pages(uns
 				has_write_lock = 1;
 				goto retry;
 			}
+			if (pt_shareable_vma(vma))
+				pt_unshare_range(vma, vma->vm_start, vma->vm_end);
+
 			mapping = vma->vm_file->f_mapping;
 			spin_lock(&mapping->i_mmap_lock);
 			flush_dcache_mmap_lock(mapping);
--- 2.6.15/./mm/hugetlb.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/hugetlb.c	2006-01-03 10:30:01.000000000 -0600
@@ -11,6 +11,7 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/ptshare.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -322,6 +323,8 @@ void unmap_hugepage_range(struct vm_area
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	pt_unshare_huge_range(vma, start, end);
+
 	spin_lock(&mm->page_table_lock);
 
 	/* Update high watermark before we lower rss */
@@ -392,8 +395,9 @@ int hugetlb_fault(struct mm_struct *mm, 
 	pte_t *pte;
 	struct page *page;
 	struct address_space *mapping;
+	spinlock_t *ptl;
 
-	pte = huge_pte_alloc(mm, address);
+	pte = pt_share_hugepage(mm, vma, address);
 	if (!pte)
 		goto out;
 
@@ -409,7 +413,8 @@ int hugetlb_fault(struct mm_struct *mm, 
 	if (!page)
 		goto out;
 
-	spin_lock(&mm->page_table_lock);
+	ptl = hugepte_lockptr(mm, pte);
+	spin_lock(ptl);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
 		goto backout;
@@ -420,7 +425,7 @@ int hugetlb_fault(struct mm_struct *mm, 
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	unlock_page(page);
 out:
 	return ret;
--- 2.6.15/./mm/memory.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/memory.c	2006-01-03 10:30:01.000000000 -0600
@@ -48,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ptshare.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -82,6 +83,8 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
 
+#undef	PT_DEBUG
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
@@ -110,14 +113,25 @@ void pmd_clear_bad(pmd_t *pmd)
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
-static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+			   unsigned long addr)
 {
 	struct page *page = pmd_page(*pmd);
-	pmd_clear(pmd);
-	pte_lock_deinit(page);
-	pte_free_tlb(tlb, page);
-	dec_page_state(nr_page_table_pages);
-	tlb->mm->nr_ptes--;
+	pmd_t pmdval= *pmd;
+	int share;
+
+	share = pt_is_shared_pte(pmdval);
+	if (share)
+		pmd_clear_flush(tlb->mm, addr, pmd);
+	else
+		pmd_clear(pmd);
+
+	pt_decrement_pte(pmdval);
+	if (!share) {
+		pte_lock_deinit(page);
+		pte_free_tlb(tlb, page);
+		dec_page_state(nr_page_table_pages);
+	}
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -125,17 +139,22 @@ static inline void free_pmd_range(struct
 				unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
-	unsigned long next;
-	unsigned long start;
-
-	start = addr;
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		free_pte_range(tlb, pmd);
-	} while (pmd++, addr = next, addr != end);
+	pud_t pudval = *pud;
+  	unsigned long next;
+  	unsigned long start;
+	int share;
+  
+	share = pt_is_shared_pmd(pudval);
+  	start = addr;
+  	pmd = pmd_offset(pud, addr);
+	if (!share) {
+		do {
+			next = pmd_addr_end(addr, end);
+			if (pmd_none_or_clear_bad(pmd))
+				continue;
+			free_pte_range(tlb, pmd, addr);
+		} while (pmd++, addr = next, addr != end);
+	}
 
 	start &= PUD_MASK;
 	if (start < floor)
@@ -148,9 +167,21 @@ static inline void free_pmd_range(struct
 	if (end - 1 > ceiling - 1)
 		return;
 
+#ifdef PT_DEBUG
+	if (share)
+		printk(KERN_DEBUG "Free: Unsharing pmd at address 0x%lx\n", start);
+#endif
 	pmd = pmd_offset(pud, start);
-	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd);
+	if (share)
+		pud_clear_flush(tlb->mm, addr, pud);
+	else
+		pud_clear(pud);
+
+	pt_decrement_pmd(pudval);
+	if (!share) {
+		pud_lock_deinit(pmd);
+		pmd_free_tlb(tlb, pmd);
+	}
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -158,17 +189,22 @@ static inline void free_pud_range(struct
 				unsigned long floor, unsigned long ceiling)
 {
 	pud_t *pud;
+	pgd_t pgdval = *pgd;
 	unsigned long next;
 	unsigned long start;
+	int share;
 
+	share = pt_is_shared_pud(pgdval);
 	start = addr;
 	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
-	} while (pud++, addr = next, addr != end);
+	if (!share) {
+		do {
+			next = pud_addr_end(addr, end);
+			if (pud_none_or_clear_bad(pud))
+				continue;
+			free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		} while (pud++, addr = next, addr != end);
+	}
 
 	start &= PGDIR_MASK;
 	if (start < floor)
@@ -182,8 +218,16 @@ static inline void free_pud_range(struct
 		return;
 
 	pud = pud_offset(pgd, start);
-	pgd_clear(pgd);
-	pud_free_tlb(tlb, pud);
+	if (share)
+		pgd_clear_flush(tlb->mm, addr, pgd);
+	else
+		pgd_clear(pgd);
+
+	pt_decrement_pud(pgdval);
+	if (!share) {
+		pud_lock_deinit(pud);
+		pud_free_tlb(tlb, pud);
+	}
 }
 
 /*
@@ -292,20 +336,22 @@ void free_pgtables(struct mmu_gather **t
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	struct page *new = pte_alloc_one(mm, address);
+	spinlock_t *ptl = pmd_lockptr(mm, pmd);
+
 	if (!new)
 		return -ENOMEM;
 
 	pte_lock_init(new);
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	if (pmd_present(*pmd)) {	/* Another has populated it */
 		pte_lock_deinit(new);
 		pte_free(new);
 	} else {
-		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
+		pt_increment_pte(*pmd);
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	return 0;
 }
 
@@ -318,8 +364,10 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	spin_lock(&init_mm.page_table_lock);
 	if (pmd_present(*pmd))		/* Another has populated it */
 		pte_free_kernel(new);
-	else
+	else {
 		pmd_populate_kernel(&init_mm, pmd, new);
+		pt_increment_pte(*pmd);
+	}
 	spin_unlock(&init_mm.page_table_lock);
 	return 0;
 }
@@ -518,7 +566,7 @@ again:
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end)
+		unsigned long addr, unsigned long end, int shareable)
 {
 	pmd_t *src_pmd, *dst_pmd;
 	unsigned long next;
@@ -531,16 +579,20 @@ static inline int copy_pmd_range(struct 
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
-		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-						vma, addr, next))
-			return -ENOMEM;
+		if (shareable && pt_shareable_pte(vma, addr)) {
+			pt_copy_pte(dst_mm, dst_pmd, src_pmd);
+		} else {
+			if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+					   vma, addr, next))
+				return -ENOMEM;
+		}
 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end)
+		unsigned long addr, unsigned long end, int shareable)
 {
 	pud_t *src_pud, *dst_pud;
 	unsigned long next;
@@ -553,9 +605,16 @@ static inline int copy_pud_range(struct 
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
-		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
-						vma, addr, next))
-			return -ENOMEM;
+		if (shareable && pt_shareable_pmd(vma, addr)) {
+			pt_copy_pmd(dst_mm, dst_pud, src_pud);
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Fork: Sharing pmd at address 0x%lx\n", addr);
+#endif
+		} else {
+			if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+					   vma, addr, next, shareable))
+				return -ENOMEM;
+		}
 	} while (dst_pud++, src_pud++, addr = next, addr != end);
 	return 0;
 }
@@ -567,6 +626,7 @@ int copy_page_range(struct mm_struct *ds
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int shareable;
 
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.
@@ -574,7 +634,7 @@ int copy_page_range(struct mm_struct *ds
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+	if (!(vma->vm_flags & (VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -582,15 +642,21 @@ int copy_page_range(struct mm_struct *ds
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	shareable = pt_shareable_vma(vma);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
-		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-						vma, addr, next))
-			return -ENOMEM;
+		if (shareable && pt_shareable_pud(vma, addr)) {
+			pt_copy_pud(dst_mm, dst_pgd, src_pgd);
+		} else {
+			if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+					   vma, addr, next, shareable))
+				return -ENOMEM;
+		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -610,6 +676,7 @@ static unsigned long zap_pte_range(struc
 	do {
 		pte_t ptent = *pte;
 		if (pte_none(ptent)) {
+
 			(*zap_work)--;
 			continue;
 		}
@@ -685,6 +752,15 @@ static inline unsigned long zap_pmd_rang
 	pmd_t *pmd;
 	unsigned long next;
 
+	if (pt_is_shared_pmd(*pud)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Zap: Unsharing pmd at address 0x%lx\n", addr);
+#endif
+		pt_decrement_pmd(*pud);
+		pud_clear_flush(tlb->mm, addr, pud);
+		return end;
+	}
+
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
@@ -707,6 +783,12 @@ static inline unsigned long zap_pud_rang
 	pud_t *pud;
 	unsigned long next;
 
+	if (pt_is_shared_pud(*pgd)) {
+		pt_decrement_pud(*pgd);
+		pgd_clear_flush(tlb->mm, addr, pgd);
+		return end;
+	}
+
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
@@ -2223,6 +2305,7 @@ int __handle_mm_fault(struct mm_struct *
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	struct address_space *mapping = NULL;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -2231,14 +2314,17 @@ int __handle_mm_fault(struct mm_struct *
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, write_access);
 
+	if (pt_shareable_vma(vma))
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+
 	pgd = pgd_offset(mm, address);
-	pud = pud_alloc(mm, pgd, address);
+	pud = pt_share_pud(vma, address, pgd, mapping);
 	if (!pud)
 		return VM_FAULT_OOM;
-	pmd = pmd_alloc(mm, pud, address);
+	pmd = pt_share_pmd(vma, address, pud, mapping);
 	if (!pmd)
 		return VM_FAULT_OOM;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pt_share_pte(vma, address, pmd, mapping);
 	if (!pte)
 		return VM_FAULT_OOM;
 
@@ -2259,8 +2345,11 @@ int __pud_alloc(struct mm_struct *mm, pg
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(new);
-	else
+	else {
+		pud_lock_init(new);
 		pgd_populate(mm, pgd, new);
+		pt_increment_pud(*pgd);
+	}
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -2280,22 +2369,32 @@ int __pud_alloc(struct mm_struct *mm, pg
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
 	pmd_t *new = pmd_alloc_one(mm, address);
+	spinlock_t *ptl;
 	if (!new)
 		return -ENOMEM;
 
-	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
+	ptl = pud_lockptr(mm, pud);
+	spin_lock(ptl);
 	if (pud_present(*pud))		/* Another has populated it */
 		pmd_free(new);
-	else
+	else {
+		pmd_lock_init(new);
 		pud_populate(mm, pud, new);
+		pt_increment_pmd(*pud);
+	}
 #else
+	ptl = &mm->page_table_lock;
+	spin_lock(ptl);
 	if (pgd_present(*pud))		/* Another has populated it */
 		pmd_free(new);
-	else
+	else {
+		pmd_lock_init(new);
 		pgd_populate(mm, pud, new);
+		pt_increment_pmd(*pud);
+	}
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	return 0;
 }
 #else
--- 2.6.15/./mm/mmap.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/mmap.c	2006-01-03 10:30:01.000000000 -0600
@@ -1953,7 +1953,6 @@ void exit_mmap(struct mm_struct *mm)
 	while (vma)
 		vma = remove_vma(vma);
 
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
--- 2.6.15/./mm/mprotect.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/mprotect.c	2006-01-03 10:30:01.000000000 -0600
@@ -19,6 +19,7 @@
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -115,6 +116,9 @@ mprotect_fixup(struct vm_area_struct *vm
 		return 0;
 	}
 
+	if (pt_shareable_vma(vma))
+		pt_unshare_range(vma, start, end);
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
--- 2.6.15/./mm/mremap.c	2006-01-02 21:21:10.000000000 -0600
+++ 2.6.15-shpt/./mm/mremap.c	2006-01-03 10:30:01.000000000 -0600
@@ -17,6 +17,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -165,6 +166,9 @@ static unsigned long move_vma(struct vm_
 	unsigned long hiwater_vm;
 	int split = 0;
 
+	if (pt_shareable_vma(vma))
+		pt_unshare_range(vma, old_addr, old_addr + old_len);
+
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
 	 * which may split one vma into three before unmapping.
--- 2.6.15/./mm/ptshare.c	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.15-shpt/./mm/ptshare.c	2006-01-03 10:30:01.000000000 -0600
@@ -0,0 +1,415 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <dmccr@us.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/prio_tree.h>
+#include <linux/mm.h>
+#include <linux/ptshare.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+#undef	PT_DEBUG
+
+#define	VM_PGEND(vma)	(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) -1)
+
+#ifndef __PAGETABLE_PMD_FOLDED
+static void pt_unshare_pmd(struct mm_struct *mm, pud_t *pud, unsigned long address,
+			   unsigned long end)
+{
+	pmd_t *pmd;
+	pmd_t pmdval;
+
+	pmd = pmd_offset(pud, address);
+	end = pud_addr_end(address, end);
+	while (address <= end) {
+		pmdval = *pmd;
+		if (pmd_present(*pmd)) {
+			if (pt_is_shared_pte(pmdval)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Unsharing pte page at address 0x%lx\n",
+				       address);
+#endif
+				pt_decrement_pte(pmdval);
+				pmd_clear_flush(mm, address, pmd);
+			}
+		}
+		pmd++;
+		address += PMD_SIZE;
+	}
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void pt_unshare_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long address,
+			   unsigned long end, int hugepage)
+{
+	pud_t *pud;
+	pud_t pudval;
+
+	pud = pud_offset(pgd, address);
+	end = pgd_addr_end(address, end);
+	while (address <= end) {
+		pudval = *pud;
+		if (pud_present(pudval)) {
+			if (pt_is_shared_pmd(pudval)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Unsharing pmd page at address 0x%lx\n",
+				       address);
+#endif
+				pt_decrement_pmd(pudval);
+				pud_clear_flush(mm, address, pud);
+			} else if (!hugepage) {
+				pt_unshare_pmd(mm, pud, address, end);
+			}
+		}
+		pud++;
+		address += PUD_SIZE;
+	}
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+static void pt_unshare_pgd(struct mm_struct *mm, unsigned long address,
+			   unsigned long end, int hugepage)
+{
+	pgd_t *pgd;
+	pgd_t pgdval;
+
+	pgd = pgd_offset(mm, address);
+
+	while (address <= end) {
+		pgdval = *pgd;
+		if (pgd_present(pgdval)) {
+			if (pt_is_shared_pud(pgdval)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Unsharing pud page at address 0x%lx\n",
+				       address);
+#endif
+				pt_decrement_pud(pgdval);
+				pgd_clear_flush(mm, address, pgd);
+#ifndef __PAGETABLE_PMD_FOLDED
+			} else {
+#ifndef __PAGETABLE_PUD_FOLDED
+				pt_unshare_pud(mm, pgd, address, end, hugepage);
+#else /* __PAGETABLE_PUD_FOLDED */
+				if (!hugepage)
+					pt_unshare_pmd(mm, (pud_t *)pgd, address, end);
+#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* __PAGETABLE_PMD_FOLDED */
+			}
+		}
+		pgd++;
+		address += PGDIR_SIZE;
+	}
+}
+
+void pt_unshare_range(struct vm_area_struct *vma, unsigned long address,
+		      unsigned long end)
+{
+	struct address_space *mapping;
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+
+	pt_unshare_pgd(vma->vm_mm, address, end, 0);
+
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+static struct vm_area_struct *next_shareable_vma(struct vm_area_struct *vma,
+						 struct vm_area_struct *svma,
+						 struct prio_tree_iter *iter)
+{
+	while ((svma = vma_prio_tree_next(svma, iter))) {
+		if ((svma != vma) &&
+		    (vma->vm_flags == svma->vm_flags) &&
+		    (vma->vm_start == svma->vm_start) &&
+		    (vma->vm_end == svma->vm_end) &&
+		    (vma->vm_pgoff == svma->vm_pgoff))
+			break;
+	}
+	return svma;
+}
+
+#ifdef CONFIG_PTSHARE_PTE
+pte_t *pt_share_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd,
+		    struct address_space *mapping)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd, spgde;
+	pud_t *spud, spude;
+	pmd_t *spmd, spmde;
+	pte_t *pte;
+	struct page *page;
+	spinlock_t *ptl;
+
+	pmd_val(spmde) = 0;
+	page = NULL;
+	if (pmd_none(*pmd) &&
+	    mapping &&
+	    pt_shareable_pte(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pte page at address 0x%lx\n",
+		       address);
+#endif
+		spin_lock(&mapping->i_mmap_lock);
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_pgoff, VM_PGEND(vma));
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			spgde = *spgd;
+			if (pgd_none(spgde))
+				continue;
+
+			spud = pud_offset(&spgde, address);
+			spude = *spud;
+			if (pud_none(spude))
+				continue;
+
+			spmd = pmd_offset(&spude, address);
+			spmde = *spmd;
+			if (pmd_none(spmde))
+				continue;
+
+			page = pmd_page(spmde);
+			pt_increment_share(page);
+			break;
+		}
+		spin_unlock(&mapping->i_mmap_lock);
+		if (pmd_present(spmde)) {
+			ptl = pmd_lockptr(vma->vm_mm, pmd);
+			spin_lock(ptl);
+			if (pmd_none(*pmd)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Sharing pte page at address 0x%lx\n",
+				       address);
+#endif
+				pmd_populate(vma->vm_mm, pmd, page);
+			} else {
+				pt_decrement_share(page);
+			}
+			spin_unlock(ptl);
+		}
+
+	}
+	pte = pte_alloc_map(vma->vm_mm, pmd, address);
+
+	return pte;
+}
+
+void pt_copy_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd)
+{
+	struct page *page;
+
+	page = pmd_page(*src_pmd);
+	pmd_populate(mm, dst_pmd, page);
+	pt_increment_share(page);
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_PMD
+pmd_t *pt_share_pmd(struct vm_area_struct *vma, unsigned long address, pud_t *pud,
+		    struct address_space *mapping)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd, spgde;
+	pud_t *spud, spude;
+	pmd_t *pmd;
+	struct page *page;
+	spinlock_t *ptl;
+
+	pud_val(spude) = 0;
+	page = NULL;
+	if (pud_none(*pud) &&
+	    mapping && 
+	    pt_shareable_pmd(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pmd page at address 0x%lx\n",
+		       address);
+#endif
+		spin_lock(&mapping->i_mmap_lock);
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_pgoff, VM_PGEND(vma));
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			spgde = *spgd;
+			if (pgd_none(spgde))
+				continue;
+
+			spud = pud_offset(spgd, address);
+			spude = *spud;
+			if (pud_none(spude))
+				continue;
+
+			page = pud_page(spude);
+			pt_increment_share(page);
+			break;
+		}
+		spin_unlock(&mapping->i_mmap_lock);
+		if (pud_present(spude)) {
+			ptl = pud_lockptr(vma->vm_mm, pud);
+			spin_lock(ptl);
+			if (pud_none(*pud)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Sharing pmd page at address 0x%lx\n",
+				       address);
+#endif
+				pud_populate(vma->vm_mm, pud,
+					     (pmd_t *)pud_page_kernel(spude));
+			} else {
+				pt_decrement_share(page);
+			}
+			spin_unlock(ptl);
+		}
+	}
+	pmd = pmd_alloc(vma->vm_mm, pud, address);
+
+	return pmd;
+}
+
+void pt_copy_pmd(struct mm_struct *mm, pud_t *dst_pud, pud_t *src_pud)
+{
+	struct page *page;
+
+	pud_populate(mm, dst_pud, (pmd_t *)pud_page_kernel(*src_pud));
+
+	page = pud_page(*src_pud);
+	pt_increment_share(page);
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_PUD
+pud_t *pt_share_pud(struct vm_area_struct *vma, unsigned long address, pgd_t *pgd,
+		    struct address_space *mapping)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd, spgde;
+	pud_t *pud;
+	struct page *page;
+
+	spged = pgd_val(0);
+	page = NULL;
+	if (pgd_none(*pgd) &&
+	    mapping && 
+	    pt_shareable_pud(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pud page at address 0x%lx\n",
+		       address);
+#endif
+		spin_lock(&mapping->i_mmap_lock);
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_pgoff, VM_PGEND(vma));
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			spgde = *spgd;
+			if (pgd_none(spgde))
+				continue;
+
+			page = pgd_page(spgde);
+			pt_increment_share(page);
+			break;
+		}
+		spin_unlock(&mapping->i_mmap_lock);
+		if (pgd_present(spgde)) {
+			spin_lock(&vma->vm_mm->page_table_lock);
+			if (pgd_none(*pgd)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Sharing pud page at address 0x%lx\n",
+				       address);
+#endif
+				pgd_populate(vma->vm_mm, pgd,
+					     (pud_t *)pgd_page_kernel(spgde));
+			} else {
+				pt_decrement_share(page);
+			}
+			spin_unlock(vma->vm_mm->page_table_lock);
+		}
+	}
+	pud = pud_alloc(vma->vm_mm, pgd, address);
+
+	return pud;
+}
+
+void pt_copy_pud(struct mm_struct *mm, pgd_t *dst_pgd, pgd_t *src_pgd)
+{
+	struct page *page;
+
+	pgd_populate(mm, dst_pgd, (pud_t *)pgd_page_kernel(*src_pgd));
+
+	page = pgd_page(*src_pgd);
+	pt_increment_share(page);
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+
+void pt_unshare_huge_range(struct vm_area_struct *vma, unsigned long address,
+			   unsigned long end)
+{
+	struct address_space *mapping;
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE_PTE
+	pt_unshare_pgd(vma->vm_mm, address, end, 0);
+#else
+	pt_unshare_pgd(vma->vm_mm, address, end, 1);
+#endif
+
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+pte_t *pt_share_hugepage(struct mm_struct *mm, struct vm_area_struct *vma,
+			 unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	struct address_space *mapping = NULL;
+
+	if (pt_shareable_vma(vma))
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+
+	pgd = pgd_offset(mm, address);
+
+	pud = pt_share_pud(vma, address, pgd, mapping);
+	if (!pud)
+		return NULL;
+
+	pmd = pt_share_pmd(vma, address, pud, mapping);
+	if (!pmd)
+		return NULL;
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE_PTE
+	pte = pt_share_pte(vma, address, pmd, mapping);
+#else
+	pte = (pte_t *)pmd;
+#endif
+	return pte;
+}
+#endif

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-05 16:19 [PATCH/RFC] Shared page tables Dave McCracken
@ 2006-01-07 12:25 ` Heiko Carstens
  2006-01-07 18:09   ` Dave McCracken
  2006-01-13  5:15 ` Brian Twichell
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 55+ messages in thread
From: Heiko Carstens @ 2006-01-07 12:25 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management

> Here's a new version of my shared page tables patch.
> 
> The primary purpose of sharing page tables is improved performance for
> large applications that share big memory areas between multiple processes.
> It eliminates the redundant page tables and significantly reduces the
> number of minor page faults.  Tests show significant performance
> improvement for large database applications, including those using large
> pages.  There is no measurable performance degradation for small processes.

Tried to get this running with CONFIG_PTSHARE and CONFIG_PTSHARE_PTE on
s390x. Unfortunately it crashed on boot, because pt_share_pte
returned a broken pte pointer:

> +pte_t *pt_share_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd,
> + ...
> +	pmd_val(spmde) = 0;
> + ...
> +		if (pmd_present(spmde)) {

This is wrong. A pmd_val of 0 will make pmd_present return true on s390x
which is not what you want.
Should be pmd_clear(&spmde).

> +pmd_t *pt_share_pmd(struct vm_area_struct *vma, unsigned long address, pud_t *pud,
> + ...
> +	pud_val(spude) = 0;

Should be pud_clear, I guess :)

Thanks,
Heiko

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-07 12:25 ` Heiko Carstens
@ 2006-01-07 18:09   ` Dave McCracken
  2006-01-08 12:09     ` Heiko Carstens
  0 siblings, 1 reply; 55+ messages in thread
From: Dave McCracken @ 2006-01-07 18:09 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management


--On Saturday, January 07, 2006 13:25:34 +0100 Heiko Carstens
<heiko.carstens@de.ibm.com> wrote:

>> The primary purpose of sharing page tables is improved performance for
>> large applications that share big memory areas between multiple
>> processes. It eliminates the redundant page tables and significantly
>> reduces the number of minor page faults.  Tests show significant
>> performance improvement for large database applications, including those
>> using large pages.  There is no measurable performance degradation for
>> small processes.
> 
> Tried to get this running with CONFIG_PTSHARE and CONFIG_PTSHARE_PTE on
> s390x. Unfortunately it crashed on boot, because pt_share_pte
> returned a broken pte pointer:

The patch as submitted only works on i386 and x86_64.  Sorry.

>> +pte_t *pt_share_pte(struct vm_area_struct *vma, unsigned long address,
>> pmd_t *pmd, + ...
>> +	pmd_val(spmde) = 0;
>> + ...
>> +		if (pmd_present(spmde)) {
> 
> This is wrong. A pmd_val of 0 will make pmd_present return true on s390x
> which is not what you want.
> Should be pmd_clear(&spmde).
> 
>> +pmd_t *pt_share_pmd(struct vm_area_struct *vma, unsigned long address,
>> pud_t *pud, + ...
>> +	pud_val(spude) = 0;
> 
> Should be pud_clear, I guess :)

Yes, you're right.  pmd_clear() and pud_clear() would be more portable.
I'll make that change.

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-07 18:09   ` Dave McCracken
@ 2006-01-08 12:09     ` Heiko Carstens
  2006-01-08 14:04       ` Dave McCracken
  0 siblings, 1 reply; 55+ messages in thread
From: Heiko Carstens @ 2006-01-08 12:09 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management

> > Tried to get this running with CONFIG_PTSHARE and CONFIG_PTSHARE_PTE on
> > s390x. Unfortunately it crashed on boot, because pt_share_pte
> > returned a broken pte pointer:
> 
> The patch as submitted only works on i386 and x86_64.  Sorry.

That's why I added what seems to be needed for s390. For CONFIG_PTSHARE and
CONFIG_PTSHARE_PTE it's just a slightly modified Kconfig file.
For CONFIG_PTSHARE_PMD it involves adding a few more pud_* defines to
asm-generic/4level-fixup.h.
Seems to work with the pmd/pud_clear changes as far as I can tell.

Just tested this out of curiousity :)

Thanks,
Heiko

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-08 12:09     ` Heiko Carstens
@ 2006-01-08 14:04       ` Dave McCracken
  0 siblings, 0 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-08 14:04 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management


--On Sunday, January 08, 2006 13:09:48 +0100 Heiko Carstens
<heiko.carstens@de.ibm.com> wrote:

>> The patch as submitted only works on i386 and x86_64.  Sorry.
> 
> That's why I added what seems to be needed for s390. For CONFIG_PTSHARE
> and CONFIG_PTSHARE_PTE it's just a slightly modified Kconfig file.
> For CONFIG_PTSHARE_PMD it involves adding a few more pud_* defines to
> asm-generic/4level-fixup.h.
> Seems to work with the pmd/pud_clear changes as far as I can tell.

Wow.  That's good to know.  Thanks.

Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-05 16:19 [PATCH/RFC] Shared page tables Dave McCracken
  2006-01-07 12:25 ` Heiko Carstens
@ 2006-01-13  5:15 ` Brian Twichell
  2006-01-13 22:34   ` Ray Bryant
  2006-01-25  4:14   ` Brian Twichell
  2006-01-13 15:18 ` Phillip Susi
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 55+ messages in thread
From: Brian Twichell @ 2006-01-13  5:15 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel,
	Linux Memory Management, slpratt

Dave McCracken wrote:

>Here's a new version of my shared page tables patch.
>
>The primary purpose of sharing page tables is improved performance for
>large applications that share big memory areas between multiple processes.
>It eliminates the redundant page tables and significantly reduces the
>number of minor page faults.  Tests show significant performance
>improvement for large database applications, including those using large
>pages.  There is no measurable performance degradation for small processes.
>
>  
>
Hi,

We evaluated page table sharing on x86_64 and ppc64 setups, using a database
OLTP workload.  In both cases, 4-way systems with 64 GB of memory were used.

On the x86_64 setup, page table sharing provided a 25% increase in 
performance,
when the database buffers were in small (4 KB) pages.  In this case, 
over 14 GB
of memory was freed, that had previously been taken up by page tables.  
In the
case that the database buffers were in huge (2 MB) pages, page table sharing
provided a 4% increase in performance.

Our ppc64 experiments used an earlier version of Dave's patch, along with
ppc64-specific code for sharing of ppc64 segments.  On this setup, page
table sharing provided a 49% increase in performance, when the database
buffers were in small (4 KB) pages.  Over 10 GB of memory was freed, that
had previously been taken up by page tables.  In the case that the database
buffers were in huge (16 MB) pages, page table sharing provided a 3% 
increase
in performance.

In the experiments above, page table sharing brought performance with small
pages to within 12% of the performance observed with hugepages.

Given the results above, we are keen for page table sharing to get included,
for a couple of reasons.  First, we feel it provides for significantly more
robust "out-of-the-box" performance for process-based middleware such as 
DB2,
Oracle, and SAP.  Customers don't have to use or even know about hugepages
to get near best-case performance.  Secondly, the performance boost provided
will help efforts to publish proof points which can be used to advance the
adoption of Linux in performance-sensitive data-center environments.

Cheers,
Brian Twichell



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-05 16:19 [PATCH/RFC] Shared page tables Dave McCracken
  2006-01-07 12:25 ` Heiko Carstens
  2006-01-13  5:15 ` Brian Twichell
@ 2006-01-13 15:18 ` Phillip Susi
  2006-01-14 20:45   ` Brian Twichell
  2006-01-17 23:53 ` Robin Holt
  2006-01-20 21:24 ` Hugh Dickins
  4 siblings, 1 reply; 55+ messages in thread
From: Phillip Susi @ 2006-01-13 15:18 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management

Shouldn't those kind of applications already be using threads to share 
page tables rather than forking hundreds of processes that all mmap() 
the same file?


Dave McCracken wrote:
> Here's a new version of my shared page tables patch.
>
> The primary purpose of sharing page tables is improved performance for
> large applications that share big memory areas between multiple processes.
> It eliminates the redundant page tables and significantly reduces the
> number of minor page faults.  Tests show significant performance
> improvement for large database applications, including those using large
> pages.  There is no measurable performance degradation for small processes.
>
> This version of the patch uses Hugh's new locking mechanism, extending it
> up the page table tree as far as necessary for proper concurrency control.
>
> The patch also includes the proper locking for following the vma chains.
>
> Hugh, I believe I have all the lock points nailed down.  I'd appreciate
> your input on any I might have missed.
>
> The architectures supported are i386 and x86_64.  I'm working on 64 bit
> ppc, but there are still some issues around proper segment handling that
> need more testing.  This will be available in a separate patch once it's
> solid.
>
> Dave McCracken
>   


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-13  5:15 ` Brian Twichell
@ 2006-01-13 22:34   ` Ray Bryant
  2006-01-17  4:50     ` Brian Twichell
  2006-01-25  4:14   ` Brian Twichell
  1 sibling, 1 reply; 55+ messages in thread
From: Ray Bryant @ 2006-01-13 22:34 UTC (permalink / raw)
  To: Brian Twichell
  Cc: Dave McCracken, Hugh Dickins, Andrew Morton, Linux Kernel,
	Linux Memory Management, slpratt

On Thursday 12 January 2006 23:15, Brian Twichell wrote:

> Hi,
>
> We evaluated page table sharing on x86_64 and ppc64 setups, using a
> database OLTP workload.  In both cases, 4-way systems with 64 GB of memory
> were used.
>
> On the x86_64 setup, page table sharing provided a 25% increase in
> performance,
> when the database buffers were in small (4 KB) pages.  In this case,
> over 14 GB
> of memory was freed, that had previously been taken up by page tables.
> In the
> case that the database buffers were in huge (2 MB) pages, page table
> sharing provided a 4% increase in performance.
>

Brian,

Is that 25%-50% percent of overall performance (e. g. transaction throughput), 
or is this a measurement of, say, DB process startup times, or what?   It 
seems to me that the impact of the shared page table patch would mostly be 
noticed at address space construction/destruction times, and for a big OLTP 
workload, the processes are probably built once and stay around forever, no?

If the performance improvement is in overall throughput, do you understand why 
the impact would be so large?   TLB reloads?   I don't understand why one 
would see that kind of overall performance improvement, but I could be 
overlooking something.   (Could very likely be overlooking something...:-) )

Oh, and yeah, was this an AMD x86_64 box or what?
-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-13 15:18 ` Phillip Susi
@ 2006-01-14 20:45   ` Brian Twichell
  0 siblings, 0 replies; 55+ messages in thread
From: Brian Twichell @ 2006-01-14 20:45 UTC (permalink / raw)
  To: Phillip Susi
  Cc: Dave McCracken, Hugh Dickins, Andrew Morton, Linux Kernel,
	Linux Memory Management

Phillip Susi wrote:

> Shouldn't those kind of applications already be using threads to share 
> page tables rather than forking hundreds of processes that all mmap() 
> the same file?
>
We're talking about sharing anonymous memory here, not files.

The feedback we've gotten on converting from a process-based to a 
thread-based model is that it's a major undertaking,
when development and test expense is considered.  It's understandable if 
one considers that they'd probably want to convert
across on several operating systems at once to minimize the number of 
source trees they have to maintain.

Also, the case for conversion isn't helped by the fact that at least two 
prominent commercial UNIX flavors either inherently
share page tables, or provide an explicit memory allocation mechanism 
that achieves page table sharing (e.g. Intimate Shared
Memory).

Cheers,
Brian


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-13 22:34   ` Ray Bryant
@ 2006-01-17  4:50     ` Brian Twichell
  0 siblings, 0 replies; 55+ messages in thread
From: Brian Twichell @ 2006-01-17  4:50 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Dave McCracken, Hugh Dickins, Andrew Morton, Linux Kernel,
	Linux Memory Management, slpratt

Ray Bryant wrote:

>On Thursday 12 January 2006 23:15, Brian Twichell wrote:
>
>  
>
>>Hi,
>>
>>We evaluated page table sharing on x86_64 and ppc64 setups, using a
>>database OLTP workload.  In both cases, 4-way systems with 64 GB of memory
>>were used.
>>
>>On the x86_64 setup, page table sharing provided a 25% increase in
>>performance,
>>when the database buffers were in small (4 KB) pages.  In this case,
>>over 14 GB
>>of memory was freed, that had previously been taken up by page tables.
>>In the
>>case that the database buffers were in huge (2 MB) pages, page table
>>sharing provided a 4% increase in performance.
>>
>>    
>>
>
>Brian,
>
>Is that 25%-50% percent of overall performance (e. g. transaction throughput), 
>or is this a measurement of, say, DB process startup times, or what?   It 
>seems to me that the impact of the shared page table patch would mostly be 
>noticed at address space construction/destruction times, and for a big OLTP 
>workload, the processes are probably built once and stay around forever, no?
>
>If the performance improvement is in overall throughput, do you understand why 
>the impact would be so large?   TLB reloads?   I don't understand why one 
>would see that kind of overall performance improvement, but I could be 
>overlooking something.   (Could very likely be overlooking something...:-) )
>
>Oh, and yeah, was this an AMD x86_64 box or what?
>  
>
Ray,

The percentage improvement I quoted is of overall transaction throughput.

A full detailed characterization of the improvements hasn't been done,
especially since we just got the x86_64 setup up and running, but I can
share with you what I know.

In the case that the database buffers were in small pages, sharing page 
tables
freed up a large amount of memory, which was subsequently devoted to 
enlarging
the database buffer cache and/or increasing the number of connections to the
database.  The arithmetic of the memory savings works as follows:  The
database systems we used are structured as multiple processes which map a
large shared memory buffer for caching application data.  In the case of
unshared page tables, each database process requires its own Linux page
table entry for each 4 KB page in the buffer.  A Linux page table entry
consumes eight bytes.  So, if there are 100 database processes, there are
100 * 8 = 800 bytes of Linux page table entries per 4 KB page in the 
buffer --
equating to a 20% overhead for page table entries.  With shared page tables,
there is only one page table entry per buffer cache page, no matter how many
database processes there are, thus reducing the page table overhead down
to 8 / 4 KB = ~ 0.2%.

I suspect that, in the small page case, sharing page tables is also 
giving us a
significant benefit from reduced thrashing in the ppc64 hardware page 
tables,
although I haven't confirmed that.

For the hugepage case on ppc64, the performance improvement appears to
stem mainly from a CPI decrease due to reduced TLB and cache miss rates.

Our x86_64 setup is based on Intel processors.

Cheers,
Brian


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-05 16:19 [PATCH/RFC] Shared page tables Dave McCracken
                   ` (2 preceding siblings ...)
  2006-01-13 15:18 ` Phillip Susi
@ 2006-01-17 23:53 ` Robin Holt
  2006-01-18  0:17   ` Dave Hansen
                     ` (2 more replies)
  2006-01-20 21:24 ` Hugh Dickins
  4 siblings, 3 replies; 55+ messages in thread
From: Robin Holt @ 2006-01-17 23:53 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management

Dave,

This appears to work on ia64 with the attached patch.  Could you
send me any test application you think would be helpful for me
to verify it is operating correctly?  I could not get the PTSHARE_PUD
to compile.  I put _NO_ effort into it.  I found the following line
was invalid and quit trying.

> +	spged = pgd_val(0);

Thanks,
Robin Holt


Index: linux-2.6/arch/ia64/Kconfig
===================================================================
--- linux-2.6.orig/arch/ia64/Kconfig	2006-01-14 07:16:46.149226872 -0600
+++ linux-2.6/arch/ia64/Kconfig	2006-01-14 07:25:02.228853432 -0600
@@ -289,6 +289,38 @@ source "mm/Kconfig"
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+menu "Page table levels to share"
+	depends on PTSHARE
+
+config PTSHARE_PTE
+	bool "Bottom level table (PTE)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PMD
+	bool "Middle level table (PMD)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PUD
+	bool "Upper level table (PUD)"
+	depends on PTSHARE && PGTABLE_4
+	default n
+
+endmenu
+
+config PTSHARE_HUGEPAGE
+	bool
+	depends on PTSHARE && PTSHARE_PMD
+	default y
+
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
 	help
Index: linux-2.6/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/pgtable.h	2006-01-14 09:49:47.628417563 -0600
+++ linux-2.6/include/asm-ia64/pgtable.h	2006-01-14 09:51:25.315194368 -0600
@@ -283,14 +283,16 @@ ia64_phys_addr_valid (unsigned long addr
 #define pud_bad(pud)			(!ia64_phys_addr_valid(pud_val(pud)))
 #define pud_present(pud)		(pud_val(pud) != 0UL)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-#define pud_page(pud)			((unsigned long) __va(pud_val(pud) & _PFN_MASK))
+#define pud_page_kernel(pud)		((unsigned long) __va(pud_val(pud) & _PFN_MASK))
+#define pud_page(pud)			virt_to_page((pud_val(pud) + PAGE_OFFSET))
 
 #ifdef CONFIG_PGTABLE_4
 #define pgd_none(pgd)			(!pgd_val(pgd))
 #define pgd_bad(pgd)			(!ia64_phys_addr_valid(pgd_val(pgd)))
 #define pgd_present(pgd)		(pgd_val(pgd) != 0UL)
 #define pgd_clear(pgdp)			(pgd_val(*(pgdp)) = 0UL)
-#define pgd_page(pgd)			((unsigned long) __va(pgd_val(pgd) & _PFN_MASK))
+#define pgd_page_kernel(pgd)		((unsigned long) __va(pgd_val(pgd) & _PFN_MASK))
+#define pgd_page(pgd)			virt_to_page((pgd_val(pgd) + PAGE_OFFSET))
 #endif
 
 /*
@@ -363,12 +365,12 @@ pgd_offset (struct mm_struct *mm, unsign
 #ifdef CONFIG_PGTABLE_4
 /* Find an entry in the second-level page table.. */
 #define pud_offset(dir,addr) \
-	((pud_t *) pgd_page(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+	((pud_t *) pgd_page_kernel(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
 #endif
 
 /* Find an entry in the third-level page table.. */
 #define pmd_offset(dir,addr) \
-	((pmd_t *) pud_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+	((pmd_t *) pud_page_kernel(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
 /*
  * Find an entry in the third-level page table.  This looks more complicated than it

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-17 23:53 ` Robin Holt
@ 2006-01-18  0:17   ` Dave Hansen
  2006-01-18  6:11     ` Dave McCracken
  2006-01-18  1:27   ` Chen, Kenneth W
  2006-01-23 23:58   ` Ray Bryant
  2 siblings, 1 reply; 55+ messages in thread
From: Dave Hansen @ 2006-01-18  0:17 UTC (permalink / raw)
  To: Robin Holt
  Cc: Dave McCracken, Hugh Dickins, Andrew Morton, Linux Kernel,
	Linux Memory Management

On Tue, 2006-01-17 at 17:53 -0600, Robin Holt wrote:
> This appears to work on ia64 with the attached patch.  Could you
> send me any test application you think would be helpful for me
> to verify it is operating correctly?  I could not get the PTSHARE_PUD
> to compile.  I put _NO_ effort into it.  I found the following line
> was invalid and quit trying.
...
> +config PTSHARE
> +	bool "Share page tables"
> +	default y
> +	help
> +	  Turn on sharing of page tables between processes for large shared
> +	  memory regions.
...

These are probably best put in mm/Kconfig, especially if you're going to
have verbatim copies in each architecture.

-- Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH/RFC] Shared page tables
  2006-01-17 23:53 ` Robin Holt
  2006-01-18  0:17   ` Dave Hansen
@ 2006-01-18  1:27   ` Chen, Kenneth W
  2006-01-18  3:32     ` Robin Holt
  2006-01-23 23:58   ` Ray Bryant
  2 siblings, 1 reply; 55+ messages in thread
From: Chen, Kenneth W @ 2006-01-18  1:27 UTC (permalink / raw)
  To: 'Robin Holt', Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management

Robin Holt wrote on Tuesday, January 17, 2006 3:53 PM
> This appears to work on ia64 with the attached patch.  Could you
> send me any test application you think would be helpful for me
> to verify it is operating correctly?  I could not get the PTSHARE_PUD
> to compile.  I put _NO_ effort into it.  I found the following line
> was invalid and quit trying.
> 
> --- linux-2.6.orig/arch/ia64/Kconfig	2006-01-14 07:16:46.149226872 -0600
> +++ linux-2.6/arch/ia64/Kconfig	2006-01-14 07:25:02.228853432 -0600
> @@ -289,6 +289,38 @@ source "mm/Kconfig"
>  config ARCH_SELECT_MEMORY_MODEL
>  	def_bool y
>  
> +
> +config PTSHARE_HUGEPAGE
> +	bool
> +	depends on PTSHARE && PTSHARE_PMD
> +	default y
> +

You need to thread carefully with hugetlb ptshare on ia64. PTE for
hugetlb page on ia64 observe full page table levels, not like x86
that sits in the pmd level.

- Ken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-18  1:27   ` Chen, Kenneth W
@ 2006-01-18  3:32     ` Robin Holt
  0 siblings, 0 replies; 55+ messages in thread
From: Robin Holt @ 2006-01-18  3:32 UTC (permalink / raw)
  To: Chen, Kenneth W
  Cc: 'Robin Holt',
	Dave McCracken, Hugh Dickins, Andrew Morton, Linux Kernel,
	Linux Memory Management

On Tue, Jan 17, 2006 at 05:27:09PM -0800, Chen, Kenneth W wrote:
> Robin Holt wrote on Tuesday, January 17, 2006 3:53 PM
> > This appears to work on ia64 with the attached patch.  Could you
> > send me any test application you think would be helpful for me
> > to verify it is operating correctly?  I could not get the PTSHARE_PUD
> > to compile.  I put _NO_ effort into it.  I found the following line
> > was invalid and quit trying.
> > 
> > --- linux-2.6.orig/arch/ia64/Kconfig	2006-01-14 07:16:46.149226872 -0600
> > +++ linux-2.6/arch/ia64/Kconfig	2006-01-14 07:25:02.228853432 -0600
> > @@ -289,6 +289,38 @@ source "mm/Kconfig"
> >  config ARCH_SELECT_MEMORY_MODEL
> >  	def_bool y
> >  
> > +
> > +config PTSHARE_HUGEPAGE
> > +	bool
> > +	depends on PTSHARE && PTSHARE_PMD
> > +	default y
> > +
> 
> You need to thread carefully with hugetlb ptshare on ia64. PTE for
> hugetlb page on ia64 observe full page table levels, not like x86
> that sits in the pmd level.

I did no testing with hugetlb pages.

Robin

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-18  0:17   ` Dave Hansen
@ 2006-01-18  6:11     ` Dave McCracken
  0 siblings, 0 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-18  6:11 UTC (permalink / raw)
  To: Dave Hansen, Robin Holt
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management


--On Tuesday, January 17, 2006 16:17:30 -0800 Dave Hansen
<haveblue@us.ibm.com> wrote:

> On Tue, 2006-01-17 at 17:53 -0600, Robin Holt wrote:
>> This appears to work on ia64 with the attached patch.  Could you
>> send me any test application you think would be helpful for me
>> to verify it is operating correctly?  I could not get the PTSHARE_PUD
>> to compile.  I put _NO_ effort into it.  I found the following line
>> was invalid and quit trying.
> ...
>> +config PTSHARE
>> +	bool "Share page tables"
>> +	default y
>> +	help
>> +	  Turn on sharing of page tables between processes for large shared
>> +	  memory regions.
> ...
> 
> These are probably best put in mm/Kconfig, especially if you're going to
> have verbatim copies in each architecture.

No, the specific variables that should be set are different per
architecture, and some (most) architectures don't yet support shared page
tables.  It's likely some never will.

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-05 16:19 [PATCH/RFC] Shared page tables Dave McCracken
                   ` (3 preceding siblings ...)
  2006-01-17 23:53 ` Robin Holt
@ 2006-01-20 21:24 ` Hugh Dickins
  2006-01-20 21:54   ` Chen, Kenneth W
                     ` (2 more replies)
  4 siblings, 3 replies; 55+ messages in thread
From: Hugh Dickins @ 2006-01-20 21:24 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management

On Thu, 5 Jan 2006, Dave McCracken wrote:
> 
> Here's a new version of my shared page tables patch.
> 
> The primary purpose of sharing page tables is improved performance for
> large applications that share big memory areas between multiple processes.
> It eliminates the redundant page tables and significantly reduces the
> number of minor page faults.  Tests show significant performance
> improvement for large database applications, including those using large
> pages.  There is no measurable performance degradation for small processes.
> 
> This version of the patch uses Hugh's new locking mechanism, extending it
> up the page table tree as far as necessary for proper concurrency control.
> 
> The patch also includes the proper locking for following the vma chains.
> 
> Hugh, I believe I have all the lock points nailed down.  I'd appreciate
> your input on any I might have missed.
> 
> The architectures supported are i386 and x86_64.  I'm working on 64 bit
> ppc, but there are still some issues around proper segment handling that
> need more testing.  This will be available in a separate patch once it's
> solid.
> 
> Dave McCracken

The locking looks much better now, and I like the way i_mmap_lock seems
to fall naturally into place where the pte lock doesn't work.  But still
some raciness noted in comments on patch below.

The main thing I dislike is the
 16 files changed, 937 insertions(+), 69 deletions(-)
(with just i386 and x86_64 included): it's adding more complexity than
I can welcome, and too many unavoidable "if (shared) ... else ..."s.
With significant further change needed, not just adding architectures.

Worthwhile additional complexity?  I'm not the one to judge that.
Brian has posted dramatic improvments (25%, 49%) for the non-huge OLTP,
and yes, it's sickening the amount of memory we're wasting on pagetables
in that particular kind of workload.  Less dramatic (3%, 4%) in the
hugetlb case: and as yet (since last summer even) no profiles to tell
where that improvement actually comes from.

Could it be simpler?  I'd love you to chuck out the pud and pmd sharing
and just stick to sharing the lowest level of pagetable (though your
handling of the different levels is decently symmetrical).  But you
told me last time around that sharing at pmd level is somehow essential
to having it work on ppc64.

An annoying piece of work you have yet to do: getting file_rss working.
You may want to leave that undone, until/unless there's some signal
that mainline really wants shared pagetables: it's off your track,
and probably not worth the effort while prototyping.

I guess what you'll have to do is keep file_rss and anon_rss per
pagetable page (in its struct page) when split ptlock - which would
have the benefit that they can revert back from being atomics in that
case.  Tiresome (prohibitive?) downside being that gatherers of rss
would have to walk the pagetable pages to sum up rss.

Unless we can persuade ourselves that file_rss just isn't worth
supporting any more: tempting for us, but not for those who look at
rss stats.  Even though properly accounted file_rss becomes somewhat
misleading in the shared pagetable world - a process which never
faulted being suddenly graced with thousands of pages.

A more serious omission that you need to fix soon: the TLB flushing
in mm/rmap.c.  In a shared pagetable, you have no record of which
cpus the pte is exposed on (unlike mm->cpu_vm_mask), and so you'll
probably have to flush TLB on all online cpus each time (per pte,
or perhaps can be rearranged to do so only once for the page).  I
don't think you can keep a cpu_vm_mask with the pagetable - we
wouldn't want scheduling a task to have to walk its pagetables.

Well, I say "no record", but I suppose if you change the code somehow
to leave the pte in place when the page is first found, and accumulate
the mm->cpu_vm_masks of all the vmas in which the page is found, and
only clear it at the end.  Not sure if that can fly, or worthwhile.

More comments, mostly trivial, against extracts from the patch below.
(Quite often I comment on one instance, but same applies in similar places.)

> --- 2.6.15/./arch/x86_64/Kconfig	2006-01-02 21:21:10.000000000 -0600
> +++ 2.6.15-shpt/./arch/x86_64/Kconfig	2006-01-03 10:30:01.000000000 -0600
> @@ -267,6 +267,38 @@ config NUMA_EMU
>  	  into virtual nodes when booted with "numa=fake=N", where N is the
>  	  number of nodes. This is only useful for debugging.
>  
> +config PTSHARE
> +	bool "Share page tables"
> +	default y
> +	help
> +	  Turn on sharing of page tables between processes for large shared
> +	  memory regions.
> +
> +menu "Page table levels to share"
> +	depends on PTSHARE
> +
> +config PTSHARE_PTE
> +	bool "Bottom level table (PTE)"
> +	depends on PTSHARE
> +	default y
> +
> +config PTSHARE_PMD
> +	bool "Middle level table (PMD)"
> +	depends on PTSHARE
> +	default y
> +
> +config PTSHARE_PUD
> +	bool "Upper level table (PUD)"
> +	depends on PTSHARE
> +	default n
> +
> +endmenu
> +
> +config PTSHARE_HUGEPAGE
> +	bool
> +	depends on PTSHARE && PTSHARE_PMD
> +	default y
> +

I presume this cornucopia of config options is for ease of testing the
performance effect of different aspects of the patch, and will go away
in due course.

> --- 2.6.15/./include/asm-x86_64/pgtable.h	2006-01-02 21:21:10.000000000 -0600
> +++ 2.6.15-shpt/./include/asm-x86_64/pgtable.h	2006-01-03 10:30:01.000000000 -0600
> @@ -324,7 +321,8 @@ static inline int pmd_large(pmd_t pte) {
>  /*
>   * Level 4 access.
>   */
> -#define pgd_page(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
> +#define pgd_page_kernel(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
> +#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))

Hmm, so pgd_page changes its meaning: is that wise?  Looks like it isn't
used much outside of include/ so perhaps you're okay, and I can see the
attraction of using "_page" for something that supplies a struct page *.
I can also see the attraction of appending "_kernel" to the other,
following pte_offset_kernel, but "_kernel" isn't really appropriate.
Musing aloud, no particular suggestion.

> --- 2.6.15/./include/linux/ptshare.h	1969-12-31 18:00:00.000000000 -0600
> +++ 2.6.15-shpt/./include/linux/ptshare.h	2006-01-03 10:30:01.000000000 -0600
> +
> +#ifdef CONFIG_PTSHARE
> +static inline int pt_is_shared(struct page *page)
> +{
> +	return (page_mapcount(page) > 1);

This looks like its uses will be racy: see further remarks below.

> +extern void pt_unshare_range(struct vm_area_struct *vma, unsigned long address,
> +			     unsigned long end);
> +#else /* CONFIG_PTSHARE */
> +#define	pt_is_shared(page)	(0)
> +#define	pt_increment_share(page)
> +#define	pt_decrement_share(page)
> +#define pt_shareable_vma(vma)	(0)
> +#define	pt_unshare_range(vma, address, end)
> +#endif /* CONFIG_PTSHARE */

Please keep to "#define<space>MACRO<tab(s)definition" throughout:
easiest thing would be to edit the patch itself.

> +static inline void pt_increment_pte(pmd_t pmdval)
> +{
> +	struct page *page;
> +
> +	page = pmd_page(pmdval);
> +	pt_increment_share(page);
> +	return;
> +}

Please leave out all these unnecessary "return;"s.

> +static inline int pt_shareable_pte(struct vm_area_struct *vma, unsigned long address)
> +{
> +	unsigned long base = address & PMD_MASK;
> +	unsigned long end = base + (PMD_SIZE-1);
> +
> +	if ((vma->vm_start <= base) &&
> +	    (vma->vm_end >= end))
> +		return 1;
> +
> +	return 0;
> +}

I think you have an off-by-one on the end test there (and in similar fns):
maybe you should be saying "(vma->vm_end - 1 >= end)".

> --- 2.6.15/./mm/fremap.c	2006-01-02 21:21:10.000000000 -0600
> +++ 2.6.15-shpt/./mm/fremap.c	2006-01-03 10:30:01.000000000 -0600
> @@ -193,6 +194,9 @@ asmlinkage long sys_remap_file_pages(uns
>  				has_write_lock = 1;
>  				goto retry;
>  			}
> +			if (pt_shareable_vma(vma))
> +				pt_unshare_range(vma, vma->vm_start, vma->vm_end);
> +

I do like the way you simply unshare whenever (or almost whenever) in
doubt, and the way unsharing doesn't have to copy the pagetable, because
it's known to be full of straight file mappings which can just be faulted
back.

But it's not robust to be using the "pt_shareable_vma" test for whether
you need to unshare.  Notably the PMD_SIZE part of the pt_shareable_vma
test is good for deciding whether it's worth trying to share, but not so
good for deciding whether in fact we are shared.  By all means have an
inline test for whether it's worth going off to the exceptional unshare
function, but use a safer test.

Even if you do fix up those places which reduce the size of the area
but don't at present unshare.  For example, madvise.c and mlock.c don't
appear in this patch, but madvise and mlock can split_vma bringing it
below PMD_SIZE.  It's not necessarily the case that we should then
unshare, but probably need to at least in the mlock case (since
try_to_unmap might unmap pte from non-VM_LOCKED vma before reaching
VM_LOCKED vma sharing the same pagetable).  You might decide it's
best to unshare in split_vma.

> --- 2.6.15/./mm/memory.c	2006-01-02 21:21:10.000000000 -0600
> +++ 2.6.15-shpt/./mm/memory.c	2006-01-03 10:30:01.000000000 -0600
> @@ -110,14 +113,25 @@ void pmd_clear_bad(pmd_t *pmd)
>   * Note: this doesn't free the actual pages themselves. That
>   * has been handled earlier when unmapping all the memory regions.
>   */
> -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
> +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
> +			   unsigned long addr)
>  {
>  	struct page *page = pmd_page(*pmd);
> -	pmd_clear(pmd);
> -	pte_lock_deinit(page);
> -	pte_free_tlb(tlb, page);
> -	dec_page_state(nr_page_table_pages);
> -	tlb->mm->nr_ptes--;
> +	pmd_t pmdval= *pmd;
> +	int share;
> +
> +	share = pt_is_shared_pte(pmdval);
> +	if (share)
> +		pmd_clear_flush(tlb->mm, addr, pmd);
> +	else
> +		pmd_clear(pmd);
> +
> +	pt_decrement_pte(pmdval);
> +	if (!share) {
> +		pte_lock_deinit(page);
> +		pte_free_tlb(tlb, page);
> +		dec_page_state(nr_page_table_pages);
> +	}
>  }

Here's an example of where using pt_is_shared_pte is racy.
The last two tasks sharing might find page_mapcount > 1 at the same
time, so neither do the freeing.  In this case, haven't looked to
see if it's the same everywhere, you want pt_decrement_pte to
return the atomic_dec_and_test - I think.  Or else be holding
i_mmap_lock here and in more places, but that's probably worse.

>  int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
>  {
>  	struct page *new = pte_alloc_one(mm, address);
> +	spinlock_t *ptl = pmd_lockptr(mm, pmd);
> +
>  	if (!new)
>  		return -ENOMEM;
>  
>  	pte_lock_init(new);
> -	spin_lock(&mm->page_table_lock);
> +	spin_lock(ptl);

Yes, that's nice - IF you really have to cover multiple levels.

>  	if (pmd_present(*pmd)) {	/* Another has populated it */
>  		pte_lock_deinit(new);
>  		pte_free(new);
>  	} else {
> -		mm->nr_ptes++;

So /proc/<pid>/status VmPTE will always be "0 kB"?
I think you ought to try a bit harder there (not sure if it should
include shared pagetables or not), guess best left along with rss.

>  static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
> -		unsigned long addr, unsigned long end)
> +		unsigned long addr, unsigned long end, int shareable)
>  {
>  	pmd_t *src_pmd, *dst_pmd;
>  	unsigned long next;
> @@ -531,16 +579,20 @@ static inline int copy_pmd_range(struct 
>  		next = pmd_addr_end(addr, end);
>  		if (pmd_none_or_clear_bad(src_pmd))
>  			continue;
> -		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
> -						vma, addr, next))
> -			return -ENOMEM;
> +		if (shareable && pt_shareable_pte(vma, addr)) {
> +			pt_copy_pte(dst_mm, dst_pmd, src_pmd);

I'd have preferred not to add the shareable argument at each level,
and work it out here; or is that significantly less efficient?

> @@ -610,6 +676,7 @@ static unsigned long zap_pte_range(struc
>  	do {
>  		pte_t ptent = *pte;
>  		if (pte_none(ptent)) {
> +
>  			(*zap_work)--;
>  			continue;
>  		}

An utterly inexcusable blank line!

> --- 2.6.15/./mm/ptshare.c	1969-12-31 18:00:00.000000000 -0600
> +++ 2.6.15-shpt/./mm/ptshare.c	2006-01-03 10:30:01.000000000 -0600
> +			spude = *spud;

I seem to have a strange aversion to the name "spude",
but you can safely ignore my affliction.

Hugh

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH/RFC] Shared page tables
  2006-01-20 21:24 ` Hugh Dickins
@ 2006-01-20 21:54   ` Chen, Kenneth W
  2006-01-23 17:39   ` Dave McCracken
  2006-01-27 22:50   ` Brian Twichell
  2 siblings, 0 replies; 55+ messages in thread
From: Chen, Kenneth W @ 2006-01-20 21:54 UTC (permalink / raw)
  To: 'Hugh Dickins', Dave McCracken
  Cc: Andrew Morton, Linux Kernel, Linux Memory Management

Hugh Dickins wrote on Friday, January 20, 2006 1:24 PM
> More comments, mostly trivial, against extracts from the patch below.
> (Quite often I comment on one instance, but same applies in similar places.)
> 
> > --- 2.6.15/./include/asm-x86_64/pgtable.h	2006-01-02 21:21:10.000000000 -0600
> > +++ 2.6.15-shpt/./include/asm-x86_64/pgtable.h	2006-01-03 10:30:01.000000000 -0600
> > @@ -324,7 +321,8 @@ static inline int pmd_large(pmd_t pte) {
> >  /*
> >   * Level 4 access.
> >   */
> > -#define pgd_page(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
> > +#define pgd_page_kernel(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
> > +#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
> 
> Hmm, so pgd_page changes its meaning: is that wise?  Looks like it isn't
> used much outside of include/ so perhaps you're okay, and I can see the
> attraction of using "_page" for something that supplies a struct page *.
> I can also see the attraction of appending "_kernel" to the other,
> following pte_offset_kernel, but "_kernel" isn't really appropriate.
> Musing aloud, no particular suggestion.

I was wondering about that myself too:  in current code, pgd_page() and
pud_page() deviate from pmd_page and pte_page in terms of symmetry.  The
first two return virtual address of the pgd_val or pud_val, while pmd_page
and pte_page both return point of struct page of underlying entry.  Is
the asymmetry intentional?


Because the way shared page table uses pgd_page and pud_page, it causes
every arch who wants to enable the feature to redefine pgd_page and
pud_page, not exactly nice though.

- Ken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-20 21:24 ` Hugh Dickins
  2006-01-20 21:54   ` Chen, Kenneth W
@ 2006-01-23 17:39   ` Dave McCracken
  2006-01-23 20:19     ` Benjamin LaHaise
  2006-01-24 17:50     ` Hugh Dickins
  2006-01-27 22:50   ` Brian Twichell
  2 siblings, 2 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-23 17:39 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management


--On Friday, January 20, 2006 21:24:08 +0000 Hugh Dickins
<hugh@veritas.com> wrote:

> The locking looks much better now, and I like the way i_mmap_lock seems
> to fall naturally into place where the pte lock doesn't work.  But still
> some raciness noted in comments on patch below.
> 
> The main thing I dislike is the
>  16 files changed, 937 insertions(+), 69 deletions(-)
> (with just i386 and x86_64 included): it's adding more complexity than
> I can welcome, and too many unavoidable "if (shared) ... else ..."s.
> With significant further change needed, not just adding architectures.

I've received preliminary patches from people trying other architectures.
It looks like for architectures with a standard hardware page table it's
only a couple of lines for each architecture.

> Worthwhile additional complexity?  I'm not the one to judge that.
> Brian has posted dramatic improvments (25%, 49%) for the non-huge OLTP,
> and yes, it's sickening the amount of memory we're wasting on pagetables
> in that particular kind of workload.  Less dramatic (3%, 4%) in the
> hugetlb case: and as yet (since last summer even) no profiles to tell
> where that improvement actually comes from.

I'll look into getting some profiles.

> Could it be simpler?  I'd love you to chuck out the pud and pmd sharing
> and just stick to sharing the lowest level of pagetable (though your
> handling of the different levels is decently symmetrical).  But you
> told me last time around that sharing at pmd level is somehow essential
> to having it work on ppc64.

You're probably right about the pud sharing.  I did it for completeness,
but it's probably useless.  There's not enough win to be worth the extra
cost of including it.

The pmd level is important for ppc because it works in segments, which are
256M in size and consist of a full pmd page.  The current implementation of
the way ppc loads its tlb doesn't allow sharing at smaller than segment
size.  I currently also enable pmd sharing for x86_64, but I'm not sure how
much of a win it is.  I use it to share pmd pages for hugetlb, as well.

> An annoying piece of work you have yet to do: getting file_rss working.
> You may want to leave that undone, until/unless there's some signal
> that mainline really wants shared pagetables: it's off your track,
> and probably not worth the effort while prototyping.
> 
> I guess what you'll have to do is keep file_rss and anon_rss per
> pagetable page (in its struct page) when split ptlock - which would
> have the benefit that they can revert back from being atomics in that
> case.  Tiresome (prohibitive?) downside being that gatherers of rss
> would have to walk the pagetable pages to sum up rss.
> 
> Unless we can persuade ourselves that file_rss just isn't worth
> supporting any more: tempting for us, but not for those who look at
> rss stats.  Even though properly accounted file_rss becomes somewhat
> misleading in the shared pagetable world - a process which never
> faulted being suddenly graced with thousands of pages.

Yep, I punted on it for now.  I've had a couple of thoughts in that
direction, but nothing complete enough to code.

> A more serious omission that you need to fix soon: the TLB flushing
> in mm/rmap.c.  In a shared pagetable, you have no record of which
> cpus the pte is exposed on (unlike mm->cpu_vm_mask), and so you'll
> probably have to flush TLB on all online cpus each time (per pte,
> or perhaps can be rearranged to do so only once for the page).  I
> don't think you can keep a cpu_vm_mask with the pagetable - we
> wouldn't want scheduling a task to have to walk its pagetables.
> 
> Well, I say "no record", but I suppose if you change the code somehow
> to leave the pte in place when the page is first found, and accumulate
> the mm->cpu_vm_masks of all the vmas in which the page is found, and
> only clear it at the end.  Not sure if that can fly, or worthwhile.

Sigh.  You're right.  In your earlier comments I thought you were
questioning the locking, which didn't seem like an issue to me.  But yeah,
the flushing is a problem.  I do like your thoughts about collecting it up
and doing it as a batch.  I'll chase that some.

>> --- 2.6.15/./arch/x86_64/Kconfig	2006-01-02 21:21:10.000000000 -0600
>> +++ 2.6.15-shpt/./arch/x86_64/Kconfig	2006-01-03 10:30:01.000000000 -0600
>> +config PTSHARE
>> +	bool "Share page tables"
>> +	default y
>> +	help
>> +	  Turn on sharing of page tables between processes for large shared
>> +	  memory regions.
>>
>> (...) more config options.
> 
> I presume this cornucopia of config options is for ease of testing the
> performance effect of different aspects of the patch, and will go away
> in due course.

Most of them are to select different subsets of pt sharing based on
architecture, and aren't intended to be user selectable.

>> --- 2.6.15/./include/asm-x86_64/pgtable.h	2006-01-02 21:21:10.000000000
>> -0600 +++ 2.6.15-shpt/./include/asm-x86_64/pgtable.h	2006-01-03
>> 10:30:01.000000000 -0600 @@ -324,7 +321,8 @@ static inline int
>> pmd_large(pmd_t pte) {
>>  /*
>>   * Level 4 access.
>>   */
>> -#define pgd_page(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd)
>> & PTE_MASK)) +#define pgd_page_kernel(pgd) ((unsigned long)
>> __va((unsigned long)pgd_val(pgd) & PTE_MASK)) +#define pgd_page(pgd)
>> 	(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
> 
> Hmm, so pgd_page changes its meaning: is that wise?  Looks like it isn't
> used much outside of include/ so perhaps you're okay, and I can see the
> attraction of using "_page" for something that supplies a struct page *.
> I can also see the attraction of appending "_kernel" to the other,
> following pte_offset_kernel, but "_kernel" isn't really appropriate.
> Musing aloud, no particular suggestion.

I needed a function that returns a struct page for pgd and pud, defined in
each architecture.  I decided the simplest way was to redefine pgd_page and
pud_page to match pmd_page and pte_page.  Both functions are pretty much
used one place per architecture, so the change is trivial.  I could come up
with new functions instead if you think it's an issue.  I do have a bit of
a fetish about symmetry across levels :)

>> --- 2.6.15/./include/linux/ptshare.h	1969-12-31 18:00:00.000000000 -0600
>> +++ 2.6.15-shpt/./include/linux/ptshare.h	2006-01-03 10:30:01.000000000
>> -0600 +
>> +#ifdef CONFIG_PTSHARE
>> +static inline int pt_is_shared(struct page *page)
>> +{
>> +	return (page_mapcount(page) > 1);
> 
> This looks like its uses will be racy: see further remarks below.

Hmm, good point.  Originally most of the uses were under locks that went
away with the recent changes.  I'll need to revisit all that.

>> +extern void pt_unshare_range(struct vm_area_struct *vma, unsigned long
>> address, +			     unsigned long end);
>> +#else /* CONFIG_PTSHARE */
>> +#define	pt_is_shared(page)	(0)
>> +#define	pt_increment_share(page)
>> +#define	pt_decrement_share(page)
>> +#define pt_shareable_vma(vma)	(0)
>> +#define	pt_unshare_range(vma, address, end)
>> +#endif /* CONFIG_PTSHARE */
> 
> Please keep to "#define<space>MACRO<tab(s)definition" throughout:
> easiest thing would be to edit the patch itself.

Sorry.  Done.  I thought the standard was "#define<TAB>" like all the other
C code I've ever seen.  I didn't realize Linux does it different.

>> +static inline void pt_increment_pte(pmd_t pmdval)
>> +{
>> +	struct page *page;
>> +
>> +	page = pmd_page(pmdval);
>> +	pt_increment_share(page);
>> +	return;
>> +}
> 
> Please leave out all these unnecessary "return;"s.

Done.

>> +static inline int pt_shareable_pte(struct vm_area_struct *vma, unsigned
>> long address) +{
>> +	unsigned long base = address & PMD_MASK;
>> +	unsigned long end = base + (PMD_SIZE-1);
>> +
>> +	if ((vma->vm_start <= base) &&
>> +	    (vma->vm_end >= end))
>> +		return 1;
>> +
>> +	return 0;
>> +}
> 
> I think you have an off-by-one on the end test there (and in similar fns):
> maybe you should be saying "(vma->vm_end - 1 >= end)".

Hmm... you may be right.  I'll stare at it some more.

>> --- 2.6.15/./mm/fremap.c	2006-01-02 21:21:10.000000000 -0600
>> +++ 2.6.15-shpt/./mm/fremap.c	2006-01-03 10:30:01.000000000 -0600
>> @@ -193,6 +194,9 @@ asmlinkage long sys_remap_file_pages(uns
>>  				has_write_lock = 1;
>>  				goto retry;
>>  			}
>> +			if (pt_shareable_vma(vma))
>> +				pt_unshare_range(vma, vma->vm_start, vma->vm_end);
>> +
> 
> I do like the way you simply unshare whenever (or almost whenever) in
> doubt, and the way unsharing doesn't have to copy the pagetable, because
> it's known to be full of straight file mappings which can just be faulted
> back.
> 
> But it's not robust to be using the "pt_shareable_vma" test for whether
> you need to unshare.  Notably the PMD_SIZE part of the pt_shareable_vma
> test is good for deciding whether it's worth trying to share, but not so
> good for deciding whether in fact we are shared.  By all means have an
> inline test for whether it's worth going off to the exceptional unshare
> function, but use a safer test.
> 
> Even if you do fix up those places which reduce the size of the area
> but don't at present unshare.  For example, madvise.c and mlock.c don't
> appear in this patch, but madvise and mlock can split_vma bringing it
> below PMD_SIZE.  It's not necessarily the case that we should then
> unshare, but probably need to at least in the mlock case (since
> try_to_unmap might unmap pte from non-VM_LOCKED vma before reaching
> VM_LOCKED vma sharing the same pagetable).  You might decide it's
> best to unshare in split_vma.

I'll have to think that through a bit more.  I see your point.

>> --- 2.6.15/./mm/memory.c	2006-01-02 21:21:10.000000000 -0600
>> +++ 2.6.15-shpt/./mm/memory.c	2006-01-03 10:30:01.000000000 -0600
>> @@ -110,14 +113,25 @@ void pmd_clear_bad(pmd_t *pmd)
>>   * Note: this doesn't free the actual pages themselves. That
>>   * has been handled earlier when unmapping all the memory regions.
>>   */
>> -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
>> +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
>> +			   unsigned long addr)
>>  {
>>  	struct page *page = pmd_page(*pmd);
>> -	pmd_clear(pmd);
>> -	pte_lock_deinit(page);
>> -	pte_free_tlb(tlb, page);
>> -	dec_page_state(nr_page_table_pages);
>> -	tlb->mm->nr_ptes--;
>> +	pmd_t pmdval= *pmd;
>> +	int share;
>> +
>> +	share = pt_is_shared_pte(pmdval);
>> +	if (share)
>> +		pmd_clear_flush(tlb->mm, addr, pmd);
>> +	else
>> +		pmd_clear(pmd);
>> +
>> +	pt_decrement_pte(pmdval);
>> +	if (!share) {
>> +		pte_lock_deinit(page);
>> +		pte_free_tlb(tlb, page);
>> +		dec_page_state(nr_page_table_pages);
>> +	}
>>  }
> 
> Here's an example of where using pt_is_shared_pte is racy.
> The last two tasks sharing might find page_mapcount > 1 at the same
> time, so neither do the freeing.  In this case, haven't looked to
> see if it's the same everywhere, you want pt_decrement_pte to
> return the atomic_dec_and_test - I think.  Or else be holding
> i_mmap_lock here and in more places, but that's probably worse.

Yep.  Needs more thought about proper locking/sequencing.  I have a couple
ideas.

>>  	if (pmd_present(*pmd)) {	/* Another has populated it */
>>  		pte_lock_deinit(new);
>>  		pte_free(new);
>>  	} else {
>> -		mm->nr_ptes++;
> 
> So /proc/<pid>/status VmPTE will always be "0 kB"?
> I think you ought to try a bit harder there (not sure if it should
> include shared pagetables or not), guess best left along with rss.

Yep.  I didn't see a simple way of keeping that correct.

>>  static inline int copy_pmd_range(struct mm_struct *dst_mm, struct
>>  mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct
>>  		vm_area_struct *vma,
>> -		unsigned long addr, unsigned long end)
>> +		unsigned long addr, unsigned long end, int shareable)
>>  {
>>  	pmd_t *src_pmd, *dst_pmd;
>>  	unsigned long next;
>> @@ -531,16 +579,20 @@ static inline int copy_pmd_range(struct 
>>  		next = pmd_addr_end(addr, end);
>>  		if (pmd_none_or_clear_bad(src_pmd))
>>  			continue;
>> -		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
>> -						vma, addr, next))
>> -			return -ENOMEM;
>> +		if (shareable && pt_shareable_pte(vma, addr)) {
>> +			pt_copy_pte(dst_mm, dst_pmd, src_pmd);
> 
> I'd have preferred not to add the shareable argument at each level,
> and work it out here; or is that significantly less efficient?

My gut feeling is that the vma-level tests for shareability are significant
enough that we only want to do them once, then pass the result down the
call  stack.  I could change it if you disagree about the relative cost.

>> @@ -610,6 +676,7 @@ static unsigned long zap_pte_range(struc
>>  	do {
>>  		pte_t ptent = *pte;
>>  		if (pte_none(ptent)) {
>> +
>>  			(*zap_work)--;
>>  			continue;
>>  		}
> 
> An utterly inexcusable blank line!

Yep.  Fixed.  Must have been an interim editing error.

>> --- 2.6.15/./mm/ptshare.c	1969-12-31 18:00:00.000000000 -0600
>> +++ 2.6.15-shpt/./mm/ptshare.c	2006-01-03 10:30:01.000000000 -0600
>> +			spude = *spud;
> 
> I seem to have a strange aversion to the name "spude",
> but you can safely ignore my affliction.

Must be one of those Brit things :)

Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-23 17:39   ` Dave McCracken
@ 2006-01-23 20:19     ` Benjamin LaHaise
  2006-01-24 17:50     ` Hugh Dickins
  1 sibling, 0 replies; 55+ messages in thread
From: Benjamin LaHaise @ 2006-01-23 20:19 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management

On Mon, Jan 23, 2006 at 11:39:07AM -0600, Dave McCracken wrote:
> The pmd level is important for ppc because it works in segments, which are
> 256M in size and consist of a full pmd page.  The current implementation of
> the way ppc loads its tlb doesn't allow sharing at smaller than segment
> size.  I currently also enable pmd sharing for x86_64, but I'm not sure how
> much of a win it is.  I use it to share pmd pages for hugetlb, as well.

For EM64T at least, pmd sharing is definately worthwhile.  pud sharing is 
a bit more out there, but would still help database workloads.  In the case 
of a thousand connections (which is not unreasonable for some users) you 
save 4MB of memory and reduce the cache footprint of those saved 4MB of 
pages to 4KB.  Ideally the code can be structured to compile out to nothing 
if not needed.

Of course, once we have shared page tables it makes great sense to try to 
get userland to align code segments and data segments to seperate puds so 
that we could share all the page tables for common system libraries amongst 
processes...

		-ben
-- 
"Ladies and gentlemen, I'm sorry to interrupt, but the police are here 
and they've asked us to stop the party."  Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-17 23:53 ` Robin Holt
  2006-01-18  0:17   ` Dave Hansen
  2006-01-18  1:27   ` Chen, Kenneth W
@ 2006-01-23 23:58   ` Ray Bryant
  2006-01-24  0:16     ` Ray Bryant
  2006-01-24  0:19     ` Dave McCracken
  2 siblings, 2 replies; 55+ messages in thread
From: Ray Bryant @ 2006-01-23 23:58 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

On Tuesday 17 January 2006 17:53, Robin Holt wrote:
> Dave,
>
> This appears to work on ia64 with the attached patch.  Could you
> send me any test application you think would be helpful for me
> to verify it is operating correctly?  
<snip>

Dave,

Like Robin, I would appreciate a test application, or at least a description 
of how to write one, or some other trick to figure out if this is working.

I scanned through this thread looking for a test application, and didn't see 
one.   Is it sufficient just to create a large shared read-only mmap'd file 
and share it across a bunch of process to get this code invoked?   How large 
of a file is needed (on x86_64), assuming that we just turn on the pte level 
of sharing?   And what kind of alignment constraints do we end up under in 
order to make the sharing happen?   (My guess would be that there aren't any 
such constraints (well, page alignment.. :-)  if we are just sharing pte's.)

I turned on the PT_DEBUG stuff, but thus far have found no evidence of pte 
sharing actually occurring in a normal system boot.  I'm surprised by that as 
I (naively?) would have expected shared libraries to use shared ptes.

Best Regards,
-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-23 23:58   ` Ray Bryant
@ 2006-01-24  0:16     ` Ray Bryant
  2006-01-24  0:39       ` Andi Kleen
  2006-01-24  0:19     ` Dave McCracken
  1 sibling, 1 reply; 55+ messages in thread
From: Ray Bryant @ 2006-01-24  0:16 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

On Monday 23 January 2006 17:58, Ray Bryant wrote:
<snip>

> ... And what kind of alignment constraints do we end up
> under in order to make the sharing happen?   (My guess would be that there
> aren't any such constraints (well, page alignment.. :-)  if we are just
> sharing pte's.)
>

Oh, obviously that is not right as you have to share full pte pages.   So on 
x86_64 I'm guessing one needs 2MB alignment in order to get the sharing to
kick in, since a pte page maps 512 pages of 4 KB each.

Best Regards,
-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-23 23:58   ` Ray Bryant
  2006-01-24  0:16     ` Ray Bryant
@ 2006-01-24  0:19     ` Dave McCracken
  2006-01-24  0:46       ` Ray Bryant
  2006-01-24 23:43       ` Ray Bryant
  1 sibling, 2 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-24  0:19 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management



--On Monday, January 23, 2006 17:58:08 -0600 Ray Bryant
<raybry@mpdtxmail.amd.com> wrote:

> Like Robin, I would appreciate a test application, or at least a
> description  of how to write one, or some other trick to figure out if
> this is working.

My apologies.  I do have a small test program and intended to clean it up
to send to Robin, but got sidetracked (it's fugly at the moment).  I'll see
about getting it a bit more presentable.

> I scanned through this thread looking for a test application, and didn't
> see  one.   Is it sufficient just to create a large shared read-only
> mmap'd file  and share it across a bunch of process to get this code
> invoked?   How large  of a file is needed (on x86_64), assuming that we
> just turn on the pte level  of sharing?   And what kind of alignment
> constraints do we end up under in  order to make the sharing happen?
> (My guess would be that there aren't any  such constraints (well, page
> alignment.. :-)  if we are just sharing pte's.)

The basic rule for pte sharing is that some portion of a memory region must
span an entire pte page.  For i386 and x96_64 that would be 2 meg.  The
region must either be read-only or marked to be shared if it is writeable.

The code does opportunistically look for any pte page that is fully within
a shareable vma, and will share if it finds one.

Oh, and one more caveat.  The region must be mapped to the same address in
each process.

> I turned on the PT_DEBUG stuff, but thus far have found no evidence of
> pte  sharing actually occurring in a normal system boot.  I'm surprised
> by that as  I (naively?) would have expected shared libraries to use
> shared ptes.

Most system software, including the shared libraries, don't have any
regions that are big enough for sharing (the text section for libc, for
example, is about 1.5 meg).

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:16     ` Ray Bryant
@ 2006-01-24  0:39       ` Andi Kleen
  2006-01-24  0:51         ` Dave McCracken
  2006-01-24  0:53         ` Ray Bryant
  0 siblings, 2 replies; 55+ messages in thread
From: Andi Kleen @ 2006-01-24  0:39 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Dave McCracken, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management

On Tuesday 24 January 2006 01:16, Ray Bryant wrote:
> On Monday 23 January 2006 17:58, Ray Bryant wrote:
> <snip>
> 
> > ... And what kind of alignment constraints do we end up
> > under in order to make the sharing happen?   (My guess would be that there
> > aren't any such constraints (well, page alignment.. :-)  if we are just
> > sharing pte's.)
> >
> 
> Oh, obviously that is not right as you have to share full pte pages.   So on 
> x86_64 I'm guessing one needs 2MB alignment in order to get the sharing to
> kick in, since a pte page maps 512 pages of 4 KB each.

The new randomized mmaps will likely actively sabotate such alignment. I just
added them for x86-64.

-Andi

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:19     ` Dave McCracken
@ 2006-01-24  0:46       ` Ray Bryant
  2006-01-24 23:43       ` Ray Bryant
  1 sibling, 0 replies; 55+ messages in thread
From: Ray Bryant @ 2006-01-24  0:46 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

On Monday 23 January 2006 18:19, Dave McCracken wrote:
<snip>
>
> The basic rule for pte sharing is that some portion of a memory region must
> span an entire pte page.  For i386 and x96_64 that would be 2 meg.  The
> region must either be read-only or marked to be shared if it is writeable.
>

Yeah, I figured that out just after hitting "send" on that first note.  :-(

> The code does opportunistically look for any pte page that is fully within
> a shareable vma, and will share if it finds one.
>
> Oh, and one more caveat.  The region must be mapped to the same address in
> each process.
>
> > I turned on the PT_DEBUG stuff, but thus far have found no evidence of
> > pte  sharing actually occurring in a normal system boot.  I'm surprised
> > by that as  I (naively?) would have expected shared libraries to use
> > shared ptes.
>

OK, with those guidelines I can put together a test program pretty quickly.
If you have one handy that would be fine, but don't put a lot of effort into 
it.

Thanks,

> Most system software, including the shared libraries, don't have any
> regions that are big enough for sharing (the text section for libc, for
> example, is about 1.5 meg).
>

Ah, that explains that then.

> Dave McCracken

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:39       ` Andi Kleen
@ 2006-01-24  0:51         ` Dave McCracken
  2006-01-24  1:11           ` Andi Kleen
  2006-01-24  0:53         ` Ray Bryant
  1 sibling, 1 reply; 55+ messages in thread
From: Dave McCracken @ 2006-01-24  0:51 UTC (permalink / raw)
  To: Andi Kleen, Ray Bryant
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management


--On Tuesday, January 24, 2006 01:39:46 +0100 Andi Kleen <ak@suse.de> wrote:

>> Oh, obviously that is not right as you have to share full pte pages.
>> So on  x86_64 I'm guessing one needs 2MB alignment in order to get the
>> sharing to kick in, since a pte page maps 512 pages of 4 KB each.
> 
> The new randomized mmaps will likely actively sabotate such alignment. I
> just added them for x86-64.

Given that my current patch requires memory regions to be mapped at the
same address, randomized mmaps won't really make it any worse.  It's
unlikely mmap done independently in separate processes will land on the
same address.  Most of the large OLTP applications use fixed address
mapping for their large shared regions.

This also should mean large text regions will be shared, which should be a
win for big programs.

I've been kicking around some ideas on how to allow sharing of mappings at
different addresses as long as the alignment is the same, but don't have
the details worked out.  I figure it's more important to get the basic part
working first :)

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:39       ` Andi Kleen
  2006-01-24  0:51         ` Dave McCracken
@ 2006-01-24  0:53         ` Ray Bryant
  2006-01-24  1:00           ` Dave McCracken
  2006-01-24  1:10           ` Andi Kleen
  1 sibling, 2 replies; 55+ messages in thread
From: Ray Bryant @ 2006-01-24  0:53 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Dave McCracken, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management

On Monday 23 January 2006 18:39, Andi Kleen wrote:
> On Tuesday 24 January 2006 01:16, Ray Bryant wrote:
> > On Monday 23 January 2006 17:58, Ray Bryant wrote:
> > <snip>
> >
> > > ... And what kind of alignment constraints do we end up
> > > under in order to make the sharing happen?   (My guess would be that
> > > there aren't any such constraints (well, page alignment.. :-)  if we
> > > are just sharing pte's.)
> >
> > Oh, obviously that is not right as you have to share full pte pages.   So
> > on x86_64 I'm guessing one needs 2MB alignment in order to get the
> > sharing to kick in, since a pte page maps 512 pages of 4 KB each.
>
> The new randomized mmaps will likely actively sabotate such alignment. I
> just added them for x86-64.
>
> -Andi

Hmmm, does that mean there is a fundamental conflict between the desire to 
share pte's and getting good cache coloring behavior?

Isn't it the case that if the region is large enough (say >> 2MB), that 
randomized mmaps will just cause the first partial page of pte's to not be 
shareable, and as soon as we have a full pte page mapped into the file that 
the full pte pages will be shareable, etc, until the last (partial) pte page 
is not shareable?

> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:53         ` Ray Bryant
@ 2006-01-24  1:00           ` Dave McCracken
  2006-01-24  1:10           ` Andi Kleen
  1 sibling, 0 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-24  1:00 UTC (permalink / raw)
  To: Ray Bryant, Andi Kleen
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management


--On Monday, January 23, 2006 18:53:54 -0600 Ray Bryant
<raybry@mpdtxmail.amd.com> wrote:

> Isn't it the case that if the region is large enough (say >> 2MB), that 
> randomized mmaps will just cause the first partial page of pte's to not
> be  shareable, and as soon as we have a full pte page mapped into the
> file that  the full pte pages will be shareable, etc, until the last
> (partial) pte page  is not shareable?

Yes, but only if the 2MB alignment is the same.  And with the current
version of my patch the region has to be mapped at the same address in both
processes.  Sorry.

Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:53         ` Ray Bryant
  2006-01-24  1:00           ` Dave McCracken
@ 2006-01-24  1:10           ` Andi Kleen
  2006-01-24  1:23             ` Benjamin LaHaise
  2006-01-24  7:06             ` Arjan van de Ven
  1 sibling, 2 replies; 55+ messages in thread
From: Andi Kleen @ 2006-01-24  1:10 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Dave McCracken, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management

On Tuesday 24 January 2006 01:53, Ray Bryant wrote:
> On Monday 23 January 2006 18:39, Andi Kleen wrote:
> > On Tuesday 24 January 2006 01:16, Ray Bryant wrote:
> > > On Monday 23 January 2006 17:58, Ray Bryant wrote:
> > > <snip>
> > >
> > > > ... And what kind of alignment constraints do we end up
> > > > under in order to make the sharing happen?   (My guess would be that
> > > > there aren't any such constraints (well, page alignment.. :-)  if we
> > > > are just sharing pte's.)
> > >
> > > Oh, obviously that is not right as you have to share full pte pages.   So
> > > on x86_64 I'm guessing one needs 2MB alignment in order to get the
> > > sharing to kick in, since a pte page maps 512 pages of 4 KB each.
> >
> > The new randomized mmaps will likely actively sabotate such alignment. I
> > just added them for x86-64.
> >
> > -Andi
> 
> Hmmm, does that mean there is a fundamental conflict between the desire to 
> share pte's and getting good cache coloring behavior?

The randomization is not for cache coloring, but for security purposes
(except for the old very small stack randomization that was used
to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
mmap made much difference because it's page aligned and at least
on x86 the L2 and larger caches are usually PI.

> Isn't it the case that if the region is large enough (say >> 2MB), that 
> randomized mmaps will just cause the first partial page of pte's to not be 
> shareable, and as soon as we have a full pte page mapped into the file that 
> the full pte pages will be shareable, etc, until the last (partial) pte page 
> is not shareable?

They need the same alignment and with the random bits in there it's unlikely
to be ever the same.

-Andi

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:51         ` Dave McCracken
@ 2006-01-24  1:11           ` Andi Kleen
  2006-01-24  1:26             ` Dave McCracken
  0 siblings, 1 reply; 55+ messages in thread
From: Andi Kleen @ 2006-01-24  1:11 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Ray Bryant, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management

On Tuesday 24 January 2006 01:51, Dave McCracken wrote:
>  Most of the large OLTP applications use fixed address
> mapping for their large shared regions.

Really? That sounds like a quite bad idea because it can easily break
if something changes in the way virtual memory is laid out (which
has happened - e.g. movement to 4level page tables on x86-64 and now
randomized mmaps) 

I don't think we should encourage such unportable behaviour.

-Andi

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  1:10           ` Andi Kleen
@ 2006-01-24  1:23             ` Benjamin LaHaise
  2006-01-24  1:38               ` Andi Kleen
  2006-01-24  7:06             ` Arjan van de Ven
  1 sibling, 1 reply; 55+ messages in thread
From: Benjamin LaHaise @ 2006-01-24  1:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ray Bryant, Dave McCracken, Robin Holt, Hugh Dickins,
	Linux Kernel, Linux Memory Management

On Tue, Jan 24, 2006 at 02:10:03AM +0100, Andi Kleen wrote:
> The randomization is not for cache coloring, but for security purposes
> (except for the old very small stack randomization that was used
> to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
> mmap made much difference because it's page aligned and at least
> on x86 the L2 and larger caches are usually PI.

Actually, does this even affect executable segments?  Iirc, prelinking 
already results in executables being mapped at the same physical offset 
across binaries in a given system.  An strace seems to confirm that.

		-ben
-- 
"Ladies and gentlemen, I'm sorry to interrupt, but the police are here 
and they've asked us to stop the party."  Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  1:11           ` Andi Kleen
@ 2006-01-24  1:26             ` Dave McCracken
  0 siblings, 0 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-24  1:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ray Bryant, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management


--On Tuesday, January 24, 2006 02:11:58 +0100 Andi Kleen <ak@suse.de> wrote:

> Really? That sounds like a quite bad idea because it can easily break
> if something changes in the way virtual memory is laid out (which
> has happened - e.g. movement to 4level page tables on x86-64 and now
> randomized mmaps) 
> 
> I don't think we should encourage such unportable behaviour.

I haven't looked into how they do it.  It could be as simple as mapping the
memory region, then forking all the processes that use it.  Or they could
be communicating the mapped address to the other processes dynamically via
some other mechanism.  All I know is the memory ends up being mapped at the
same address in all processes.

Or are you saying using the same address is a bad thing even if it's
determined at runtime?

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  1:23             ` Benjamin LaHaise
@ 2006-01-24  1:38               ` Andi Kleen
  2006-01-24  7:08                 ` Arjan van de Ven
  0 siblings, 1 reply; 55+ messages in thread
From: Andi Kleen @ 2006-01-24  1:38 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Ray Bryant, Dave McCracken, Robin Holt, Hugh Dickins,
	Linux Kernel, Linux Memory Management

On Tuesday 24 January 2006 02:23, Benjamin LaHaise wrote:
> On Tue, Jan 24, 2006 at 02:10:03AM +0100, Andi Kleen wrote:
> > The randomization is not for cache coloring, but for security purposes
> > (except for the old very small stack randomization that was used
> > to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
> > mmap made much difference because it's page aligned and at least
> > on x86 the L2 and larger caches are usually PI.
> 
> Actually, does this even affect executable segments?  Iirc, prelinking 
> already results in executables being mapped at the same physical offset 
> across binaries in a given system.  An strace seems to confirm that.

Shared libraries should be affected. And prelink is not always used.

-Andi

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  1:10           ` Andi Kleen
  2006-01-24  1:23             ` Benjamin LaHaise
@ 2006-01-24  7:06             ` Arjan van de Ven
  2006-01-24  7:18               ` Andi Kleen
  2006-01-24 14:48               ` Dave McCracken
  1 sibling, 2 replies; 55+ messages in thread
From: Arjan van de Ven @ 2006-01-24  7:06 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ray Bryant, Dave McCracken, Robin Holt, Hugh Dickins,
	Linux Kernel, Linux Memory Management


> The randomization is not for cache coloring, but for security purposes
> (except for the old very small stack randomization that was used
> to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
> mmap made much difference because it's page aligned and at least
> on x86 the L2 and larger caches are usually PI.

randomization to a large degree is more important between machines than
within the same machine (except for setuid stuff but lets call that a
special category for now). Imo prelink is one of the better bets to get
"all code for a binary/lib on the same 2 mb page", all distros ship
prelink nowadays anyway (it's too much of a win that nobody can afford
to not ship it ;) and within prelink the balance between randomization
for security and 2Mb sharing can be struck best. In fact it needs know
about the 2Mb thing anyway to place it there properly and for all
binaries... the kernel just can't do that.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  1:38               ` Andi Kleen
@ 2006-01-24  7:08                 ` Arjan van de Ven
  0 siblings, 0 replies; 55+ messages in thread
From: Arjan van de Ven @ 2006-01-24  7:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Benjamin LaHaise, Ray Bryant, Dave McCracken, Robin Holt,
	Hugh Dickins, Linux Kernel, Linux Memory Management

On Tue, 2006-01-24 at 02:38 +0100, Andi Kleen wrote:
> On Tuesday 24 January 2006 02:23, Benjamin LaHaise wrote:
> > On Tue, Jan 24, 2006 at 02:10:03AM +0100, Andi Kleen wrote:
> > > The randomization is not for cache coloring, but for security purposes
> > > (except for the old very small stack randomization that was used
> > > to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
> > > mmap made much difference because it's page aligned and at least
> > > on x86 the L2 and larger caches are usually PI.
> > 
> > Actually, does this even affect executable segments?  Iirc, prelinking 
> > already results in executables being mapped at the same physical offset 
> > across binaries in a given system.  An strace seems to confirm that.
> 
> Shared libraries should be affected. And prelink is not always used.

without prelink you have almost no sharing of the exact same locations,
since each binary will link to different libs and in different orders,
so any sharing you get is purely an accident (with glibc being maybe an
exception since everything will link that first). This
sharing-of-code-pagetables seems to really depend on prelink to work
well, regardless of randomization


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  7:06             ` Arjan van de Ven
@ 2006-01-24  7:18               ` Andi Kleen
  2006-01-27 18:16                 ` Martin Bligh
  2006-02-01  9:49                 ` Nick Piggin
  2006-01-24 14:48               ` Dave McCracken
  1 sibling, 2 replies; 55+ messages in thread
From: Andi Kleen @ 2006-01-24  7:18 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ray Bryant, Dave McCracken, Robin Holt, Hugh Dickins,
	Linux Kernel, Linux Memory Management

On Tuesday 24 January 2006 08:06, Arjan van de Ven wrote:
> 
> > The randomization is not for cache coloring, but for security purposes
> > (except for the old very small stack randomization that was used
> > to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
> > mmap made much difference because it's page aligned and at least
> > on x86 the L2 and larger caches are usually PI.
> 
> randomization to a large degree is more important between machines than
> within the same machine (except for setuid stuff but lets call that a
> special category for now). Imo prelink is one of the better bets to get
> "all code for a binary/lib on the same 2 mb page",

Probably yes.

> all distros ship 
> prelink nowadays anyway 

SUSE doesn't use it.

> (it's too much of a win that nobody can afford 
> to not ship it ;) 

KDE and some other people disagree on that. 

> and within prelink the balance between randomization 
> for security and 2Mb sharing can be struck best. In fact it needs know
> about the 2Mb thing anyway to place it there properly and for all
> binaries... the kernel just can't do that.

Well, we first have to figure out if the shared page tables
are really worth all the ugly code, nasty locking and other problems 
(inefficient TLB flush etc.) I personally would prefer
to make large pages work better before going down that path.

-Andi
 

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  7:06             ` Arjan van de Ven
  2006-01-24  7:18               ` Andi Kleen
@ 2006-01-24 14:48               ` Dave McCracken
  2006-01-24 14:56                 ` Arjan van de Ven
  1 sibling, 1 reply; 55+ messages in thread
From: Dave McCracken @ 2006-01-24 14:48 UTC (permalink / raw)
  To: Arjan van de Ven, Andi Kleen
  Cc: Ray Bryant, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management


--On Tuesday, January 24, 2006 08:06:37 +0100 Arjan van de Ven
<arjan@infradead.org> wrote:

>> The randomization is not for cache coloring, but for security purposes
>> (except for the old very small stack randomization that was used
>> to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
>> mmap made much difference because it's page aligned and at least
>> on x86 the L2 and larger caches are usually PI.
> 
> randomization to a large degree is more important between machines than
> within the same machine (except for setuid stuff but lets call that a
> special category for now). Imo prelink is one of the better bets to get
> "all code for a binary/lib on the same 2 mb page", all distros ship
> prelink nowadays anyway (it's too much of a win that nobody can afford
> to not ship it ;) and within prelink the balance between randomization
> for security and 2Mb sharing can be struck best. In fact it needs know
> about the 2Mb thing anyway to place it there properly and for all
> binaries... the kernel just can't do that.

Currently libc and most other system libraries have text segments smaller
than 2MB, so they won't share anyway.  We can't even coalesce adjacent
libraries since the linker puts unshareable data space after each library.

The main win for text sharing is applications with large text in the
program itself.  As long as that's loaded at the same address we'll share
page tables for it.

I thought the main security benefit for randomization of mapped regions was
for writeable data space anyway.  Isn't text space protected by not being
writeable?

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24 14:48               ` Dave McCracken
@ 2006-01-24 14:56                 ` Arjan van de Ven
  0 siblings, 0 replies; 55+ messages in thread
From: Arjan van de Ven @ 2006-01-24 14:56 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Andi Kleen, Ray Bryant, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management


> I thought the main security benefit for randomization of mapped regions was
> for writeable data space anyway.  Isn't text space protected by not being
> writeable?

nope that's not correct.
Aside from stack randomization, randomization is to a large degree
intended to make the return-to-libc kind of attacks harder, by not
giving attackers a fixed address to return to. That's all code, nothing
to do with data.




^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-23 17:39   ` Dave McCracken
  2006-01-23 20:19     ` Benjamin LaHaise
@ 2006-01-24 17:50     ` Hugh Dickins
  2006-01-24 18:07       ` Dave McCracken
  1 sibling, 1 reply; 55+ messages in thread
From: Hugh Dickins @ 2006-01-24 17:50 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management

On Mon, 23 Jan 2006, Dave McCracken wrote:
> --On Friday, January 20, 2006 21:24:08 +0000 Hugh Dickins
> <hugh@veritas.com> wrote:
> 
> I'll look into getting some profiles.

Thanks, that will help everyone to judge the performance/complexity better.

> The pmd level is important for ppc because it works in segments, which are
> 256M in size and consist of a full pmd page.  The current implementation of
> the way ppc loads its tlb doesn't allow sharing at smaller than segment
> size.

Does that make pmd page sharing strictly necessary?  The way you describe
it, it sounds like it's merely that you "might as well" share pmd page,
because otherwise would always just waste memory on PPC.  But if it's
significantly less complex not to go to that level, it may be worth that
waste of memory (which would be a small fraction of what's now wasted at
the pte level, wouldn't it?).  Sorry for belabouring this point, which
may just be a figment of my ignorance, but you've not convinced me yet.

And maybe I'm exaggerating the additional complexity: you have, after
all, been resolute in treating the levels in the same way.  It's more
a matter of offputting patch size than complexity: imagine splitting
the patch into two, one to implement it at the pte level first, then
a second to take it up to pmd level, that would be better.

> I needed a function that returns a struct page for pgd and pud, defined in
> each architecture.  I decided the simplest way was to redefine pgd_page and
> pud_page to match pmd_page and pte_page.  Both functions are pretty much
> used one place per architecture, so the change is trivial.  I could come up
> with new functions instead if you think it's an issue.  I do have a bit of
> a fetish about symmetry across levels :)

Sounds to me like you made the right decision.

> >> +#define	pt_decrement_share(page)
> >> +#define pt_shareable_vma(vma)	(0)
> >> +#define	pt_unshare_range(vma, address, end)
> >> +#endif /* CONFIG_PTSHARE */
> > 
> > Please keep to "#define<space>MACRO<tab(s)definition" throughout:
> > easiest thing would be to edit the patch itself.
> 
> Sorry.  Done.  I thought the standard was "#define<TAB>" like all the other
> C code I've ever seen.  I didn't realize Linux does it different.

No, I can't claim "#define<space>" is a Linux standard: I happen to
prefer it myself, and it seems to predominate in the header files I've
most often added to; but I was only trying to remove the distracting
inconsistency from your patch, whichever way round.

> >>  static inline int copy_pmd_range(struct mm_struct *dst_mm, struct
> >>  mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct
> >>  		vm_area_struct *vma,
> >> -		unsigned long addr, unsigned long end)
> >> +		unsigned long addr, unsigned long end, int shareable)
> >>  {
> > 
> > I'd have preferred not to add the shareable argument at each level,
> > and work it out here; or is that significantly less efficient?
> 
> My gut feeling is that the vma-level tests for shareability are significant
> enough that we only want to do them once, then pass the result down the
> call  stack.  I could change it if you disagree about the relative cost.

I now think cut out completely your mods from copy_page_range and its
subfunctions.  Given Nick's "Don't copy ptes..." optimization there,
what shareable areas will reach the lower levels?  Only the VM_PFNMAP
and VM_INSERTPAGE ones: which do exist, and may be large enough to
qualify; but they're not the areas you're interested in targetting,
so I'd say keep it simple and forget about shareability here.  The
simpler you keep your patch, the likelier it is to convince people.

And that leads on to another issue which occurred to me today, in
reading through your overnight replies.  Notice the check on anon_vma
in that optimization?  None of your "shareable" tests check anon_vma:
the vm_flags shared/write check may be enough in some contexts, but
not in all.  VM_WRITE can come and go, and even a VM_SHARED vma may
have anon pages in it (through Linus' strange ptrace case): you must
keep away from sharing pagetables once you've got anonymous pages in
your vma (well, you could share those pagetables not yet anonymized,
but that's just silly complexity).  And in particular, the prio_tree
loop next_shareable_vma needs to skip any vma with its anon_vma set:
at present you're (unlikely but) liable to offer entirely unsuitable
pagetables for sharing there.

(In the course of writing this, I've discovered that 2.6.16-rc
supports COWed hugepages: anonymous pages without any anon_vma.
I'd better get up to speed on those: maybe we'll want to allocate
an anon_vma just for the appropriate tests, maybe we'll want to
set another VM_flag, don't know yet... for the moment you ignore
them, and assume anon_vma is the thing to test for, as in 2.6.15.)

One other thing I forgot to comment on your patch: too many largish
inline functions - our latest fashion is only to say "inline" on the
one-or-two liners.

Hugh

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24 17:50     ` Hugh Dickins
@ 2006-01-24 18:07       ` Dave McCracken
  2006-01-24 18:20         ` Hugh Dickins
  0 siblings, 1 reply; 55+ messages in thread
From: Dave McCracken @ 2006-01-24 18:07 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management


--On Tuesday, January 24, 2006 17:50:17 +0000 Hugh Dickins
<hugh@veritas.com> wrote:

> On Mon, 23 Jan 2006, Dave McCracken wrote:
> 
>> The pmd level is important for ppc because it works in segments, which
>> are 256M in size and consist of a full pmd page.  The current
>> implementation of the way ppc loads its tlb doesn't allow sharing at
>> smaller than segment size.
> 
> Does that make pmd page sharing strictly necessary?  The way you describe
> it, it sounds like it's merely that you "might as well" share pmd page,
> because otherwise would always just waste memory on PPC.  But if it's
> significantly less complex not to go to that level, it may be worth that
> waste of memory (which would be a small fraction of what's now wasted at
> the pte level, wouldn't it?).  Sorry for belabouring this point, which
> may just be a figment of my ignorance, but you've not convinced me yet.

No, ppc can not share at smaller than segment size at all.  We need pmd
level sharing to get to the 256MB segment size.  Once we can share this
level, there's a whole other feature of ppc that comes into play.  TLB
entries are keyed by segment id, not process, so all processes sharing a
segment can reuse TLB entries for regions they share.  This is only
possible if they share the page table at segment level.

> And maybe I'm exaggerating the additional complexity: you have, after
> all, been resolute in treating the levels in the same way.  It's more
> a matter of offputting patch size than complexity: imagine splitting
> the patch into two, one to implement it at the pte level first, then
> a second to take it up to pmd level, that would be better.
> 
>> I needed a function that returns a struct page for pgd and pud, defined
>> in each architecture.  I decided the simplest way was to redefine
>> pgd_page and pud_page to match pmd_page and pte_page.  Both functions
>> are pretty much used one place per architecture, so the change is
>> trivial.  I could come up with new functions instead if you think it's
>> an issue.  I do have a bit of a fetish about symmetry across levels :)
> 
> Sounds to me like you made the right decision.

I had a thought... would it be preferable for me to make this change as a
separate patch across all archictures in the name of consistency?  Or
should I continue to roll it into the shared pagetable patch as we enable
each architecture?

>> >>  static inline int copy_pmd_range(struct mm_struct *dst_mm, struct
>> >>  mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct
>> >>  		vm_area_struct *vma,
>> >> -		unsigned long addr, unsigned long end)
>> >> +		unsigned long addr, unsigned long end, int shareable)
>> >>  {
>> > 
>> > I'd have preferred not to add the shareable argument at each level,
>> > and work it out here; or is that significantly less efficient?
>> 
>> My gut feeling is that the vma-level tests for shareability are
>> significant enough that we only want to do them once, then pass the
>> result down the call  stack.  I could change it if you disagree about
>> the relative cost.
> 
> I now think cut out completely your mods from copy_page_range and its
> subfunctions.  Given Nick's "Don't copy ptes..." optimization there,
> what shareable areas will reach the lower levels?  Only the VM_PFNMAP
> and VM_INSERTPAGE ones: which do exist, and may be large enough to
> qualify; but they're not the areas you're interested in targetting,
> so I'd say keep it simple and forget about shareability here.  The
> simpler you keep your patch, the likelier it is to convince people.

I'll look into this in more detail.  It would be nice if I don't have to
deal with it.  I don't want to lose sharing pagetables on fork for normally
shared regions.  If they don't get into copy_page_range, then we'll be fine.

> And that leads on to another issue which occurred to me today, in
> reading through your overnight replies.  Notice the check on anon_vma
> in that optimization?  None of your "shareable" tests check anon_vma:
> the vm_flags shared/write check may be enough in some contexts, but
> not in all.  VM_WRITE can come and go, and even a VM_SHARED vma may
> have anon pages in it (through Linus' strange ptrace case): you must
> keep away from sharing pagetables once you've got anonymous pages in
> your vma (well, you could share those pagetables not yet anonymized,
> but that's just silly complexity).  And in particular, the prio_tree
> loop next_shareable_vma needs to skip any vma with its anon_vma set:
> at present you're (unlikely but) liable to offer entirely unsuitable
> pagetables for sharing there.

I'll have to think through this again.  I'll need to evaluate just what
states a vma can be in, and make sure I catch it at all its transitions out
of what should be shareable.  It sounds like there may be states I didn't
catch the first time.

> (In the course of writing this, I've discovered that 2.6.16-rc
> supports COWed hugepages: anonymous pages without any anon_vma.
> I'd better get up to speed on those: maybe we'll want to allocate
> an anon_vma just for the appropriate tests, maybe we'll want to
> set another VM_flag, don't know yet... for the moment you ignore
> them, and assume anon_vma is the thing to test for, as in 2.6.15.)

Yeah, I've been trying to keep up with the hugepage changes, but they're
coming pretty fast.  I'll look at them again too.

> One other thing I forgot to comment on your patch: too many largish
> inline functions - our latest fashion is only to say "inline" on the
> one-or-two liners.

Ok.  I haven't really kept up with the style du jour for inlining :)

Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24 18:07       ` Dave McCracken
@ 2006-01-24 18:20         ` Hugh Dickins
  0 siblings, 0 replies; 55+ messages in thread
From: Hugh Dickins @ 2006-01-24 18:20 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management

On Tue, 24 Jan 2006, Dave McCracken wrote:
> --On Tuesday, January 24, 2006 17:50:17 +0000 Hugh Dickins
> <hugh@veritas.com> wrote:
> > On Mon, 23 Jan 2006, Dave McCracken wrote:
> > 
> >> I needed a function that returns a struct page for pgd and pud, defined
> >> in each architecture.  I decided the simplest way was to redefine
> >> pgd_page and pud_page to match pmd_page and pte_page.  Both functions
> >> are pretty much used one place per architecture, so the change is
> >> trivial.  I could come up with new functions instead if you think it's
> >> an issue.  I do have a bit of a fetish about symmetry across levels :)
> > 
> > Sounds to me like you made the right decision.
> 
> I had a thought... would it be preferable for me to make this change as a
> separate patch across all archictures in the name of consistency?  Or

Yes - but don't expect it to be taken if your shared pagetables aren't:
just submit it as the first(ish) patch in your shared pagetables series.

> should I continue to roll it into the shared pagetable patch as we enable
> each architecture?

Hugh

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  0:19     ` Dave McCracken
  2006-01-24  0:46       ` Ray Bryant
@ 2006-01-24 23:43       ` Ray Bryant
  2006-01-24 23:50         ` Dave McCracken
  1 sibling, 1 reply; 55+ messages in thread
From: Ray Bryant @ 2006-01-24 23:43 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

On Monday 23 January 2006 18:19, Dave McCracken wrote:

>
> My apologies.  I do have a small test program and intended to clean it up
> to send to Robin, but got sidetracked (it's fugly at the moment).  I'll see
> about getting it a bit more presentable.
>

I created a test case that mmaps() a 20 GB anonymous, shared region, then 
forks off 128 child processes that touch the 20 GB region.    The results are 
pretty dramatic:

                          2.6.15                            2.6.15-shpt
                         ---------                         ----------------
Total time:          182 s                                  12.2 s
Fork time:             30,2 s                               10.4 s
pte space:            5 GB                                 44 MB

Here total time is the amount of time for all 128 threads to fork and touch 
all 20 GB of data and fork time is the elapsed time for the parent to do all 
of the forks.    pte space is the amount of space reported in /proc/meminfo 
to hold the page tables required to run the test program.   This is on an 
8-core, 4 socket Opteron running at 2.2 GHZ with 32 GB of RAM.

Of course, it would be more dramatic with a real DB application, but that is 
going to take a bit longer to get running, perhaps a couple of months by the 
time all is said and done.

Now I am off to figure out how Andi's mmap() randomization patch interacts 
with all of this stuff.

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24 23:43       ` Ray Bryant
@ 2006-01-24 23:50         ` Dave McCracken
  2006-01-25  0:21           ` Ray Bryant
  2006-01-25 22:48           ` Ray Bryant
  0 siblings, 2 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-24 23:50 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management


Those are interesting numbers.  That's pretty much the showcase for
sharing, yeah.

--On Tuesday, January 24, 2006 17:43:28 -0600 Ray Bryant
<raybry@mpdtxmail.amd.com> wrote:

> Of course, it would be more dramatic with a real DB application, but that
> is  going to take a bit longer to get running, perhaps a couple of months
> by the  time all is said and done.

I must mention here that I think most DB performance suites do their forks
up front, then never fork during the test, so fork performance doesn't
really factor in as much.  There are other reasons shared page tables helps
there, though.

> Now I am off to figure out how Andi's mmap() randomization patch
> interacts  with all of this stuff.

mmap() randomization doesn't affect fork at all, since by definition all
regions are at the same address in the child as the parent (ie good for
sharing).  The trickier case is where processes independently mmap() a
region.

Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24 23:50         ` Dave McCracken
@ 2006-01-25  0:21           ` Ray Bryant
  2006-01-25 22:48           ` Ray Bryant
  1 sibling, 0 replies; 55+ messages in thread
From: Ray Bryant @ 2006-01-25  0:21 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

On Tuesday 24 January 2006 17:50, Dave McCracken wrote:
> Those are interesting numbers.  That's pretty much the showcase for
> sharing, yeah.
>
<snip>
> I must mention here that I think most DB performance suites do their forks
> up front, then never fork during the test, so fork performance doesn't
> really factor in as much.  There are other reasons shared page tables helps
> there, though.
>

Yes, that's why I mentioned the DB workloads as being more interesting.

> > Now I am off to figure out how Andi's mmap() randomization patch
> > interacts  with all of this stuff.
>
> mmap() randomization doesn't affect fork at all, since by definition all
> regions are at the same address in the child as the parent (ie good for
> sharing).  The trickier case is where processes independently mmap() a
> region.
>
> Dave
>

Hmm... I agree with your previous comment there, then.   The only way the big 
DB guys can start up that way is if they use MAP_FIXED and a pre-agreed on 
address.   Otherwise, they can't do lookups of DB buffers in the shared 
storage area (i. e. this is basically user space buffer cache management).  
It has to be the case that they all agree on what addresses the buffers are 
at so each process can share the buffers.

So, that would argue that the mmap() randomization can't effect them either, 
without breaking the existing API.
-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-13  5:15 ` Brian Twichell
  2006-01-13 22:34   ` Ray Bryant
@ 2006-01-25  4:14   ` Brian Twichell
  1 sibling, 0 replies; 55+ messages in thread
From: Brian Twichell @ 2006-01-25  4:14 UTC (permalink / raw)
  To: Linux Kernel
  Cc: Dave McCracken, Andrew Morton, Linux Memory Management,
	Hugh Dickens, slpratt

Brian Twichell wrote:

>
> We evaluated page table sharing on x86_64 and ppc64 setups, using a 
> database
> OLTP workload.  In both cases, 4-way systems with 64 GB of memory were 
> used.
>
> On the x86_64 setup, page table sharing provided a 25% increase in 
> performance,
> when the database buffers were in small (4 KB) pages.  In this case, 
> over 14 GB
> of memory was freed, that had previously been taken up by page 
> tables.  In the
> case that the database buffers were in huge (2 MB) pages, page table 
> sharing
> provided a 4% increase in performance.
>
> Our ppc64 experiments used an earlier version of Dave's patch, along with
> ppc64-specific code for sharing of ppc64 segments.  On this setup, page
> table sharing provided a 49% increase in performance, when the database
> buffers were in small (4 KB) pages.  Over 10 GB of memory was freed, that
> had previously been taken up by page tables.  In the case that the 
> database
> buffers were in huge (16 MB) pages, page table sharing provided a 3% 
> increase
> in performance.
>
Hi,

Just wanted to dispel any notion that may be out there that
the improvements we've seen are peculiar to an IBM product
stack.

The 49% improvement observed using small pages on ppc64 used
Oracle as the DBMS.  The other results used DB2 as the DBMS.

So, the improvements not peculiar to an IBM product stack,
and moreover the largest improvement was seen with a non-IBM
DBMS.

Note, the relative improvement observed on each platform/pagesize
combination cannot be used to infer relative performance between
DBMS's or platforms.

Cheers,
Brian


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24 23:50         ` Dave McCracken
  2006-01-25  0:21           ` Ray Bryant
@ 2006-01-25 22:48           ` Ray Bryant
  2006-01-25 22:52             ` Dave McCracken
  1 sibling, 1 reply; 55+ messages in thread
From: Ray Bryant @ 2006-01-25 22:48 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

Dave,

Empirically, at least on Opteron, it looks like the first page of pte's is 
never shared, even if the alignment of the mapped region is correct (i. e. a 
2MB boundary for X86_64).    Is that what you expected?

(This is for a kernel built with just pte_sharing enabled, no higher levels.)

I would expect the first page of pte's not to be shared if the alignment is 
not correct, similarly for the last page if the mapped region doesn't 
entirely fill up the last page of pte's.

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-25 22:48           ` Ray Bryant
@ 2006-01-25 22:52             ` Dave McCracken
  2006-01-26  0:16               ` Ray Bryant
  2006-01-26  0:58               ` Ray Bryant
  0 siblings, 2 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-25 22:52 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management


--On Wednesday, January 25, 2006 16:48:58 -0600 Ray Bryant
<raybry@mpdtxmail.amd.com> wrote:

> Empirically, at least on Opteron, it looks like the first page of pte's
> is  never shared, even if the alignment of the mapped region is correct
> (i. e. a  2MB boundary for X86_64).    Is that what you expected?

If the region is aligned it should be shared.  I'll investigate.

Thanks,
Dave


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-25 22:52             ` Dave McCracken
@ 2006-01-26  0:16               ` Ray Bryant
  2006-01-26  0:58               ` Ray Bryant
  1 sibling, 0 replies; 55+ messages in thread
From: Ray Bryant @ 2006-01-26  0:16 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

Dave,

Here's another one to keep you awake at night:

mmap a shared, anonymous region of 8MB (2MB aligned), and fork off some 
children (4 is good enough for me).    In each child, munmap() a 2 MB portion 
of the shared region (starting at offset 2MB in the shared region) and then 
mmap() a private, anonymous region in its place.   Have each child store some 
unique data in that region, sleep for a bit and then go look to see if its 
data is still there.

Under 2.6.15, this works just fine.   Under 2.6.15 + shpt patch, each child 
still points at the shared region and steps on the data of the other 
children.

I imagine the above gets even more interesting if the offset or length of the 
unmapped region is not a multiple of the 2MB alignment.

All of these test cases are for Opteron.   Your alignment may vary.

(Sorry about this... )

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-25 22:52             ` Dave McCracken
  2006-01-26  0:16               ` Ray Bryant
@ 2006-01-26  0:58               ` Ray Bryant
  2006-01-26  4:06                 ` Robin Holt
  1 sibling, 1 reply; 55+ messages in thread
From: Ray Bryant @ 2006-01-26  0:58 UTC (permalink / raw)
  To: Dave McCracken
  Cc: Robin Holt, Hugh Dickins, Linux Kernel, Linux Memory Management

Dave,

Hmph.... further analysis shows that the situation is a more complicated than 
described in my last note, lets compare notes off-list and see what 
conclusions, if any, we can come to.

Thanks,
-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-26  0:58               ` Ray Bryant
@ 2006-01-26  4:06                 ` Robin Holt
  0 siblings, 0 replies; 55+ messages in thread
From: Robin Holt @ 2006-01-26  4:06 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Dave McCracken, Robin Holt, Hugh Dickins, Linux Kernel,
	Linux Memory Management

On Wed, Jan 25, 2006 at 06:58:10PM -0600, Ray Bryant wrote:
> Dave,
> 
> Hmph.... further analysis shows that the situation is a more complicated than 
> described in my last note, lets compare notes off-list and see what 
> conclusions, if any, we can come to.

Why off-list.  I think the munmap() or mmap() in the middle cases are
interesting.  I was hoping Dave's test program has those cases in
there as well as mapping of hugetlbfs files.  If you do take this off-list,
I would like to ride along ;)

Thanks,
Robin

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  7:18               ` Andi Kleen
@ 2006-01-27 18:16                 ` Martin Bligh
  2006-02-01  9:49                 ` Nick Piggin
  1 sibling, 0 replies; 55+ messages in thread
From: Martin Bligh @ 2006-01-27 18:16 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Arjan van de Ven, Ray Bryant, Dave McCracken, Robin Holt,
	Hugh Dickins, Linux Kernel, Linux Memory Management

Andi Kleen wrote:
> On Tuesday 24 January 2006 08:06, Arjan van de Ven wrote:
> 
>>>The randomization is not for cache coloring, but for security purposes
>>>(except for the old very small stack randomization that was used
>>>to avoid conflicts on HyperThreaded CPUs). I would be surprised if the
>>>mmap made much difference because it's page aligned and at least
>>>on x86 the L2 and larger caches are usually PI.
>>
>>randomization to a large degree is more important between machines than
>>within the same machine (except for setuid stuff but lets call that a
>>special category for now). Imo prelink is one of the better bets to get
>>"all code for a binary/lib on the same 2 mb page",
> 
> 
> Probably yes.
> 
> 
>>all distros ship 
>>prelink nowadays anyway 
> 
> 
> SUSE doesn't use it.
> 
> 
>>(it's too much of a win that nobody can afford 
>>to not ship it ;) 
> 
> 
> KDE and some other people disagree on that. 
> 
> 
>>and within prelink the balance between randomization 
>>for security and 2Mb sharing can be struck best. In fact it needs know
>>about the 2Mb thing anyway to place it there properly and for all
>>binaries... the kernel just can't do that.
> 
> 
> Well, we first have to figure out if the shared page tables
> are really worth all the ugly code, nasty locking and other problems 
> (inefficient TLB flush etc.) I personally would prefer
> to make large pages work better before going down that path.

That needs defragmentation, etc, etc. etc. It also requires changes to 
userspace apps. Large pages are crippled right now, and it looks like 
they're going to stay that way. We need some sort of solution, and this 
is pretty clean and transparent, by comparison.

m.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-20 21:24 ` Hugh Dickins
  2006-01-20 21:54   ` Chen, Kenneth W
  2006-01-23 17:39   ` Dave McCracken
@ 2006-01-27 22:50   ` Brian Twichell
  2006-01-30 18:46     ` Ray Bryant
  2 siblings, 1 reply; 55+ messages in thread
From: Brian Twichell @ 2006-01-27 22:50 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Dave McCracken, Andrew Morton, Linux Kernel, Linux Memory Management

Hugh Dickins wrote:

>On Thu, 5 Jan 2006, Dave McCracken wrote:
>  
>
>>Here's a new version of my shared page tables patch.
>>
>>The primary purpose of sharing page tables is improved performance for
>>large applications that share big memory areas between multiple processes.
>>It eliminates the redundant page tables and significantly reduces the
>>number of minor page faults.  Tests show significant performance
>>improvement for large database applications, including those using large
>>pages.  There is no measurable performance degradation for small processes.
>>
>>This version of the patch uses Hugh's new locking mechanism, extending it
>>up the page table tree as far as necessary for proper concurrency control.
>>
>>The patch also includes the proper locking for following the vma chains.
>>
>>Hugh, I believe I have all the lock points nailed down.  I'd appreciate
>>your input on any I might have missed.
>>
>>The architectures supported are i386 and x86_64.  I'm working on 64 bit
>>ppc, but there are still some issues around proper segment handling that
>>need more testing.  This will be available in a separate patch once it's
>>solid.
>>
>>Dave McCracken
>>    
>>
>
>The locking looks much better now, and I like the way i_mmap_lock seems
>to fall naturally into place where the pte lock doesn't work.  But still
>some raciness noted in comments on patch below.
>
>The main thing I dislike is the
> 16 files changed, 937 insertions(+), 69 deletions(-)
>(with just i386 and x86_64 included): it's adding more complexity than
>I can welcome, and too many unavoidable "if (shared) ... else ..."s.
>With significant further change needed, not just adding architectures.
>
>Worthwhile additional complexity?  I'm not the one to judge that.
>Brian has posted dramatic improvments (25%, 49%) for the non-huge OLTP,
>and yes, it's sickening the amount of memory we're wasting on pagetables
>in that particular kind of workload.  Less dramatic (3%, 4%) in the
>hugetlb case: and as yet (since last summer even) no profiles to tell
>where that improvement actually comes from.
>
>  
>
Hi,

We collected more granular performance data for the ppc64/hugepage case.

CPI decreased by 3% when shared pagetables were used.  Underlying this was a
7% decrease in the overall TLB miss rate.  The TLB miss rate for hugepages
decreased 39%.  TLB miss rates are calculated per instruction executed.

We didn't collect a profile per se, as we would expect a CPI improvement
of this nature to be spread over a significant number of functions,
mostly in user-space.

Cheers,
Brian



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-27 22:50   ` Brian Twichell
@ 2006-01-30 18:46     ` Ray Bryant
  2006-01-31 18:47       ` Brian Twichell
  0 siblings, 1 reply; 55+ messages in thread
From: Ray Bryant @ 2006-01-30 18:46 UTC (permalink / raw)
  To: Brian Twichell
  Cc: Hugh Dickins, Dave McCracken, Andrew Morton, Linux Kernel,
	Linux Memory Management

On Friday 27 January 2006 16:50, Brian Twichell wrote:
<snip>

>
> Hi,
>
> We collected more granular performance data for the ppc64/hugepage case.
>
> CPI decreased by 3% when shared pagetables were used.  Underlying this was
> a 7% decrease in the overall TLB miss rate.  The TLB miss rate for
> hugepages decreased 39%.  TLB miss rates are calculated per instruction
> executed.
>

Interesting.

Do you know if Dave's patch supports sharing of pte's for 2 MB pages on 
X86_64?

Was there a corresponding improvement in overall transaction throughput for 
the hugetlb, shared pte case?    That is, did the 3% improvement in CPI 
translate to a measurable improvement in the overall OLTP benchmark score?

(I'm assuming your 25-50% improvement measurements, as mentioned in a previous 
note, was for small pages.)

> We didn't collect a profile per se, as we would expect a CPI improvement
> of this nature to be spread over a significant number of functions,
> mostly in user-space.
>
> Cheers,
> Brian
>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-30 18:46     ` Ray Bryant
@ 2006-01-31 18:47       ` Brian Twichell
  2006-01-31 19:18         ` Dave McCracken
  0 siblings, 1 reply; 55+ messages in thread
From: Brian Twichell @ 2006-01-31 18:47 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Hugh Dickins, Dave McCracken, Andrew Morton, Linux Kernel,
	Linux Memory Management

Ray Bryant wrote:

>On Friday 27 January 2006 16:50, Brian Twichell wrote:
><snip>
>
>  
>
>>Hi,
>>
>>We collected more granular performance data for the ppc64/hugepage case.
>>
>>CPI decreased by 3% when shared pagetables were used.  Underlying this was
>>a 7% decrease in the overall TLB miss rate.  The TLB miss rate for
>>hugepages decreased 39%.  TLB miss rates are calculated per instruction
>>executed.
>>
>>    
>>
>
>Interesting.
>
>Do you know if Dave's patch supports sharing of pte's for 2 MB pages on 
>X86_64?
>  
>
I believe it does.  Dave, can you confirm ?

>Was there a corresponding improvement in overall transaction throughput for 
>the hugetlb, shared pte case?    That is, did the 3% improvement in CPI 
>translate to a measurable improvement in the overall OLTP benchmark score?
>  
>
Yes.  My original post with performance data described a 3% improvement
in the ppc64/hugepage case.  This is a transaction throughput statement.

>(I'm assuming your 25-50% improvement measurements, as mentioned in a previous 
>note, was for small pages.)
>
>  
>
That's correct.



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-31 18:47       ` Brian Twichell
@ 2006-01-31 19:18         ` Dave McCracken
  0 siblings, 0 replies; 55+ messages in thread
From: Dave McCracken @ 2006-01-31 19:18 UTC (permalink / raw)
  To: Brian Twichell, Ray Bryant
  Cc: Hugh Dickins, Andrew Morton, Linux Kernel, Linux Memory Management


--On Tuesday, January 31, 2006 12:47:51 -0600 Brian Twichell
<tbrian@us.ibm.com> wrote:

>> Do you know if Dave's patch supports sharing of pte's for 2 MB pages on 
>> X86_64?
>>  
>> 
> I believe it does.  Dave, can you confirm ?

It shares pmd pages for hugepages, which I assume is what you're talking
about.

Dave McCracken


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH/RFC] Shared page tables
  2006-01-24  7:18               ` Andi Kleen
  2006-01-27 18:16                 ` Martin Bligh
@ 2006-02-01  9:49                 ` Nick Piggin
  1 sibling, 0 replies; 55+ messages in thread
From: Nick Piggin @ 2006-02-01  9:49 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Arjan van de Ven, Ray Bryant, Dave McCracken, Robin Holt,
	Hugh Dickins, Linux Kernel, Linux Memory Management

Andi Kleen wrote:

> 
> Well, we first have to figure out if the shared page tables
> are really worth all the ugly code, nasty locking and other problems 
> (inefficient TLB flush etc.) I personally would prefer
> to make large pages work better before going down that path.
> 

Other thing I wonder about - less efficient page table placement
on NUMA systems might harm TLB miss latency on some systems
(although we don't always do a great job of trying to localise these
things yet anyway). Another is possible increased lock contention on
widely shared page tables like libc.

I agree that it is not something which needs to be rushed in any
time soon. We've already got significant concessions and complexity
in the memory manager for databases (hugepages, direct io / raw io)
so a few % improvement on database performance doesn't put it on our
must have list IMO.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 55+ messages in thread

end of thread, other threads:[~2006-02-01  9:49 UTC | newest]

Thread overview: 55+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-01-05 16:19 [PATCH/RFC] Shared page tables Dave McCracken
2006-01-07 12:25 ` Heiko Carstens
2006-01-07 18:09   ` Dave McCracken
2006-01-08 12:09     ` Heiko Carstens
2006-01-08 14:04       ` Dave McCracken
2006-01-13  5:15 ` Brian Twichell
2006-01-13 22:34   ` Ray Bryant
2006-01-17  4:50     ` Brian Twichell
2006-01-25  4:14   ` Brian Twichell
2006-01-13 15:18 ` Phillip Susi
2006-01-14 20:45   ` Brian Twichell
2006-01-17 23:53 ` Robin Holt
2006-01-18  0:17   ` Dave Hansen
2006-01-18  6:11     ` Dave McCracken
2006-01-18  1:27   ` Chen, Kenneth W
2006-01-18  3:32     ` Robin Holt
2006-01-23 23:58   ` Ray Bryant
2006-01-24  0:16     ` Ray Bryant
2006-01-24  0:39       ` Andi Kleen
2006-01-24  0:51         ` Dave McCracken
2006-01-24  1:11           ` Andi Kleen
2006-01-24  1:26             ` Dave McCracken
2006-01-24  0:53         ` Ray Bryant
2006-01-24  1:00           ` Dave McCracken
2006-01-24  1:10           ` Andi Kleen
2006-01-24  1:23             ` Benjamin LaHaise
2006-01-24  1:38               ` Andi Kleen
2006-01-24  7:08                 ` Arjan van de Ven
2006-01-24  7:06             ` Arjan van de Ven
2006-01-24  7:18               ` Andi Kleen
2006-01-27 18:16                 ` Martin Bligh
2006-02-01  9:49                 ` Nick Piggin
2006-01-24 14:48               ` Dave McCracken
2006-01-24 14:56                 ` Arjan van de Ven
2006-01-24  0:19     ` Dave McCracken
2006-01-24  0:46       ` Ray Bryant
2006-01-24 23:43       ` Ray Bryant
2006-01-24 23:50         ` Dave McCracken
2006-01-25  0:21           ` Ray Bryant
2006-01-25 22:48           ` Ray Bryant
2006-01-25 22:52             ` Dave McCracken
2006-01-26  0:16               ` Ray Bryant
2006-01-26  0:58               ` Ray Bryant
2006-01-26  4:06                 ` Robin Holt
2006-01-20 21:24 ` Hugh Dickins
2006-01-20 21:54   ` Chen, Kenneth W
2006-01-23 17:39   ` Dave McCracken
2006-01-23 20:19     ` Benjamin LaHaise
2006-01-24 17:50     ` Hugh Dickins
2006-01-24 18:07       ` Dave McCracken
2006-01-24 18:20         ` Hugh Dickins
2006-01-27 22:50   ` Brian Twichell
2006-01-30 18:46     ` Ray Bryant
2006-01-31 18:47       ` Brian Twichell
2006-01-31 19:18         ` Dave McCracken

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).