* [RFC][PATCH 0/8] Speculative pagefault -v3
@ 2010-01-04 18:24 Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 1/8] mm: Remove pte reference from fault path Peter Zijlstra
                   ` (9 more replies)
  0 siblings, 10 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds

Patch series implementing speculative page faults for x86.

Still needs lots of things sorted, like:

  - call_srcu()
  - ptl, irq and tlb-flush
  - a 2nd VM_FAULT_LOCK? return code to distinguish between
    simple-retry and must-take-mmap_sem semantics?
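
For reference, a rough sketch of the resulting x86 fault path,
condensed from patch 7 (simplified; access checks and error handling
omitted):

	/* first try to service the fault without taking mmap_sem */
	fault = handle_speculative_fault(mm, address,
			write ? FAULT_FLAG_WRITE : 0);
	if (fault & VM_FAULT_RETRY) {
		/* speculation failed: fall back to the classic path */
		down_read(&mm->mmap_sem);
		vma = find_vma(mm, address);
		fault = handle_mm_fault(mm, vma, address,
				write ? FAULT_FLAG_WRITE : 0);
		up_read(&mm->mmap_sem);
	}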

Comments?
-- 


* [RFC][PATCH 1/8] mm: Remove pte reference from fault path
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure Peter Zijlstra
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-1.patch --]
[-- Type: text/plain, Size: 6032 bytes --]

Since we want to do speculative faults, where we can race against
unmap() and similar operations, we cannot trust pte pointers to
remain valid.

Hence, remove the reliance on them from the fault path.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 mm/memory.c |   72 ++++++++++++++++--------------------------------------------
 1 file changed, 20 insertions(+), 52 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1919,31 +1919,6 @@ int apply_to_page_range(struct mm_struct
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
-/*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
  * servicing faults for write access.  In the normal case, do always want
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
@@ -2508,19 +2483,16 @@ int vmtruncate_range(struct inode *inode
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
 	struct page *page;
 	swp_entry_t entry;
-	pte_t pte;
+	pte_t *page_table, pte;
 	struct mem_cgroup *ptr = NULL;
 	int ret = 0;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		goto out;
-
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
@@ -2650,18 +2622,16 @@ out_release:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags)
+		unsigned long address, pmd_t *pmd, unsigned int flags)
 {
 	struct page *page;
 	spinlock_t *ptl;
-	pte_t entry;
+	pte_t entry, *page_table;
 
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
-		ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
+		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 		if (!pte_none(*page_table))
 			goto unlock;
 		goto setpte;
@@ -2900,13 +2870,12 @@ unwritable_page:
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	pte_unmap(page_table);
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
@@ -2920,16 +2889,13 @@ static int do_linear_fault(struct mm_str
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff;
 
 	flags |= FAULT_FLAG_NONLINEAR;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
@@ -2957,31 +2923,29 @@ static int do_nonlinear_fault(struct mm_
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
-		pte_t *pte, pmd_t *pmd, unsigned int flags)
+		pte_t entry, pmd_t *pmd, unsigned int flags)
 {
-	pte_t entry;
 	spinlock_t *ptl;
+	pte_t *pte;
 
-	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pte, pmd, flags, entry);
+						pmd, flags, entry);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pte, pmd, flags);
+						 pmd, flags);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry);
 		return do_swap_page(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry);
 	}
 
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
@@ -3017,7 +2981,7 @@ int handle_mm_fault(struct mm_struct *mm
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, entry;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -3037,7 +3001,11 @@ int handle_mm_fault(struct mm_struct *mm
 	if (!pte)
 		return VM_FAULT_OOM;
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	entry = *pte;
+
+	pte_unmap(pte);
+
+	return handle_pte_fault(mm, vma, address, entry, pmd, flags);
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED

-- 


* [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 1/8] mm: Remove pte reference from fault path Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 3/8] mm: Add vma sequence count Peter Zijlstra
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-6.patch --]
[-- Type: text/plain, Size: 9926 bytes --]

Replace pte_offset_map_lock() usage in the pagefault path with
pte_map_lock(), which can fail when called with .flags &
FAULT_FLAG_SPECULATIVE set. In that case we return VM_FAULT_RETRY,
meaning the fault must be retried (or redone with mmap_sem held).

This patch adds FAULT_FLAG_SPECULATIVE, VM_FAULT_RETRY, and the
error paths.
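
The call-site pattern this introduces looks like this (condensed from
the hunks below; note that with this patch alone pte_map_lock() cannot
fail yet, the speculative implementation follows later in the series):

	if (!pte_map_lock(mm, vma, address, pmd, flags,
				&page_table, &ptl)) {
		/* only possible with FAULT_FLAG_SPECULATIVE set */
		ret = VM_FAULT_RETRY;
		goto err;	/* undo earlier work, let the caller retry */
	}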

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h |    2 
 mm/memory.c        |  119 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 33 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 #define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_SPECULATIVE	0x08
 
 /*
  * This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -711,6 +712,7 @@ static inline int page_mapped(struct pag
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
+#define VM_FAULT_RETRY  0x0400
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1957,6 +1957,14 @@ static inline void cow_user_page(struct 
 		copy_user_highpage(dst, src, va, vma);
 }
 
+static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		pte_t **ptep, spinlock_t **ptl)
+{
+	*ptep = pte_offset_map_lock(mm, pmd, address, ptl);
+	return 1;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1977,7 +1985,7 @@ static inline void cow_user_page(struct 
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+		spinlock_t *ptl, unsigned int flags, pte_t orig_pte)
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
@@ -2009,8 +2017,14 @@ static int do_wp_page(struct mm_struct *
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
 			lock_page(old_page);
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+
+			if (!pte_map_lock(mm, vma, address, pmd, flags,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2052,14 +2066,14 @@ static int do_wp_page(struct mm_struct *
 			if (unlikely(tmp &
 					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				goto unwritable_page;
+				goto err;
 			}
 			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
 				lock_page(old_page);
 				if (!old_page->mapping) {
 					ret = 0; /* retry the fault */
 					unlock_page(old_page);
-					goto unwritable_page;
+					goto err;
 				}
 			} else
 				VM_BUG_ON(!PageLocked(old_page));
@@ -2070,8 +2084,13 @@ static int do_wp_page(struct mm_struct *
 			 * they did, we just return, as we can count on the
 			 * MMU to tell us if they didn't also make it writable.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2103,17 +2122,23 @@ reuse:
 gotten:
 	pte_unmap_unlock(page_table, ptl);
 
-	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
+	if (unlikely(anon_vma_prepare(vma))) {
+		ret = VM_FAULT_OOM;
+		goto err;
+	}
 
 	if (is_zero_pfn(pte_pfn(orig_pte))) {
 		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 		cow_user_page(new_page, old_page, address, vma);
 	}
 	__SetPageUptodate(new_page);
@@ -2128,13 +2153,20 @@ gotten:
 		unlock_page(old_page);
 	}
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
-		goto oom_free_new;
+	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+		ret = VM_FAULT_OOM;
+		goto err_free_new;
+	}
 
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(new_page);
+		ret = VM_FAULT_RETRY;
+		goto err_free_new;
+	}
+
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
@@ -2233,9 +2265,9 @@ unlock:
 			file_update_time(vma->vm_file);
 	}
 	return ret;
-oom_free_new:
+err_free_new:
 	page_cache_release(new_page);
-oom:
+err:
 	if (old_page) {
 		if (page_mkwrite) {
 			unlock_page(old_page);
@@ -2243,10 +2275,6 @@ oom:
 		}
 		page_cache_release(old_page);
 	}
-	return VM_FAULT_OOM;
-
-unwritable_page:
-	page_cache_release(old_page);
 	return ret;
 }
 
@@ -2496,6 +2524,10 @@ static int do_swap_page(struct mm_struct
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
+			if (flags & FAULT_FLAG_SPECULATIVE) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			migration_entry_wait(mm, pmd, address);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
@@ -2516,7 +2548,11 @@ static int do_swap_page(struct mm_struct
 			 * Back out if somebody else faulted in this pte
 			 * while we released the pte lock.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags,
+						&page_table, &ptl)) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2553,7 +2589,11 @@ static int do_swap_page(struct mm_struct
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_nolock;
+	}
+
 	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
 
@@ -2594,7 +2634,7 @@ static int do_swap_page(struct mm_struct
 	unlock_page(page);
 
 	if (flags & FAULT_FLAG_WRITE) {
-		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, flags, pte);
 		if (ret & VM_FAULT_ERROR)
 			ret &= VM_FAULT_ERROR;
 		goto out;
@@ -2607,8 +2647,9 @@ unlock:
 out:
 	return ret;
 out_nomap:
-	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
+out_nolock:
+	mem_cgroup_cancel_charge_swapin(ptr);
 out_page:
 	unlock_page(page);
 out_release:
@@ -2631,7 +2672,9 @@ static int do_anonymous_page(struct mm_s
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
-		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		if (!pte_map_lock(mm, vma, address, pmd, flags,
+					&page_table, &ptl))
+			return VM_FAULT_RETRY;
 		if (!pte_none(*page_table))
 			goto unlock;
 		goto setpte;
@@ -2654,7 +2697,12 @@ static int do_anonymous_page(struct mm_s
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(page);
+		page_cache_release(page);
+		return VM_FAULT_RETRY;
+	}
+
 	if (!pte_none(*page_table))
 		goto release;
 
@@ -2793,7 +2841,10 @@ static int __do_fault(struct mm_struct *
 
 	}
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_uncharge;
+	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -2826,7 +2877,10 @@ static int __do_fault(struct mm_struct *
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
+		pte_unmap_unlock(page_table, ptl);
 	} else {
+		pte_unmap_unlock(page_table, ptl);
+out_uncharge:
 		if (charged)
 			mem_cgroup_uncharge_page(page);
 		if (anon)
@@ -2835,8 +2889,6 @@ static int __do_fault(struct mm_struct *
 			anon = 1; /* no anon but release faulted_page */
 	}
 
-	pte_unmap_unlock(page_table, ptl);
-
 out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
@@ -2945,13 +2997,14 @@ static inline int handle_pte_fault(struc
 					pmd, flags, entry);
 	}
 
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &pte, &ptl))
+		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+					pte, pmd, ptl, flags, entry);
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);

-- 


* [RFC][PATCH 3/8] mm: Add vma sequence count
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 1/8] mm: Remove pte reference from fault path Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 4/8] mm: RCU free vmas Peter Zijlstra
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-5.patch --]
[-- Type: text/plain, Size: 1869 bytes --]

In order to detect VMA range changes, add a sequence count.
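
For illustration, a read side using the stock seqcount primitives
would look like the sketch below; the later patches instead read the
raw sequence, so a speculative lookup can fail fast rather than spin
while a writer holds the count across vma_adjust():

	unsigned seq = read_seqcount_begin(&vma->vm_sequence);
	/* speculatively use vma->vm_start, vma->vm_end, ... */
	if (read_seqcount_retry(&vma->vm_sequence, seq)) {
		/* vma_adjust() ran concurrently: discard and retry */
	}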

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm_types.h |    2 ++
 mm/mmap.c                |   10 ++++++++++
 2 files changed, 12 insertions(+)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h
+++ linux-2.6/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
+#include <linux/seqlock.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -186,6 +187,7 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	seqcount_t vm_sequence;
 };
 
 struct core_thread {
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -512,6 +512,10 @@ void vma_adjust(struct vm_area_struct *v
 	long adjust_next = 0;
 	int remove_next = 0;
 
+	write_seqcount_begin(&vma->vm_sequence);
+	if (next)
+		write_seqcount_begin(&next->vm_sequence);
+
 	if (next && !insert) {
 		if (end >= next->vm_end) {
 			/*
@@ -647,11 +651,17 @@ again:			remove_next = 1 + (end > next->
 		 * up the code too much to do both in one go.
 		 */
 		if (remove_next == 2) {
+			write_seqcount_end(&next->vm_sequence);
 			next = vma->vm_next;
+			write_seqcount_begin(&next->vm_sequence);
 			goto again;
 		}
 	}
 
+	if (next)
+		write_seqcount_end(&next->vm_sequence);
+	write_seqcount_end(&vma->vm_sequence);
+
 	validate_mm(mm);
 }
 

-- 


* [RFC][PATCH 4/8] mm: RCU free vmas
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (2 preceding siblings ...)
  2010-01-04 18:24 ` [RFC][PATCH 3/8] mm: Add vma sequence count Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-05  2:43   ` Paul E. McKenney
  2010-01-04 18:24 ` [RFC][PATCH 5/8] mm: Speculative pte_map_lock() Peter Zijlstra
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-3.patch --]
[-- Type: text/plain, Size: 6711 bytes --]

TODO:
 - should use SRCU; blocked on the lack of call_srcu()

In order to allow speculative vma lookups, RCU-free the struct
vm_area_struct.

We use two means of detecting that a vma is still valid:
 - first, we clear its rb-node with RB_CLEAR_NODE() once we remove
   the vma from the tree;
 - second, we check the vma sequence number.

Combined, these guarantee that 1) the vma is still present, and
2) it still covers the same range as when we looked it up.
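
On the lookup side the two checks are used roughly as follows
(condensed from handle_speculative_fault() in patch 6):

	rcu_read_lock();
	vma = find_vma(mm, address);	/* mmap_sem is not held */

	dead = RB_EMPTY_NODE(&vma->vm_rb);
	seq = vma->vm_sequence.sequence;
	smp_rmb();	/* pairs with the wmbs on the removal side */

	if (dead || (seq & 1))	/* unlinked, or update in flight */
		goto fail;	/* fall back to the mmap_sem path */

	/* ... use the vma, revalidating with vma_is_dead(vma, seq) ... */
	rcu_read_unlock();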

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h       |   12 ++++++++++++
 include/linux/mm_types.h |    2 ++
 init/Kconfig             |   34 +++++++++++++++++-----------------
 kernel/sched.c           |    9 ++++++++-
 mm/mmap.c                |   33 +++++++++++++++++++++++++++++++--
 5 files changed, 70 insertions(+), 20 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -765,6 +765,18 @@ unsigned long unmap_vmas(struct mmu_gath
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
 
+static inline int vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
+{
+	int ret = RB_EMPTY_NODE(&vma->vm_rb);
+	unsigned seq = vma->vm_sequence.sequence;
+	/*
+	 * Matches both the wmb in write_seqcount_begin/end() and
+	 * the wmb in detach_vmas_to_be_unmapped()/__vma_unlink().
+	 */
+	smp_rmb();
+	return ret || seq != sequence;
+}
+
 /**
  * mm_walk - callbacks for walk_page_range
  * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h
+++ linux-2.6/include/linux/mm_types.h
@@ -13,6 +13,7 @@
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
 #include <linux/seqlock.h>
+#include <linux/rcupdate.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -188,6 +189,7 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	seqcount_t vm_sequence;
+	struct rcu_head vm_rcu_head;
 };
 
 struct core_thread {
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -222,6 +222,19 @@ void unlink_file_vma(struct vm_area_stru
 	}
 }
 
+static void free_vma_rcu(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -238,7 +251,7 @@ static struct vm_area_struct *remove_vma
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 	return next;
 }
 
@@ -488,6 +501,14 @@ __vma_unlink(struct mm_struct *mm, struc
 {
 	prev->vm_next = vma->vm_next;
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	/*
+	 * Ensure the removal is completely committed to memory
+	 * before clearing the node.
+	 *
+	 * Matched by vma_is_dead()/handle_speculative_fault().
+	 */
+	smp_wmb();
+	RB_CLEAR_NODE(&vma->vm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
 }
@@ -644,7 +665,7 @@ again:			remove_next = 1 + (end > next->
 		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1858,6 +1879,14 @@ detach_vmas_to_be_unmapped(struct mm_str
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		/*
+		 * Ensure the removal is completely committed to memory
+		 * before clearing the node.
+		 *
+		 * Matched by vma_is_dead()/handle_speculative_fault().
+		 */
+		smp_wmb();
+		RB_CLEAR_NODE(&vma->vm_rb);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -314,19 +314,19 @@ menu "RCU Subsystem"
 
 choice
 	prompt "RCU Implementation"
-	default TREE_RCU
+	default TREE_PREEMPT_RCU
 
-config TREE_RCU
-	bool "Tree-based hierarchical RCU"
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP system with hundreds or
-	  thousands of CPUs.  It also scales down nicely to
-	  smaller systems.
+#config TREE_RCU
+#	bool "Tree-based hierarchical RCU"
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for very large SMP system with hundreds or
+#	  thousands of CPUs.  It also scales down nicely to
+#	  smaller systems.
 
 config TREE_PREEMPT_RCU
 	bool "Preemptable tree-based hierarchical RCU"
-	depends on PREEMPT
+#	depends on PREEMPT
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -334,14 +334,14 @@ config TREE_PREEMPT_RCU
 	  is also required.  It also scales down nicely to
 	  smaller systems.
 
-config TINY_RCU
-	bool "UP-only small-memory-footprint RCU"
-	depends on !SMP
-	help
-	  This option selects the RCU implementation that is
-	  designed for UP systems from which real-time response
-	  is not required.  This option greatly reduces the
-	  memory footprint of RCU.
+#config TINY_RCU
+#	bool "UP-only small-memory-footprint RCU"
+#	depends on !SMP
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for UP systems from which real-time response
+#	  is not required.  This option greatly reduces the
+#	  memory footprint of RCU.
 
 endchoice
 
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -9689,7 +9689,14 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = (preempt_count() & ~PREEMPT_ACTIVE)
+		/*
+		 * remove this for we need preemptible RCU
+		 * exactly because it needs to sleep..
+		 *
+		 + rcu_preempt_depth()
+		 */
+		;
 
 	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }

-- 


* [RFC][PATCH 5/8] mm: Speculative pte_map_lock()
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (3 preceding siblings ...)
  2010-01-04 18:24 ` [RFC][PATCH 4/8] mm: RCU free vmas Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-7.patch --]
[-- Type: text/plain, Size: 11649 bytes --]

Implement pte_map_lock() for the .flags & FAULT_FLAG_SPECULATIVE case,
where we are not holding mmap_sem and can therefore race against
unmap() and similar routines.

Since we cannot rely on pagetable stability in the face of unmap, we
use the technique fast_gup() also uses for a lockless pagetable
lookup. For this we introduce the {un,}pin_page_tables() functions.

The only problem is that we do TLB flushes while holding the PTL,
which in turn means that we cannot acquire the PTL while we have
IRQs disabled.

Fudge around this by open-coding a spinner which drops the page-table
pin, which on x86 is an IRQ disable (that holds off the TLB flush,
which is done before the pagetables are freed).

Once we hold the PTL, we can validate the VMA; if it is still valid,
we know we're good to go, and holding the PTL will hold off unmap.

We need to propagate the VMA sequence count through the fault code.
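
The resulting trylock dance, condensed from pte_map_lock() below:

	again:
		pin_page_tables();	/* IRQ-disable on x86 */

		/* ... walk pgd/pud/pmd, bail out if anything is missing ... */

		if (!spin_trylock(ptl)) {
			/*
			 * Don't spin with the pin held: the PTL holder
			 * may be waiting on a TLB-flush IPI that our
			 * disabled IRQs would block forever.
			 */
			pte_unmap(pte);
			unpin_page_tables();
			goto again;
		}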

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h |    2 
 mm/memory.c        |  111 ++++++++++++++++++++++++++++++++++++++---------------
 mm/util.c          |   12 ++++-
 3 files changed, 93 insertions(+), 32 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -848,6 +848,8 @@ int get_user_pages(struct task_struct *t
 			struct page **pages, struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+void pin_page_tables(void);
+void unpin_page_tables(void);
 struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1959,10 +1959,56 @@ static inline void cow_user_page(struct 
 
 static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags,
-		pte_t **ptep, spinlock_t **ptl)
+		unsigned int seq, pte_t **ptep, spinlock_t **ptlp)
 {
-	*ptep = pte_offset_map_lock(mm, pmd, address, ptl);
+	pgd_t *pgd;
+	pud_t *pud;
+
+	if (!(flags & FAULT_FLAG_SPECULATIVE)) {
+		*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+		return 1;
+	}
+
+again:
+	pin_page_tables();
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+
+	if (pmd_huge(*pmd))
+		goto out;
+
+	*ptlp = pte_lockptr(mm, pmd);
+	*ptep = pte_offset_map(pmd, address);
+	if (!spin_trylock(*ptlp)) {
+		pte_unmap(*ptep);
+		unpin_page_tables();
+		goto again;
+	}
+
+	if (!*ptep)
+		goto out;
+
+	if (vma_is_dead(vma, seq))
+		goto unlock;
+
+	unpin_page_tables();
 	return 1;
+
+unlock:
+	pte_unmap_unlock(*ptep, *ptlp);
+out:
+	unpin_page_tables();
+	return 0;
 }
 
 /*
@@ -1985,7 +2031,8 @@ static int pte_map_lock(struct mm_struct
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, unsigned int flags, pte_t orig_pte)
+		spinlock_t *ptl, unsigned int flags, pte_t orig_pte,
+		unsigned int seq)
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
@@ -2018,7 +2065,7 @@ static int do_wp_page(struct mm_struct *
 			pte_unmap_unlock(page_table, ptl);
 			lock_page(old_page);
 
-			if (!pte_map_lock(mm, vma, address, pmd, flags,
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
 						&page_table, &ptl)) {
 				unlock_page(old_page);
 				ret = VM_FAULT_RETRY;
@@ -2084,7 +2131,7 @@ static int do_wp_page(struct mm_struct *
 			 * they did, we just return, as we can count on the
 			 * MMU to tell us if they didn't also make it writable.
 			 */
-			if (!pte_map_lock(mm, vma, address, pmd, flags,
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
 						&page_table, &ptl)) {
 				unlock_page(old_page);
 				ret = VM_FAULT_RETRY;
@@ -2161,7 +2208,7 @@ gotten:
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
 		mem_cgroup_uncharge_page(new_page);
 		ret = VM_FAULT_RETRY;
 		goto err_free_new;
@@ -2511,8 +2558,8 @@ int vmtruncate_range(struct inode *inode
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		pte_t orig_pte, unsigned int seq)
 {
 	spinlock_t *ptl;
 	struct page *page;
@@ -2548,7 +2595,7 @@ static int do_swap_page(struct mm_struct
 			 * Back out if somebody else faulted in this pte
 			 * while we released the pte lock.
 			 */
-			if (!pte_map_lock(mm, vma, address, pmd, flags,
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
 						&page_table, &ptl)) {
 				ret = VM_FAULT_RETRY;
 				goto out;
@@ -2589,7 +2636,7 @@ static int do_swap_page(struct mm_struct
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
-	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
 		ret = VM_FAULT_RETRY;
 		goto out_nolock;
 	}
@@ -2634,7 +2681,8 @@ static int do_swap_page(struct mm_struct
 	unlock_page(page);
 
 	if (flags & FAULT_FLAG_WRITE) {
-		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, flags, pte);
+		ret |= do_wp_page(mm, vma, address, page_table, pmd,
+				ptl, flags, pte, seq);
 		if (ret & VM_FAULT_ERROR)
 			ret &= VM_FAULT_ERROR;
 		goto out;
@@ -2663,7 +2711,8 @@ out_release:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd, unsigned int flags)
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		unsigned int seq)
 {
 	struct page *page;
 	spinlock_t *ptl;
@@ -2672,7 +2721,7 @@ static int do_anonymous_page(struct mm_s
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
-		if (!pte_map_lock(mm, vma, address, pmd, flags,
+		if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
 					&page_table, &ptl))
 			return VM_FAULT_RETRY;
 		if (!pte_none(*page_table))
@@ -2697,7 +2746,7 @@ static int do_anonymous_page(struct mm_s
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
-	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
 		mem_cgroup_uncharge_page(page);
 		page_cache_release(page);
 		return VM_FAULT_RETRY;
@@ -2740,8 +2789,8 @@ oom:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd,
-		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd, pgoff_t pgoff,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pte_t *page_table;
 	spinlock_t *ptl;
@@ -2841,7 +2890,7 @@ static int __do_fault(struct mm_struct *
 
 	}
 
-	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
 		ret = VM_FAULT_RETRY;
 		goto out_uncharge;
 	}
@@ -2923,12 +2972,12 @@ unwritable_page:
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
 }
 
 /*
@@ -2942,7 +2991,7 @@ static int do_linear_fault(struct mm_str
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pgoff_t pgoff;
 
@@ -2957,7 +3006,7 @@ static int do_nonlinear_fault(struct mm_
 	}
 
 	pgoff = pte_to_pgoff(orig_pte);
-	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
 }
 
 /*
@@ -2975,7 +3024,8 @@ static int do_nonlinear_fault(struct mm_
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
-		pte_t entry, pmd_t *pmd, unsigned int flags)
+		pte_t entry, pmd_t *pmd, unsigned int flags,
+		unsigned int seq)
 {
 	spinlock_t *ptl;
 	pte_t *pte;
@@ -2985,26 +3035,27 @@ static inline int handle_pte_fault(struc
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pmd, flags, entry);
+						pmd, flags, entry, seq);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pmd, flags);
+						 pmd, flags, seq);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pmd, flags, entry);
+					pmd, flags, entry, seq);
 		return do_swap_page(mm, vma, address,
-					pmd, flags, entry);
+					pmd, flags, entry, seq);
 	}
 
-	if (!pte_map_lock(mm, vma, address, pmd, flags, &pte, &ptl))
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &pte, &ptl))
 		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
-		if (!pte_write(entry))
+		if (!pte_write(entry)) {
 			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, flags, entry);
+					pte, pmd, ptl, flags, entry, seq);
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
@@ -3058,7 +3109,7 @@ int handle_mm_fault(struct mm_struct *mm
 
 	pte_unmap(pte);
 
-	return handle_pte_fault(mm, vma, address, entry, pmd, flags);
+	return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
Index: linux-2.6/mm/util.c
===================================================================
--- linux-2.6.orig/mm/util.c
+++ linux-2.6/mm/util.c
@@ -253,8 +253,8 @@ void arch_pick_mmap_layout(struct mm_str
  * callers need to carefully consider what to use. On many architectures,
  * get_user_pages_fast simply falls back to get_user_pages.
  */
-int __attribute__((weak)) get_user_pages_fast(unsigned long start,
-				int nr_pages, int write, struct page **pages)
+int __weak get_user_pages_fast(unsigned long start,
+			       int nr_pages, int write, struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	int ret;
@@ -268,6 +268,14 @@ int __attribute__((weak)) get_user_pages
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+void __weak pin_page_tables(void)
+{
+}
+
+void __weak unpin_page_tables(void)
+{
+}
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);

-- 


* [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (4 preceding siblings ...)
  2010-01-04 18:24 ` [RFC][PATCH 5/8] mm: Speculative pte_map_lock() Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-05  0:25   ` KAMEZAWA Hiroyuki
  2010-01-05 13:45   ` Arjan van de Ven
  2010-01-04 18:24 ` [RFC][PATCH 7/8] mm,x86: speculative pagefault support Peter Zijlstra
                   ` (3 subsequent siblings)
  9 siblings, 2 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-8.patch --]
[-- Type: text/plain, Size: 2857 bytes --]

Add a generic speculative fault handler that tries to service a
pagefault without holding mmap_sem.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h |    2 +
 mm/memory.c        |   59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1998,7 +1998,7 @@ again:
 	if (!*ptep)
 		goto out;
 
-	if (vma_is_dead(vma, seq))
+	if (vma && vma_is_dead(vma, seq))
 		goto unlock;
 
 	unpin_page_tables();
@@ -3112,6 +3112,63 @@ int handle_mm_fault(struct mm_struct *mm
 	return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
 }
 
+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+		unsigned int flags)
+{
+	pmd_t *pmd = NULL;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	struct vm_area_struct *vma;
+	unsigned int seq;
+	int ret = VM_FAULT_RETRY;
+	int dead;
+
+	__set_current_state(TASK_RUNNING);
+	flags |= FAULT_FLAG_SPECULATIVE;
+
+	count_vm_event(PGFAULT);
+
+	rcu_read_lock();
+	if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
+		goto out_unlock;
+
+	vma = find_vma(mm, address);
+
+	if (!vma)
+		goto out_unmap;
+
+	dead = RB_EMPTY_NODE(&vma->vm_rb);
+	seq = vma->vm_sequence.sequence;
+	/*
+	 * Matches both the wmb in write_seqcount_begin/end() and
+	 * the wmb in detach_vmas_to_be_unmapped()/__vma_unlink().
+	 */
+	smp_rmb();
+	if (dead || seq & 1)
+		goto out_unmap;
+
+	if (!(vma->vm_end > address && vma->vm_start <= address))
+		goto out_unmap;
+
+	if (read_seqcount_retry(&vma->vm_sequence, seq))
+		goto out_unmap;
+
+	entry = *pte;
+
+	pte_unmap_unlock(pte, ptl);
+
+	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
+
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+
+out_unmap:
+	pte_unmap_unlock(pte, ptl);
+	goto out_unlock;
+}
+
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -829,6 +829,8 @@ int invalidate_inode_page(struct page *p
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+			unsigned long address, unsigned int flags);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,

-- 


* [RFC][PATCH 7/8] mm,x86: speculative pagefault support
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (5 preceding siblings ...)
  2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-04 18:24 ` [RFC][PATCH 8/8] mm: Optimize pte_map_lock() Peter Zijlstra
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-9.patch --]
[-- Type: text/plain, Size: 3031 bytes --]

Implement the x86 architecture support for speculative pagefaults.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/mm/fault.c |   31 +++++++++++--------------------
 arch/x86/mm/gup.c   |   10 ++++++++++
 2 files changed, 21 insertions(+), 20 deletions(-)

Index: linux-2.6/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/fault.c
+++ linux-2.6/arch/x86/mm/fault.c
@@ -777,30 +777,13 @@ bad_area_access_error(struct pt_regs *re
 	__bad_area(regs, error_code, address, SEGV_ACCERR);
 }
 
-/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
-static void
-out_of_memory(struct pt_regs *regs, unsigned long error_code,
-	      unsigned long address)
-{
-	/*
-	 * We ran out of memory, call the OOM killer, and return the userspace
-	 * (which will retry the fault, or kill us if we got oom-killed):
-	 */
-	up_read(&current->mm->mmap_sem);
-
-	pagefault_out_of_memory();
-}
-
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	  unsigned int fault)
 {
 	struct task_struct *tsk = current;
-	struct mm_struct *mm = tsk->mm;
 	int code = BUS_ADRERR;
 
-	up_read(&mm->mmap_sem);
-
 	/* Kernel mode? Handle exceptions or die: */
 	if (!(error_code & PF_USER))
 		no_context(regs, error_code, address);
@@ -829,7 +812,7 @@ mm_fault_error(struct pt_regs *regs, uns
 	       unsigned long address, unsigned int fault)
 {
 	if (fault & VM_FAULT_OOM) {
-		out_of_memory(regs, error_code, address);
+		pagefault_out_of_memory();
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
 			do_sigbus(regs, error_code, address, fault);
@@ -1040,6 +1023,14 @@ do_page_fault(struct pt_regs *regs, unsi
 		return;
 	}
 
+	if (error_code & PF_USER) {
+		fault = handle_speculative_fault(mm, address,
+				error_code & PF_WRITE ? FAULT_FLAG_WRITE : 0);
+
+		if (!(fault & VM_FAULT_RETRY))
+			goto done;
+	}
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
@@ -1119,6 +1110,8 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
 
+	up_read(&mm->mmap_sem);
+done:
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		mm_fault_error(regs, error_code, address, fault);
 		return;
@@ -1135,6 +1128,4 @@ good_area:
 	}
 
 	check_v8086_mode(regs, address, tsk);
-
-	up_read(&mm->mmap_sem);
 }
Index: linux-2.6/arch/x86/mm/gup.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/gup.c
+++ linux-2.6/arch/x86/mm/gup.c
@@ -373,3 +373,13 @@ slow_irqon:
 		return ret;
 	}
 }
+
+void pin_page_tables(void)
+{
+	local_irq_disable();
+}
+
+void unpin_page_tables(void)
+{
+	local_irq_enable();
+}

-- 


* [RFC][PATCH 8/8] mm: Optimize pte_map_lock()
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (6 preceding siblings ...)
  2010-01-04 18:24 ` [RFC][PATCH 7/8] mm,x86: speculative pagefault support Peter Zijlstra
@ 2010-01-04 18:24 ` Peter Zijlstra
  2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
  2010-01-05  2:26 ` Minchan Kim
  9 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 18:24 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds
  Cc: Peter Zijlstra

[-- Attachment #1: mm-foo-11.patch --]
[-- Type: text/plain, Size: 3725 bytes --]

If we ensure pagetable invariance by also guarding against unmap, we
can skip part of the pagetable walk by validating the vma early.
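
Concretely, the speculative side of pte_map_lock() reduces to the
sketch below (condensed from the hunks that follow): with unmap now
covered by vm_sequence, a vma that passes vma_is_dead() still has its
pagetables, so the pmd from the caller's earlier walk remains usable
and the pgd/pud/pmd re-walk can go:

	again:
		pin_page_tables();
		if (vma_is_dead(vma, seq))	/* now also covers unmap */
			goto out;
		*ptlp = pte_lockptr(mm, pmd);	/* pmd still valid here */
		*ptep = pte_offset_map(pmd, address);
		/* ... trylock as before ... */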

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 mm/memory.c |   58 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(st
 		details = NULL;
 
 	BUG_ON(addr >= end);
+	write_seqcount_begin(&vma->vm_sequence);
 	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
@@ -970,6 +971,7 @@ static unsigned long unmap_page_range(st
 	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
 	tlb_end_vma(tlb, vma);
 	mem_cgroup_uncharge_end();
+	write_seqcount_end(&vma->vm_sequence);
 
 	return addr;
 }
@@ -1961,9 +1963,6 @@ static int pte_map_lock(struct mm_struct
 		unsigned long address, pmd_t *pmd, unsigned int flags,
 		unsigned int seq, pte_t **ptep, spinlock_t **ptlp)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-
 	if (!(flags & FAULT_FLAG_SPECULATIVE)) {
 		*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 		return 1;
@@ -1972,19 +1971,7 @@ static int pte_map_lock(struct mm_struct
 again:
 	pin_page_tables();
 
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
-
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto out;
-
-	if (pmd_huge(*pmd))
+	if (vma_is_dead(vma, seq))
 		goto out;
 
 	*ptlp = pte_lockptr(mm, pmd);
@@ -1998,7 +1985,7 @@ again:
 	if (!*ptep)
 		goto out;
 
-	if (vma && vma_is_dead(vma, seq))
+	if (vma_is_dead(vma, seq))
 		goto unlock;
 
 	unpin_page_tables();
@@ -3115,13 +3102,14 @@ int handle_mm_fault(struct mm_struct *mm
 int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
 		unsigned int flags)
 {
-	pmd_t *pmd = NULL;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
 	pte_t *pte, entry;
 	spinlock_t *ptl;
 	struct vm_area_struct *vma;
 	unsigned int seq;
-	int ret = VM_FAULT_RETRY;
-	int dead;
+	int dead, ret = VM_FAULT_RETRY;
 
 	__set_current_state(TASK_RUNNING);
 	flags |= FAULT_FLAG_SPECULATIVE;
@@ -3129,8 +3117,31 @@ int handle_speculative_fault(struct mm_s
 	count_vm_event(PGFAULT);
 
 	rcu_read_lock();
-	if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
-		goto out_unlock;
+again:
+	pin_page_tables();
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+
+	if (pmd_huge(*pmd))
+		goto out;
+
+	ptl = pte_lockptr(mm, pmd);
+	pte = pte_offset_map(pmd, address);
+	if (!spin_trylock(ptl)) {
+		pte_unmap(pte);
+		unpin_page_tables();
+		goto again;
+	}
 
 	vma = find_vma(mm, address);
 
@@ -3156,6 +3167,7 @@ int handle_speculative_fault(struct mm_s
 	entry = *pte;
 
 	pte_unmap_unlock(pte, ptl);
+	unpin_page_tables();
 
 	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
 
@@ -3165,6 +3177,8 @@ out_unlock:
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+out:
+	unpin_page_tables();
 	goto out_unlock;
 }
 

-- 


* Re: [RFC][PATCH 0/8] Speculative pagefault -v3
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (7 preceding siblings ...)
  2010-01-04 18:24 ` [RFC][PATCH 8/8] mm: Optimize pte_map_lock() Peter Zijlstra
@ 2010-01-04 21:41 ` Rik van Riel
  2010-01-04 21:46   ` Peter Zijlstra
  2010-01-04 21:59   ` Christoph Lameter
  2010-01-05  2:26 ` Minchan Kim
  9 siblings, 2 replies; 121+ messages in thread
From: Rik van Riel @ 2010-01-04 21:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds

On 01/04/2010 01:24 PM, Peter Zijlstra wrote:
> Patch series implementing speculative page faults for x86.

Fun, but why do we need this?

What improvements did you measure?

I'll take a look over the patches to see whether they're
sane...

-- 
All rights reversed.


* Re: [RFC][PATCH 0/8] Speculative pagefault -v3
  2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
@ 2010-01-04 21:46   ` Peter Zijlstra
  2010-01-04 23:20     ` Rik van Riel
  2010-01-04 21:59   ` Christoph Lameter
  1 sibling, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-04 21:46 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Paul E. McKenney, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

On Mon, 2010-01-04 at 16:41 -0500, Rik van Riel wrote:
> On 01/04/2010 01:24 PM, Peter Zijlstra wrote:
> > Patch series implementing speculative page faults for x86.
> 
> Fun, but why do we need this?

People were once again concerned with mmap_sem contention on threaded
apps on large machines. Kame-san posted some patches, but I felt they
weren't quite crazy enough ;-)

> What improvements did you measure?

I got it not to crash :-) Although I'd not be surprised if other
people manage to; it needs more eyes.

> I'll take a look over the patches to see whether they're
> sane...

More appreciated, thanks!


* Re: [RFC][PATCH 0/8] Speculative pagefault -v3
  2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
  2010-01-04 21:46   ` Peter Zijlstra
@ 2010-01-04 21:59   ` Christoph Lameter
  2010-01-05  0:28     ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-04 21:59 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, 4 Jan 2010, Rik van Riel wrote:

> Fun, but why do we need this?
>
> What improvements did you measure?

If it measures up to Kame-san's approach, then the possible pagefault rate
will at least double ...


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 0/8] Speculative pagefault -v3
  2010-01-04 21:46   ` Peter Zijlstra
@ 2010-01-04 23:20     ` Rik van Riel
  0 siblings, 0 replies; 121+ messages in thread
From: Rik van Riel @ 2010-01-04 23:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

On 01/04/2010 04:46 PM, Peter Zijlstra wrote:
> On Mon, 2010-01-04 at 16:41 -0500, Rik van Riel wrote:
>> On 01/04/2010 01:24 PM, Peter Zijlstra wrote:
>>> Patch series implementing speculative page faults for x86.
>>
>> Fun, but why do we need this?
>
> People were once again concerned with mmap_sem contention on threaded
> apps on large machines. Kame-san posted some patches, but I felt they
> weren't quite crazy enough ;-)

In that case, I assume that somebody else (maybe Kame-san or
Christoph) will end up posting a benchmark that shows how
these patches help.

-- 
All rights reversed.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
@ 2010-01-05  0:25   ` KAMEZAWA Hiroyuki
  2010-01-05  3:13     ` Linus Torvalds
  2010-01-05  4:29     ` Minchan Kim
  2010-01-05 13:45   ` Arjan van de Ven
  1 sibling, 2 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-05  0:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

On Mon, 04 Jan 2010 19:24:35 +0100
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Generic speculative fault handler, tries to service a pagefault
> without holding mmap_sem.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>


I'm sorry if I'm missing something... how does this patch series avoid
the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
("vma is removed" covers the other things too, such as freeing the file struct, etc.)

Thanks,
-Kame




> ---
>  include/linux/mm.h |    2 +
>  mm/memory.c        |   59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 60 insertions(+), 1 deletion(-)
> 
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c
> +++ linux-2.6/mm/memory.c
> @@ -1998,7 +1998,7 @@ again:
>  	if (!*ptep)
>  		goto out;
>  
> -	if (vma_is_dead(vma, seq))
> +	if (vma && vma_is_dead(vma, seq))
>  		goto unlock;
>  
>  	unpin_page_tables();
> @@ -3112,6 +3112,63 @@ int handle_mm_fault(struct mm_struct *mm
>  	return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
>  }
>  
> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
> +		unsigned int flags)
> +{
> +	pmd_t *pmd = NULL;
> +	pte_t *pte, entry;
> +	spinlock_t *ptl;
> +	struct vm_area_struct *vma;
> +	unsigned int seq;
> +	int ret = VM_FAULT_RETRY;
> +	int dead;
> +
> +	__set_current_state(TASK_RUNNING);
> +	flags |= FAULT_FLAG_SPECULATIVE;
> +
> +	count_vm_event(PGFAULT);
> +
> +	rcu_read_lock();
> +	if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
> +		goto out_unlock;
> +
> +	vma = find_vma(mm, address);
> +
> +	if (!vma)
> +		goto out_unmap;
> +
> +	dead = RB_EMPTY_NODE(&vma->vm_rb);
> +	seq = vma->vm_sequence.sequence;
> +	/*
> +	 * Matches both the wmb in write_seqcount_begin/end() and
> +	 * the wmb in detach_vmas_to_be_unmapped()/__unlink_vma().
> +	 */
> +	smp_rmb();
> +	if (dead || seq & 1)
> +		goto out_unmap;
> +
> +	if (!(vma->vm_end > address && vma->vm_start <= address))
> +		goto out_unmap;
> +
> +	if (read_seqcount_retry(&vma->vm_sequence, seq))
> +		goto out_unmap;
> +
> +	entry = *pte;
> +
> +	pte_unmap_unlock(pte, ptl);
> +
> +	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
> +
> +out_unlock:
> +	rcu_read_unlock();
> +	return ret;
> +
> +out_unmap:
> +	pte_unmap_unlock(pte, ptl);
> +	goto out_unlock;
> +}
> +
> +
>  #ifndef __PAGETABLE_PUD_FOLDED
>  /*
>   * Allocate page upper directory.
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -829,6 +829,8 @@ int invalidate_inode_page(struct page *p
>  #ifdef CONFIG_MMU
>  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, unsigned int flags);
> +extern int handle_speculative_fault(struct mm_struct *mm,
> +			unsigned long address, unsigned int flags);
>  #else
>  static inline int handle_mm_fault(struct mm_struct *mm,
>  			struct vm_area_struct *vma, unsigned long address,
> 
> -- 
> 
> 


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 0/8] Speculative pagefault -v3
  2010-01-04 21:59   ` Christoph Lameter
@ 2010-01-05  0:28     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-05  0:28 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Rik van Riel, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, minchan.kim, hugh.dickins, Nick Piggin,
	Ingo Molnar, Linus Torvalds

On Mon, 4 Jan 2010 15:59:45 -0600 (CST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Mon, 4 Jan 2010, Rik van Riel wrote:
> 
> > Fun, but why do we need this?
> >
> > What improvements did you measure?
> 
> If it measures up to Kame-san's approach, then the possible pagefault rate
> will at least double ...
> 
On a 4-core/2-socket machine ;)

More important than the page fault rate is the fact that we can reduce
cache contention by skipping mmap_sem in some situations.

And I think we have some chances.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 0/8] Speculative pagefault -v3
  2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
                   ` (8 preceding siblings ...)
  2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
@ 2010-01-05  2:26 ` Minchan Kim
  9 siblings, 0 replies; 121+ messages in thread
From: Minchan Kim @ 2010-01-05  2:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar, Linus Torvalds

Hi, Peter.

On Tue, Jan 5, 2010 at 3:24 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Patch series implementing speculative page faults for x86.
>
> Still needs lots of things sorted, like:
>
>  - call_srcu()
>  - ptl, irq and tlb-flush
>  - a 2nd VM_FAULT_LOCK? return code to distinguish between
>    simple retry and must take mmap_sem semantics?
>
> Comments?
> --
>
>

I looked over this patch series.
It is the neatest take on this problem I have ever seen.
If we solve the call_srcu problem, it would be good.

I will help you by testing this series on my machine.

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 4/8] mm: RCU free vmas
  2010-01-04 18:24 ` [RFC][PATCH 4/8] mm: RCU free vmas Peter Zijlstra
@ 2010-01-05  2:43   ` Paul E. McKenney
  2010-01-05  8:28     ` Peter Zijlstra
  0 siblings, 1 reply; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-05  2:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

On Mon, Jan 04, 2010 at 07:24:33PM +0100, Peter Zijlstra wrote:
> TODO:
>  - should be SRCU, lack of call_srcu()
> 
> In order to allow speculative vma lookups, RCU free the struct
> vm_area_struct.
> 
> We use two means of detecting a vma is still valid:
>  - firstly, we set RB_CLEAR_NODE once we remove a vma from the tree.
>  - secondly, we check the vma sequence number.
> 
> These two things combined will guarantee that 1) the vma is still
> present, and 2) it still covers the same range from when we looked it
> up.

OK, I think I see what you are up to here.  I could get you a very crude
throw-away call_srcu() fairly quickly.  I don't yet have a good estimate
of how long it will take me to merge SRCU into the treercu infrastructure,
but am getting there.

So, which release are you thinking in terms of?

							Thanx, Paul

> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/mm.h       |   12 ++++++++++++
>  include/linux/mm_types.h |    2 ++
>  init/Kconfig             |   34 +++++++++++++++++-----------------
>  kernel/sched.c           |    9 ++++++++-
>  mm/mmap.c                |   33 +++++++++++++++++++++++++++++++--
>  5 files changed, 70 insertions(+), 20 deletions(-)
> 
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -765,6 +765,18 @@ unsigned long unmap_vmas(struct mmu_gath
>  		unsigned long end_addr, unsigned long *nr_accounted,
>  		struct zap_details *);
> 
> +static inline int vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
> +{
> +	int ret = RB_EMPTY_NODE(&vma->vm_rb);
> +	unsigned seq = vma->vm_sequence.sequence;
> +	/*
> +	 * Matches both the wmb in write_seqcount_begin/end() and
> +	 * the wmb in detach_vmas_to_be_unmapped()/__unlink_vma().
> +	 */
> +	smp_rmb();
> +	return ret || seq != sequence;
> +}
> +
>  /**
>   * mm_walk - callbacks for walk_page_range
>   * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h
> +++ linux-2.6/include/linux/mm_types.h
> @@ -13,6 +13,7 @@
>  #include <linux/cpumask.h>
>  #include <linux/page-debug-flags.h>
>  #include <linux/seqlock.h>
> +#include <linux/rcupdate.h>
>  #include <asm/page.h>
>  #include <asm/mmu.h>
> 
> @@ -188,6 +189,7 @@ struct vm_area_struct {
>  	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
>  #endif
>  	seqcount_t vm_sequence;
> +	struct rcu_head vm_rcu_head;
>  };
> 
>  struct core_thread {
> Index: linux-2.6/mm/mmap.c
> ===================================================================
> --- linux-2.6.orig/mm/mmap.c
> +++ linux-2.6/mm/mmap.c
> @@ -222,6 +222,19 @@ void unlink_file_vma(struct vm_area_stru
>  	}
>  }
> 
> +static void free_vma_rcu(struct rcu_head *head)
> +{
> +	struct vm_area_struct *vma =
> +		container_of(head, struct vm_area_struct, vm_rcu_head);
> +
> +	kmem_cache_free(vm_area_cachep, vma);
> +}
> +
> +static void free_vma(struct vm_area_struct *vma)
> +{
> +	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
> +}
> +
>  /*
>   * Close a vm structure and free it, returning the next.
>   */
> @@ -238,7 +251,7 @@ static struct vm_area_struct *remove_vma
>  			removed_exe_file_vma(vma->vm_mm);
>  	}
>  	mpol_put(vma_policy(vma));
> -	kmem_cache_free(vm_area_cachep, vma);
> +	free_vma(vma);
>  	return next;
>  }
> 
> @@ -488,6 +501,14 @@ __vma_unlink(struct mm_struct *mm, struc
>  {
>  	prev->vm_next = vma->vm_next;
>  	rb_erase(&vma->vm_rb, &mm->mm_rb);
> +	/*
> +	 * Ensure the removal is completely committed to memory
> +	 * before clearing the node.
> +	 *
> +	 * Matched by vma_is_dead()/handle_speculative_fault().
> +	 */
> +	smp_wmb();
> +	RB_CLEAR_NODE(&vma->vm_rb);
>  	if (mm->mmap_cache == vma)
>  		mm->mmap_cache = prev;
>  }
> @@ -644,7 +665,7 @@ again:			remove_next = 1 + (end > next->
>  		}
>  		mm->map_count--;
>  		mpol_put(vma_policy(next));
> -		kmem_cache_free(vm_area_cachep, next);
> +		free_vma(next);
>  		/*
>  		 * In mprotect's case 6 (see comments on vma_merge),
>  		 * we must remove another next too. It would clutter
> @@ -1858,6 +1879,14 @@ detach_vmas_to_be_unmapped(struct mm_str
>  	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
>  	do {
>  		rb_erase(&vma->vm_rb, &mm->mm_rb);
> +		/*
> +		 * Ensure the removal is completely committed to memory
> +		 * before clearing the node.
> +		 *
> +		 * Matched by vma_is_dead()/handle_speculative_fault().
> +		 */
> +		smp_wmb();
> +		RB_CLEAR_NODE(&vma->vm_rb);
>  		mm->map_count--;
>  		tail_vma = vma;
>  		vma = vma->vm_next;
> Index: linux-2.6/init/Kconfig
> ===================================================================
> --- linux-2.6.orig/init/Kconfig
> +++ linux-2.6/init/Kconfig
> @@ -314,19 +314,19 @@ menu "RCU Subsystem"
> 
>  choice
>  	prompt "RCU Implementation"
> -	default TREE_RCU
> +	default TREE_PREEMPT_RCU
> 
> -config TREE_RCU
> -	bool "Tree-based hierarchical RCU"
> -	help
> -	  This option selects the RCU implementation that is
> -	  designed for very large SMP system with hundreds or
> -	  thousands of CPUs.  It also scales down nicely to
> -	  smaller systems.
> +#config TREE_RCU
> +#	bool "Tree-based hierarchical RCU"
> +#	help
> +#	  This option selects the RCU implementation that is
> +#	  designed for very large SMP system with hundreds or
> +#	  thousands of CPUs.  It also scales down nicely to
> +#	  smaller systems.
> 
>  config TREE_PREEMPT_RCU
>  	bool "Preemptable tree-based hierarchical RCU"
> -	depends on PREEMPT
> +#	depends on PREEMPT
>  	help
>  	  This option selects the RCU implementation that is
>  	  designed for very large SMP systems with hundreds or
> @@ -334,14 +334,14 @@ config TREE_PREEMPT_RCU
>  	  is also required.  It also scales down nicely to
>  	  smaller systems.
> 
> -config TINY_RCU
> -	bool "UP-only small-memory-footprint RCU"
> -	depends on !SMP
> -	help
> -	  This option selects the RCU implementation that is
> -	  designed for UP systems from which real-time response
> -	  is not required.  This option greatly reduces the
> -	  memory footprint of RCU.
> +#config TINY_RCU
> +#	bool "UP-only small-memory-footprint RCU"
> +#	depends on !SMP
> +#	help
> +#	  This option selects the RCU implementation that is
> +#	  designed for UP systems from which real-time response
> +#	  is not required.  This option greatly reduces the
> +#	  memory footprint of RCU.
> 
>  endchoice
> 
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -9689,7 +9689,14 @@ void __init sched_init(void)
>  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
>  static inline int preempt_count_equals(int preempt_offset)
>  {
> -	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
> +	int nested = (preempt_count() & ~PREEMPT_ACTIVE)
> +		/*
> +		 * remove this for we need preemptible RCU
> +		 * exactly because it needs to sleep..
> +		 *
> +		 + rcu_preempt_depth()
> +		 */
> +		;
> 
>  	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
>  }
> 
> -- 
> 


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  0:25   ` KAMEZAWA Hiroyuki
@ 2010-01-05  3:13     ` Linus Torvalds
  2010-01-05  8:17       ` Peter Zijlstra
                         ` (2 more replies)
  2010-01-05  4:29     ` Minchan Kim
  1 sibling, 3 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05  3:13 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra, linux-kernel,
	linux-mm, minchan.kim, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> 
> I'm sorry if I'm missing something... how does this patch series avoid
> the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
> ("vma is removed" covers the other things too, such as freeing the file struct, etc.)

I don't think you're missing anything. 

Protecting the vma isn't enough. You need to protect the whole FS stack 
with rcu. Probably by moving _all_ of "free_vma()" into the RCU path 
(which means that the whole file/inode gets de-allocated at that later RCU 
point, rather than synchronously). Not just the actual kfree.

However, it's worth noting that that actually has some very subtle and 
major consequences. If you have a temporary file that was removed, where 
the mmap() was the last user, that kind of delayed freeing would also delay 
the final fput of that file that actually deletes it. 

Or put another way: if the vma was a writable mapping, a user may do

	munmap(mapping, size);

and the backing file is still active and writable AFTER THE MUNMAP! This 
can be a huge problem for something that wants to unmount the volume, for 
example, or depends on the whole writability-vs-executability thing. The 
user may have unmapped it, and expects the file to be immediately 
non-busy, but with the delayed free that isn't the case any more.

In other words, now you may well need to make munmap() wait for the RCU 
grace period, so that the user who did the unmap really is synchronous wrt 
the file accesses. We've had things like that before, and they have been 
_huge_ performance problems (ie it may take just a timer tick or two, but 
then people do tens of thousands of munmaps, and now that takes many 
seconds just due to RCU grace period waiting).
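
To make that concrete, the synchronous variant would have to look
roughly like this (a sketch only, with a hypothetical wrapper name;
rcu_barrier() is what actually waits for already-queued callbacks, a
plain synchronize_rcu() wouldn't even be enough):

static int munmap_and_wait(struct mm_struct *mm,
			   unsigned long addr, size_t len)
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = do_munmap(mm, addr, len);
	up_write(&mm->mmap_sem);

	/*
	 * Wait for the deferred free_vma() callbacks (fput() etc.) to
	 * actually run, so the file really is non-busy on return to
	 * userspace.  Every munmap() now pays at least a full grace
	 * period, which is the performance problem described above.
	 */
	rcu_barrier();
	return ret;
}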

I would say that this whole series is _very_ far from being mergeable. 
Peter seems to have been thinking about the details, while missing all the 
subtle big picture effects that seem to actually change semantics.

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  0:25   ` KAMEZAWA Hiroyuki
  2010-01-05  3:13     ` Linus Torvalds
@ 2010-01-05  4:29     ` Minchan Kim
  2010-01-05  4:43       ` KAMEZAWA Hiroyuki
  2010-01-05  4:48       ` Linus Torvalds
  1 sibling, 2 replies; 121+ messages in thread
From: Minchan Kim @ 2010-01-05  4:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra, linux-kernel,
	linux-mm, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

Hi, Kame.

On Tue, Jan 5, 2010 at 9:25 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Mon, 04 Jan 2010 19:24:35 +0100
> Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
>> Generic speculative fault handler, tries to service a pagefault
>> without holding mmap_sem.
>>
>> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>
>
> I'm sorry if I'm missing something... how does this patch series avoid
> the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
> ("vma is removed" covers the other things too, such as freeing the file struct, etc.)

Isn't it protected by get_file and iget?
Am I missing something?

>
> Thanks,
> -Kame
>
>
>
>
>> ---
>>  include/linux/mm.h |    2 +
>>  mm/memory.c        |   59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 60 insertions(+), 1 deletion(-)
>>
>> Index: linux-2.6/mm/memory.c
>> ===================================================================
>> --- linux-2.6.orig/mm/memory.c
>> +++ linux-2.6/mm/memory.c
>> @@ -1998,7 +1998,7 @@ again:
>>       if (!*ptep)
>>               goto out;
>>
>> -     if (vma_is_dead(vma, seq))
>> +     if (vma && vma_is_dead(vma, seq))
>>               goto unlock;
>>
>>       unpin_page_tables();
>> @@ -3112,6 +3112,63 @@ int handle_mm_fault(struct mm_struct *mm
>>       return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
>>  }
>>
>> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> +             unsigned int flags)
>> +{
>> +     pmd_t *pmd = NULL;
>> +     pte_t *pte, entry;
>> +     spinlock_t *ptl;
>> +     struct vm_area_struct *vma;
>> +     unsigned int seq;
>> +     int ret = VM_FAULT_RETRY;
>> +     int dead;
>> +
>> +     __set_current_state(TASK_RUNNING);
>> +     flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +     count_vm_event(PGFAULT);
>> +
>> +     rcu_read_lock();
>> +     if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
>> +             goto out_unlock;
>> +
>> +     vma = find_vma(mm, address);
>> +
>> +     if (!vma)
>> +             goto out_unmap;
>> +
>> +     dead = RB_EMPTY_NODE(&vma->vm_rb);
>> +     seq = vma->vm_sequence.sequence;
>> +     /*
>> +      * Matches both the wmb in write_seqcount_begin/end() and
>> +      * the wmb in detach_vmas_to_be_unmapped()/__unlink_vma().
>> +      */
>> +     smp_rmb();
>> +     if (dead || seq & 1)
>> +             goto out_unmap;
>> +
>> +     if (!(vma->vm_end > address && vma->vm_start <= address))
>> +             goto out_unmap;
>> +
>> +     if (read_seqcount_retry(&vma->vm_sequence, seq))
>> +             goto out_unmap;
>> +
>> +     entry = *pte;
>> +
>> +     pte_unmap_unlock(pte, ptl);
>> +
>> +     ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
>> +
>> +out_unlock:
>> +     rcu_read_unlock();
>> +     return ret;
>> +
>> +out_unmap:
>> +     pte_unmap_unlock(pte, ptl);
>> +     goto out_unlock;
>> +}
>> +
>> +
>>  #ifndef __PAGETABLE_PUD_FOLDED
>>  /*
>>   * Allocate page upper directory.
>> Index: linux-2.6/include/linux/mm.h
>> ===================================================================
>> --- linux-2.6.orig/include/linux/mm.h
>> +++ linux-2.6/include/linux/mm.h
>> @@ -829,6 +829,8 @@ int invalidate_inode_page(struct page *p
>>  #ifdef CONFIG_MMU
>>  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>>                       unsigned long address, unsigned int flags);
>> +extern int handle_speculative_fault(struct mm_struct *mm,
>> +                     unsigned long address, unsigned int flags);
>>  #else
>>  static inline int handle_mm_fault(struct mm_struct *mm,
>>                       struct vm_area_struct *vma, unsigned long address,
>>
>> --
>>
>>
>
>



-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  4:29     ` Minchan Kim
@ 2010-01-05  4:43       ` KAMEZAWA Hiroyuki
  2010-01-05  5:10         ` Linus Torvalds
  2010-01-05  6:00         ` Minchan Kim
  2010-01-05  4:48       ` Linus Torvalds
  1 sibling, 2 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-05  4:43 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra, linux-kernel,
	linux-mm, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

On Tue, 5 Jan 2010 13:29:40 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> Hi, Kame.
> 
> On Tue, Jan 5, 2010 at 9:25 AM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Mon, 04 Jan 2010 19:24:35 +0100
> > Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> >
> >> Generic speculative fault handler, tries to service a pagefault
> >> without holding mmap_sem.
> >>
> >> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> >
> >
> > I'm sorry if I'm missing something... how does this patch series avoid
> > the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
> > ("vma is removed" covers the other things too, such as freeing the file struct, etc.)
> 
> Isn't it protected by get_file and iget?
> Am I missing something?
> 
Only the kmem_cache_free() part of the following code is modified by the patch.

==
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *next = vma->vm_next;

	might_sleep();
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file) {
		fput(vma->vm_file);
		if (vma->vm_flags & VM_EXECUTABLE)
			removed_exe_file_vma(vma->vm_mm);
	}
	mpol_put(vma_policy(vma));
	kmem_cache_free(vm_area_cachep, vma);
	return next;
}
==

Then, fput() can be called. The whole of the above code would have to be
delayed until after an RCU grace period if we use RCU here.
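
In code form, "delaying the whole thing" would look roughly like this (a
sketch only, using the vm_rcu_head field from patch 4/8; note that both
->close() and fput() can sleep, which is exactly why running them from
an RCU callback is painful):

==
static void remove_vma_rcu_cb(struct rcu_head *head)
{
	struct vm_area_struct *vma =
		container_of(head, struct vm_area_struct, vm_rcu_head);

	/* Everything remove_vma() does today, now deferred. */
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file) {
		fput(vma->vm_file);
		if (vma->vm_flags & VM_EXECUTABLE)
			removed_exe_file_vma(vma->vm_mm);
	}
	mpol_put(vma_policy(vma));
	kmem_cache_free(vm_area_cachep, vma);
}
==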

So my patch dropped the speculative page fault attempt and did the job
synchronously here. I'm still considering how to insert some barrier that
delays calling remove_vma() until all page faults are gone. One idea was a
reference count, but it was said to be not crazy enough.


Thanks,
-Kame


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  4:29     ` Minchan Kim
  2010-01-05  4:43       ` KAMEZAWA Hiroyuki
@ 2010-01-05  4:48       ` Linus Torvalds
  2010-01-05  6:09         ` Minchan Kim
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05  4:48 UTC (permalink / raw)
  To: Minchan Kim
  Cc: KAMEZAWA Hiroyuki, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, linux-kernel, linux-mm, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Minchan Kim wrote:
> 
> Isn't it protected by get_file and iget?

When the vma is mapped, yes.

> Am I missing something?

remove_vma() will have done a

	fput(vma->vm_file);

and other house-keeping (removing the executable info, doing 
vm_ops->close() etc). 

And that is _not_ done delayed by RCU, and as outlined in my previous 
email I think that if the code really _does_ delay it, then munmap() (and 
exit) need to wait for the RCU callbacks to have been done, because 
otherwise the file may end up being busy "asynchronously" in ways that 
break existing semantics.

Just as an example: imagine a script that does "fork()+execve()" on a 
temporary file, and then after waiting for it all to finish with wait4() 
does some re-write of the file. It currently works. But what if the 
open-for-writing gets ETXTBUSY because the file is still marked as being 
VM_DENYWRITE, and RCU hasn't done all the callbacks?
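
Roughly this, as a user-space sketch (it assumes a scratch executable
./tmp-prog in the current directory):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		execl("./tmp-prog", "tmp-prog", (char *)NULL);
		_exit(1);
	}
	waitpid(pid, NULL, 0);

	/*
	 * Nothing maps ./tmp-prog any more, so this open() succeeds
	 * today.  With RCU-delayed vma teardown, VM_DENYWRITE could
	 * still be in effect here and we'd get a bogus ETXTBSY.
	 */
	int fd = open("./tmp-prog", O_WRONLY | O_TRUNC);
	if (fd < 0)
		perror("open after wait");
	else
		close(fd);
	return 0;
}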

Or if you do the i_writecount handling synchronously (which is likely fine 
- it really is just for ETXTBUSY handling, and I don't think speculative 
page faults matter), what about a shutdown sequence (or whatever) that 
wants to unmount the filesystem, but the file is still open - as it has to 
be - because the actual close is delayed by RCU.

So the patch-series as-is is fundamentally buggy - and trying to fix it 
seems painful.

I'm also not entirely clear on how the race with page table tear-down vs 
page-fault got handled, but I didn't read the whole patch-series very 
carefully. I skimmed through it and got rather nervous about it all. It 
doesn't seem too large, but it _does_ seem rather cavalier about all the 
object lifetimes.

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  4:43       ` KAMEZAWA Hiroyuki
@ 2010-01-05  5:10         ` Linus Torvalds
  2010-01-05  5:30           ` KAMEZAWA Hiroyuki
  2010-01-05  8:18           ` Peter Zijlstra
  2010-01-05  6:00         ` Minchan Kim
  1 sibling, 2 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05  5:10 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> 
> So my patch dropped the speculative page fault attempt and did the job
> synchronously here. I'm still considering how to insert some barrier that
> delays calling remove_vma() until all page faults are gone. One idea was a
> reference count, but it was said to be not crazy enough.

What lock would you use to protect the vma lookup (in order to then 
increase the refcount)? A sequence lock with RCU lookup of the vma?

Sounds doable. But it also sounds way more expensive than the current VM 
fault handling, which is pretty close to optimal for single-threaded 
cases.. That RCU lookup might be cheap, but just the refcount is generally 
going to be as expensive as a lock.
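
For concreteness, the shape being discussed is something like this (a
sketch; the per-mm seqlock, find_vma_rcu() and the vma refcount are
hypothetical names, not from this series):

static struct vm_area_struct *
vma_lookup_speculative(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mm->vma_seqlock);
		vma = find_vma_rcu(mm, addr);
	} while (vma && read_seqretry(&mm->vma_seqlock, seq));

	/* This atomic op is the part that costs like a lock. */
	if (vma && !atomic_inc_not_zero(&vma->refcnt))
		vma = NULL;
	rcu_read_unlock();

	return vma;
}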

Are there some particular mappings that people care about more than 
others? If we limit the speculative lookup purely to anonymous memory, 
that might simplify the problem space?

[ From past experiences, I suspect DB people would be upset and really 
  want it for the general file mapping case.. But maybe the main usage 
  scenario is something else this time? ]

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  5:10         ` Linus Torvalds
@ 2010-01-05  5:30           ` KAMEZAWA Hiroyuki
  2010-01-05  7:39             ` KAMEZAWA Hiroyuki
  2010-01-05 15:14             ` Christoph Lameter
  2010-01-05  8:18           ` Peter Zijlstra
  1 sibling, 2 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-05  5:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Mon, 4 Jan 2010 21:10:29 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > 
> > So my patch dropped the speculative page fault attempt and did the job
> > synchronously here. I'm still considering how to insert some barrier that
> > delays calling remove_vma() until all page faults are gone. One idea was a
> > reference count, but it was said to be not crazy enough.
> 
> What lock would you use to protect the vma lookup (in order to then 
> increase the refcount)? A sequence lock with RCU lookup of the vma?
> 

Ah, I just used a reference counter to track "how many threads are in a
page fault against this vma now". Below is from my post.

==
+			rb_node = rcu_dereference(rb_node->rb_left);
+		} else
+			rb_node = rcu_dereference(rb_node->rb_right);
+	}
+	if (vma) {
+		if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
+			if (!atomic_inc_not_zero(&vma->refcnt))
+				vma = NULL;
+		} else
+			vma = NULL;
+	}
+	rcu_read_unlock();

...
+void vma_put(struct vm_area_struct *vma)
+{
+	if ((atomic_dec_return(&vma->refcnt) == 1) &&
+	    waitqueue_active(&vma->wait_queue))
+		wake_up(&vma->wait_queue);
+	return;
+}
==

And wait for this reference count to drain back to the expected value
before calling remove_vma():
==
+/* Called when the vma is unlinked; waits for all racy accesses to finish. */
+static void invalidate_vma_before_free(struct vm_area_struct *vma)
+{
+	atomic_dec(&vma->refcnt);
+	wait_event(vma->wait_queue, !atomic_read(&vma->refcnt));
+}
+
....
 		 * us to remove next before dropping the locks.
 		 */
 		__vma_unlink(mm, next, vma);
+		invalidate_vma_before_free(next);
 		if (file)
 			__remove_shared_vm_struct(next, file, mapping);

etc....
==
The above code is a bit heavy (and buggy). I have some fixes.

> Sounds doable. But it also sounds way more expensive than the current VM 
> fault handling, which is pretty close to optimal for single-threaded 
> cases.. That RCU lookup might be cheap, but just the refcount is generally 
> going to be as expensive as a lock.
>
For single-threaded apps, my patch will have no benefit (but will not make
anything worse). I'll add a CONFIG option, and I wonder whether I can enable
the speculative vma lookup only after the mm_struct becomes shared (but the
patch may be messy...).

> Are there some particular mappings that people care about more than 
> others? If we limit the speculative lookup purely to anonymous memory, 
> that might simplify the problem space?
> 

I wonder whether, for ordinary people who don't write highly optimized
programs, one small benefit of skipping mmap_sem is reducing the mmap_sem
ping-pong after doing fork()->exec(). That ping-pong can cause some jitter
in the application. So I'd be glad if I could help file-backed vmas.

> [ From past experiences, I suspect DB people would be upset and really 
>   want it for the general file mapping case.. But maybe the main usage 
>   scenario is something else this time? ]
> 

I'd like to hear the use cases of really heavy users, too. Christoph?

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  4:43       ` KAMEZAWA Hiroyuki
  2010-01-05  5:10         ` Linus Torvalds
@ 2010-01-05  6:00         ` Minchan Kim
  1 sibling, 0 replies; 121+ messages in thread
From: Minchan Kim @ 2010-01-05  6:00 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra, linux-kernel,
	linux-mm, cl, hugh.dickins, Nick Piggin, Ingo Molnar,
	Linus Torvalds

On Tue, Jan 5, 2010 at 1:43 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 5 Jan 2010 13:29:40 +0900
> Minchan Kim <minchan.kim@gmail.com> wrote:
>
>> Hi, Kame.
>>
>> On Tue, Jan 5, 2010 at 9:25 AM, KAMEZAWA Hiroyuki
>> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> > On Mon, 04 Jan 2010 19:24:35 +0100
>> > Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>> >
>> >> Generic speculative fault handler, tries to service a pagefault
>> >> without holding mmap_sem.
>> >>
>> >> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>> >
>> >
>> > I'm sorry if I'm missing something... how does this patch series avoid
>> > the vma being removed while __do_fault()->vma->vm_ops->fault() is called?
>> > ("vma is removed" covers the other things too, such as freeing the file struct, etc.)
>>
>> Isn't it protected by get_file and iget?
>> Am I missing something?
>>
Only the kmem_cache_free() part of the following code is modified by the patch.

That's what I missed. Thanks, Kame. ;-)
-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  6:09         ` Minchan Kim
@ 2010-01-05  6:09           ` KAMEZAWA Hiroyuki
  2010-01-05  6:24             ` Minchan Kim
  2010-01-05  8:35           ` Peter Zijlstra
  1 sibling, 1 reply; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-05  6:09 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Linus Torvalds, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, 5 Jan 2010 15:09:47 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:
> My humble opinion is as follows.
> 
> Couldn't we synchronize RCU in those cases (munmap, exit, and so on)?
> It can delay munmap and exit, but it would be better than handling them
> with more complicated things, I think. And both are infrequent cases, so
> the advantage would outweigh the disadvantage?
> 

In most cases, a program is single-threaded. And synchronize_rcu() in the
unmap path just adds a very big overhead.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  4:48       ` Linus Torvalds
@ 2010-01-05  6:09         ` Minchan Kim
  2010-01-05  6:09           ` KAMEZAWA Hiroyuki
  2010-01-05  8:35           ` Peter Zijlstra
  0 siblings, 2 replies; 121+ messages in thread
From: Minchan Kim @ 2010-01-05  6:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, linux-kernel, linux-mm, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar

Hi, Linus.

On Tue, Jan 5, 2010 at 1:48 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
>
> On Tue, 5 Jan 2010, Minchan Kim wrote:
>>
>> Isn't it protected by get_file and iget?
>
> When the vma is mapped, yes.
>
>> Am I miss something?
>
> remove_vma() will have done a
>
>        fput(vma->vm_file);
>
> and other house-keeping (removing the executable info, doing
> vm_ops->close() etc).
>
> And that is _not_ done delayed by RCU, and as outlined in my previous
> email I think that if the code really _does_ delay it, then munmap() (and
> exit) need to wait for the RCU callbacks to have been done, because
> otherwise the file may end up being busy "asynchronously" in ways that
> break existing semantics.
>
> Just as an example: imagine a script that does "fork()+execve()" on a
> temporary file, and then after waiting for it all to finish with wait4()
> does some re-write of the file. It currently works. But what if the
> open-for-writing gets ETXTBUSY because the file is still marked as being
> VM_DENYWRITE, and RCU hasn't done all the callbacks?
>
> Or if you do the i_writecount handling synchronously (which is likely fine
> - it really is just for ETXTBUSY handling, and I don't think speculative
> page faults matter), what about a shutdown sequence (or whatever) that
> wants to unmount the filesystem, but the file is still open - as it has to
> be - because the actual close is delayed by RCU.
>
> So the patch-series as-is is fundamentally buggy - and trying to fix it
> seems painful.
>
> I'm also not entirely clear on how the race with page table tear-down vs
> page-fault got handled, but I didn't read the whole patch-series very
> carefully. I skimmed through it and got rather nervous about it all. It
> doesn't seem too large, but it _does_ seem rather cavalier about all the
> object lifetimes.
>
>                Linus
>

Thanks for careful explanation, Linus.

My humble opinion is as follows.

Couldn't we synchronize RCU in those cases (munmap, exit, and so on)?
It can delay munmap and exit, but it would be better than handling them
with more complicated things, I think. And both are infrequent cases, so
the advantage would outweigh the disadvantage?


-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  6:09           ` KAMEZAWA Hiroyuki
@ 2010-01-05  6:24             ` Minchan Kim
  0 siblings, 0 replies; 121+ messages in thread
From: Minchan Kim @ 2010-01-05  6:24 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Linus Torvalds, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, Jan 5, 2010 at 3:09 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 5 Jan 2010 15:09:47 +0900
> Minchan Kim <minchan.kim@gmail.com> wrote:
>> My humble opinion is as follows.
>>
>> Couldn't we synchronize RCU in those cases (munmap, exit, and so on)?
>> It can delay munmap and exit, but it would be better than handling them
>> with more complicated things, I think. And both are infrequent cases, so
>> the advantage would outweigh the disadvantage?
>>
>
> In most cases, a program is single-threaded. And synchronize_rcu() in the
> unmap path just adds a very big overhead.

Yes.
I was suggesting that you consider single-threaded apps' regression, please. :)

The first thing that comes to my head is that we could count the number
of threads. But yes, the thread count is not a good choice.

As a matter of fact, I want it to work adaptively: if the process starts
to have many threads, speculative page fault turns on or off. I know
it's not easy. I hope other guys have good ideas.
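
Something like this is the kind of heuristic I imagine (a strawman only;
the mm_users test and the threshold are made up):

/*
 * Strawman: try the speculative path only when the mm is shared
 * widely enough that mmap_sem contention is plausible.
 */
static inline int want_speculative_fault(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) > 4;
}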

>
> Thanks,
> -Kame
>
>



-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  5:30           ` KAMEZAWA Hiroyuki
@ 2010-01-05  7:39             ` KAMEZAWA Hiroyuki
  2010-01-05 15:26               ` Linus Torvalds
  2010-01-05 15:14             ` Christoph Lameter
  1 sibling, 1 reply; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-05  7:39 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Linus Torvalds, Minchan Kim, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, linux-kernel, linux-mm, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 13057 bytes --]

On Tue, 5 Jan 2010 14:30:46 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> ==
> The above code is a bit heavy (and buggy). I have some fixes.
> 
Here is my latest one. But I myself think this should be improved more,
and I don't think this patch is bug-free. I have some concerns around
fork() etc., and I am still considering how to avoid the atomic ops.

I post this just as a reference, rather than as a cut-out from old e-mails.

Here is a comparison from before and after the patch. A test program does
the following in multiple threads (attached):

  while (1) {
	touch memory (with write)
   	pthread_barrier_wait().
   	fork() if I'm the first thread.
     	-> exit() immediately
        wait()
   	pthread_barrier_wait()
  }

Then see the overheads via perf. This is a result on an 8-core/2-socket host.
The result is highly affected by the number of sockets.

# Samples: 1164449628090
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    43.23%  multi-fault-all  [kernel]                  [k] smp_invalidate_interrupt
    16.27%  multi-fault-all  [kernel]                  [k] flush_tlb_others_ipi
    11.55%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsave    <========(*)
     6.23%  multi-fault-all  [kernel]                  [k] intel_pmu_enable_all
     2.17%  multi-fault-all  [kernel]                  [k] _raw_spin_unlock_irqrestore
     1.59%  multi-fault-all  [kernel]                  [k] page_fault
     1.45%  multi-fault-all  [kernel]                  [k] do_wp_page
     1.35%  multi-fault-all  ./multi-fault-all-fork    [.] worker
     1.26%  multi-fault-all  [kernel]                  [k] handle_mm_fault
     1.19%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     1.03%  multi-fault-all  [kernel]                  [k] invalidate_interrupt7
     1.00%  multi-fault-all  [kernel]                  [k] invalidate_interrupt6
     0.99%  multi-fault-all  [kernel]                  [k] invalidate_interrupt3


# Samples: 181505050964
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    45.08%  multi-fault-all  [kernel]                  [k] smp_invalidate_interrupt
    19.45%  multi-fault-all  [kernel]                  [k] intel_pmu_enable_all
    14.17%  multi-fault-all  [kernel]                  [k] flush_tlb_others_ipi
     1.89%  multi-fault-all  [kernel]                  [k] do_wp_page
     1.58%  multi-fault-all  [kernel]                  [k] page_fault
     1.46%  multi-fault-all  ./multi-fault-all-fork    [.] worker
     1.26%  multi-fault-all  [kernel]                  [k] do_page_fault
     1.14%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     1.10%  multi-fault-all  [kernel]                  [k] flush_tlb_page
     1.09%  multi-fault-all  [kernel]                  [k] vma_put           <========(**)
     0.99%  multi-fault-all  [kernel]                  [k] invalidate_interrupt0
     0.98%  multi-fault-all  [kernel]                  [k] find_vma_speculative <=====(**)
     0.81%  multi-fault-all  [kernel]                  [k] invalidate_interrupt3
     0.81%  multi-fault-all  [kernel]                  [k] native_apic_mem_write
     0.79%  multi-fault-all  [kernel]                  [k] invalidate_interrupt7
     0.76%  multi-fault-all  [kernel]                  [k] invalidate_interrupt5

(*) is the removed mmap_sem overhead and (**) is the added overhead of the
atomic ops. And yes, 1% still seems not ideally small. And it's unknown how
badly this false sharing of atomic ops can affect very big SMP..... maybe
very badly.

BTW, I'm not sure why intel_pmu_enable_all() is so big... because of fork()?

My patch is below, just as a reference to my work from last year; it's not
very clean yet. I need more time for improvements (and I do not insist on
this particular implementation).

When you test this, please turn off SPINLOCK_DEBUG to enable split-page-table-lock.

Regards,
-Kame
==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Asynchronous page fault.

This patch is for avoiding mmap_sem in the usual page fault path. When
running highly multi-threaded programs, mm->mmap_sem can consume much CPU
because of false sharing when page faults occur in parallel. (Right after
fork() is a typical case, I think.)
This patch uses a speculative vma lookup to reduce that cost.

Considering vma lookup, i.e. the rb-tree lookup, the only operation we do is
checking node->rb_left and node->rb_right; there is no complicated operation.
At page fault time there is, in most cases, no demand to access the sorted
vma list or the prev/next vma. Except for stack expansion, we always need
the vma which contains the page-fault address. So we can access the vma
rb-tree in a speculative way.
Even if an rb-tree rotation occurs while we walk the tree for the lookup, we
just miss the vma without an oops. In other words, we can _try_ to find the
vma in a lockless manner. If that fails, a retry is ok.... we take the lock
and access the vma.

For the lockless walk, this uses RCU and adds find_vma_speculative(), plus a
per-vma wait queue and reference count. This refcnt+wait_queue combination
guarantees that no thread is accessing the vma when we call the subsystem's
unmap functions.
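
The refcnt handling is the subtle part. Here is the reader side, pulled
out of find_vma_speculative() below as a standalone sketch (the helper
name is made up):

==
/*
 * After invalidate_vma_before_free() subtracts VMA_FREE_MAGIC, the
 * count is deeply negative, so a late reader's atomic_inc_return()
 * still sees a negative value and backs off.  Its vma_put() may be
 * the one that drains the count back to -VMA_FREE_MAGIC and wakes
 * the unmapper.
 */
static int vma_get_speculative(struct vm_area_struct *vma)
{
	if (atomic_inc_return(&vma->refcnt) < 0) {
		vma_put(vma);
		return 0;
	}
	return 1;
}
==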

Changelog:
 - removed rcu_xxx macros.
 - fixed reference count handling bug at vma_put().
 - removed atomic_add_unless() etc.; a big bias number handles the race instead.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 arch/x86/mm/fault.c      |   14 ++++++
 include/linux/mm.h       |   14 ++++++
 include/linux/mm_types.h |    5 ++
 mm/mmap.c                |   98 +++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 127 insertions(+), 4 deletions(-)

Index: linux-2.6.33-rc2/include/linux/mm_types.h
===================================================================
--- linux-2.6.33-rc2.orig/include/linux/mm_types.h
+++ linux-2.6.33-rc2/include/linux/mm_types.h
@@ -11,6 +11,7 @@
 #include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
+#include <linux/rcupdate.h>
 #include <linux/page-debug-flags.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@@ -180,6 +181,10 @@ struct vm_area_struct {
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
+	atomic_t refcnt;
+	wait_queue_head_t wait_queue;
+	struct rcu_head	rcuhead;
+
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
 #endif
Index: linux-2.6.33-rc2/mm/mmap.c
===================================================================
--- linux-2.6.33-rc2.orig/mm/mmap.c
+++ linux-2.6.33-rc2/mm/mmap.c
@@ -187,6 +187,26 @@ error:
 	return -ENOMEM;
 }
 
+static void __free_vma_rcu_cb(struct rcu_head *head)
+{
+	struct vm_area_struct *vma;
+	vma = container_of(head, struct vm_area_struct, rcuhead);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+/* Call this if vma was linked to rb-tree */
+static void free_vma_rcu(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->rcuhead, __free_vma_rcu_cb);
+}
+#define VMA_FREE_MAGIC	(10000000)
+/* Called when the vma is unlinked; waits for all racy accesses to finish. */
+static void invalidate_vma_before_free(struct vm_area_struct *vma)
+{
+	atomic_sub(VMA_FREE_MAGIC, &vma->refcnt);
+	wait_event(vma->wait_queue,
+		(atomic_read(&vma->refcnt) == -VMA_FREE_MAGIC));
+}
+
 /*
  * Requires inode->i_mapping->i_mmap_lock
  */
@@ -238,7 +258,7 @@ static struct vm_area_struct *remove_vma
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma_rcu(vma);
 	return next;
 }
 
@@ -404,8 +424,12 @@ __vma_link_list(struct mm_struct *mm, st
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct rb_node **rb_link, struct rb_node *rb_parent)
 {
+	atomic_set(&vma->refcnt, 0);
+	init_waitqueue_head(&vma->wait_queue);
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+	/* For speculative vma lookup */
+	smp_wmb();
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -490,6 +514,7 @@ __vma_unlink(struct mm_struct *mm, struc
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
+	smp_wmb();
 }
 
 /*
@@ -614,6 +639,7 @@ again:			remove_next = 1 + (end > next->
 		 * us to remove next before dropping the locks.
 		 */
 		__vma_unlink(mm, next, vma);
+		invalidate_vma_before_free(next);
 		if (file)
 			__remove_shared_vm_struct(next, file, mapping);
 		if (next->anon_vma)
@@ -640,7 +666,7 @@ again:			remove_next = 1 + (end > next->
 		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma_rcu(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1544,6 +1570,71 @@ out:
 }
 
 /*
+ * Returns the vma which contains the given address. This scans the rb-tree
+ * in a speculative way and increments the vma's reference count if found.
+ * Even if the vma exists in the rb-tree, this function may return NULL in a
+ * racy case. So this function cannot be used to check whether a given
+ * address is valid.
+ */
+
+struct vm_area_struct *
+find_vma_speculative(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = NULL;
+	struct vm_area_struct *vma_tmp;
+	struct rb_node *rb_node;
+
+	if (unlikely(!mm))
+		return NULL;
+
+	rcu_read_lock();
+	/*
+ 	 * Barrier against modification of the rb-tree.
+ 	 * An rb-tree update is not an atomic op, and no barrier is used
+ 	 * while modifying the tree, so updates can be reordered. This
+ 	 * memory barrier pairs with vma_(un)link_rb() so that we do not
+ 	 * read data older than the changes made before we took
+ 	 * rcu_read_lock.
+ 	 *
+ 	 * We may see a broken rb-tree and fail to find an existing vma,
+ 	 * but that's ok: we are allowed to return NULL even if a valid
+ 	 * vma exists. The caller will then use find_vma() with the read
+ 	 * semaphore.
+ 	 */
+
+  	smp_read_barrier_depends();
+	rb_node = mm->mm_rb.rb_node;
+	vma = NULL;
+	while (rb_node) {
+		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+		if (vma_tmp->vm_end > addr) {
+			vma = vma_tmp;
+			if (vma_tmp->vm_start <= addr)
+				break;
+			rb_node = rb_node->rb_left;
+		} else
+			rb_node = rb_node->rb_right;
+	}
+	if (vma) {
+		if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
+			if (atomic_inc_return(&vma->refcnt) < 0) {
+				vma_put(vma);
+				vma = NULL;
+			}
+		} else
+			vma = NULL;
+	}
+	rcu_read_unlock();
+	return vma;
+}
+
+void vma_put(struct vm_area_struct *vma)
+{
+	if ((atomic_dec_return(&vma->refcnt) == -VMA_FREE_MAGIC) &&
+	    waitqueue_active(&vma->wait_queue))
+		wake_up(&vma->wait_queue);
+	return;
+}
+
+/*
  * Verify that the stack growth is acceptable and
  * update accounting. This is shared with both the
  * grow-up and grow-down cases.
@@ -1808,6 +1899,7 @@ detach_vmas_to_be_unmapped(struct mm_str
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		invalidate_vma_before_free(vma);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
Index: linux-2.6.33-rc2/include/linux/mm.h
===================================================================
--- linux-2.6.33-rc2.orig/include/linux/mm.h
+++ linux-2.6.33-rc2/include/linux/mm.h
@@ -1235,6 +1235,20 @@ static inline struct vm_area_struct * fi
 	return vma;
 }
 
+/*
+ * Look up the vma for a given address in a speculative way. This allows a
+ * lockless lookup of vmas, but in a racy case the vma may not be found. You
+ * have to fall back to find_vma() under the read lock of mm->mmap_sem when
+ * this function returns NULL.
+ * The returned vma's reference count is incremented to show that the vma is
+ * being accessed asynchronously; the caller has to call vma_put().
+ *
+ * Unlike find_vma(), this returns only a vma which contains the specified
+ * address; it doesn't return the nearest vma.
+ */
+extern struct vm_area_struct *find_vma_speculative(struct mm_struct *mm,
+	unsigned long addr);
+void vma_put(struct vm_area_struct *vma);
+
 static inline unsigned long vma_pages(struct vm_area_struct *vma)
 {
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
Index: linux-2.6.33-rc2/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.33-rc2.orig/arch/x86/mm/fault.c
+++ linux-2.6.33-rc2/arch/x86/mm/fault.c
@@ -952,6 +952,7 @@ do_page_fault(struct pt_regs *regs, unsi
 	struct mm_struct *mm;
 	int write;
 	int fault;
+	int speculative = 0;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1040,6 +1041,14 @@ do_page_fault(struct pt_regs *regs, unsi
 		return;
 	}
 
+	if ((error_code & PF_USER)) {
+		vma = find_vma_speculative(mm, address);
+		if (vma) {
+			speculative = 1;
+			goto good_area;
+		}
+	}
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
@@ -1136,5 +1145,8 @@ good_area:
 
 	check_v8086_mode(regs, address, tsk);
 
-	up_read(&mm->mmap_sem);
+	if (speculative)
+		vma_put(vma);
+	else
+		up_read(&mm->mmap_sem);
 }







[-- Attachment #2: multi-fault-all-fork.c --]
[-- Type: text/x-csrc, Size: 1861 bytes --]

/*
 * multi-fault.c :: causes 60secs of parallel page fault in multi-thread.
 * % gcc -O2 -o multi-fault multi-fault.c -lpthread
 * % multi-fault <number-of-cpus>
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#define NR_THREADS	8
pthread_t threads[NR_THREADS];
/*
 * To avoid contention on the page table lock, the faulted area is
 * sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define MMAP_LENGTH	(8 * 1024 * 1024)
#define FAULT_LENGTH	(2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE	4096

pthread_barrier_t barrier;
int name[NR_THREADS];

/* not installed by default; handy to park a thread when debugging SEGVs */
void segv_handler(int sig)
{
	sleep(100);
}
void *worker(void *data)
{
	cpu_set_t set;
	int cpu;
	int status;

	cpu = *(int *)data;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	while (1) {
		char *c;
		char *start = mmap_area[cpu];
		char *end = mmap_area[cpu] + FAULT_LENGTH;
		pthread_barrier_wait(&barrier);
		//printf("fault into %p-%p\n",start, end);

		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;
		pthread_barrier_wait(&barrier);

		/*
		 * Once per iteration, thread 0 forks a short-lived child to
		 * exercise fault handling against fork(); every thread calls
		 * wait(), so the extra waits simply fail with ECHILD.
		 */
		if ((cpu == 0) && !fork())
			exit(0);
		wait(&status);
		pthread_barrier_wait(&barrier);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int i, ret;
	unsigned int num;

	if (argc < 2)
		return 0;

	num = atoi(argv[1]);
	if (num > NR_THREADS)
		num = NR_THREADS;	/* threads[] and mmap_area[] hold at most NR_THREADS entries */
	pthread_barrier_init(&barrier, NULL, num);

	mmap_area[0] = mmap(NULL, MMAP_LENGTH * num, PROT_WRITE|PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_area[0] == MAP_FAILED) {
		perror("mmap");
		return 0;
	}
	for (i = 1; i < num; i++)
		mmap_area[i] = mmap_area[i - 1] + MMAP_LENGTH;

	for (i = 0; i < num; ++i) {
		name[i] = i;
		ret = pthread_create(&threads[i], NULL, worker, &name[i]);
		if (ret < 0) {
			perror("pthread create");
			return 0;
		}
	}
	sleep(60);
	return 0;
}

^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  3:13     ` Linus Torvalds
@ 2010-01-05  8:17       ` Peter Zijlstra
  2010-01-05  8:57       ` Peter Zijlstra
  2010-01-05  9:37       ` Peter Zijlstra
  2 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05  8:17 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Paul E. McKenney, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar

On Mon, 2010-01-04 at 19:13 -0800, Linus Torvalds wrote:

> I would say that this whole series is _very_ far from being mergeable. 

Never said anything else ;-)

> Peter seems to have been thinking about the details, while missing all the 
> subtle big picture effects that seem to actually change semantics.

Yes, I seem to have missed a rather significant side of the whole
issue.. back to the drawing board for me, then.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  5:10         ` Linus Torvalds
  2010-01-05  5:30           ` KAMEZAWA Hiroyuki
@ 2010-01-05  8:18           ` Peter Zijlstra
  1 sibling, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05  8:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney, linux-kernel,
	linux-mm, cl, hugh.dickins, Nick Piggin, Ingo Molnar

On Mon, 2010-01-04 at 21:10 -0800, Linus Torvalds wrote:
> Sounds doable. But it also sounds way more expensive than the current VM 
> fault handling, which is pretty close to optimal for single-threaded 
> cases.. That RCU lookup might be cheap, but just the refcount is generally 
> going to be as expensive as a lock. 

Right, that refcount adds two atomic ops; its only saving grace is that
it's in the vma as opposed to the mm. But there are plenty of workloads
that concentrate on a single vma, in which case you get a cacheline just
as contended as the one holding the mmap_sem.

I was trying to avoid having to have that refcount, but then sorta
forgot about the actual fault handlers also poking at the vma :/


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 4/8] mm: RCU free vmas
  2010-01-05  2:43   ` Paul E. McKenney
@ 2010-01-05  8:28     ` Peter Zijlstra
  2010-01-05 16:05       ` Paul E. McKenney
  0 siblings, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05  8:28 UTC (permalink / raw)
  To: paulmck
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, 2010-01-04 at 18:43 -0800, Paul E. McKenney wrote:
> On Mon, Jan 04, 2010 at 07:24:33PM +0100, Peter Zijlstra wrote:
> > TODO:
> >  - should be SRCU, lack of call_srcu()
> > 
> > In order to allow speculative vma lookups, RCU free the struct
> > vm_area_struct.
> > 
> > We use two means of detecting a vma is still valid:
> >  - firstly, we set RB_CLEAR_NODE once we remove a vma from the tree.
> >  - secondly, we check the vma sequence number.
> > 
> > These two things combined will guarantee that 1) the vma is still
> > present and two, it still covers the same range from when we looked it
> > up.
> 
> OK, I think I see what you are up to here.  I could get you a very crude
> throw-away call_srcu() fairly quickly.  I don't yet have a good estimate
> of how long it will take me to merge SRCU into the treercu infrastructure,
> but am getting there.
> 
> So, which release are you thinking in terms of?

I'm not thinking of any release yet; it's very early and, as Linus has
pointed out, I seem to have forgotten a rather big piece of the
picture :/

So I need to try and fix this glaring hole before we can continue.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  6:09         ` Minchan Kim
  2010-01-05  6:09           ` KAMEZAWA Hiroyuki
@ 2010-01-05  8:35           ` Peter Zijlstra
  1 sibling, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05  8:35 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Linus Torvalds, KAMEZAWA Hiroyuki, Paul E. McKenney,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, 2010-01-05 at 15:09 +0900, Minchan Kim wrote:

> Couldn't we synchronize RCU in those cases (munmap, exit, and so on)?
> That would delay munmap and exit, but it would be better than handling
> them with more complicated machinery, I think. And since neither case is
> frequent, the advantage should outweigh the disadvantage.

Sadly there are programs that mmap()/munmap() at a staggering rate
(clamav comes to mind), so munmap() performance is important too.




^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  3:13     ` Linus Torvalds
  2010-01-05  8:17       ` Peter Zijlstra
@ 2010-01-05  8:57       ` Peter Zijlstra
  2010-01-05 15:34         ` Linus Torvalds
  2010-01-05  9:37       ` Peter Zijlstra
  2 siblings, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05  8:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Paul E. McKenney, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar

On Mon, 2010-01-04 at 19:13 -0800, Linus Torvalds wrote:
> Or put another way: if the vma was a writable mapping, a user may do
> 
>         munmap(mapping, size);
> 
> and the backing file is still active and writable AFTER THE MUNMAP! This 
> can be a huge problem for something that wants to unmount the volume, for 
> example, or depends on the whole writability-vs-executability thing. The 
> user may have unmapped it, and expects the file to be immediately 
> non-busy, but with the delayed free that isn't the case any more. 

If it were only unmount it would be rather easy to fix by putting that
RCU synchronization in unmount; unmount does a lot of synchronous work
anyway. But I suspect there are more cases where that non-busy state
matters (though I'd need to educate myself on filesystems/vfs to come up
with any).


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  3:13     ` Linus Torvalds
  2010-01-05  8:17       ` Peter Zijlstra
  2010-01-05  8:57       ` Peter Zijlstra
@ 2010-01-05  9:37       ` Peter Zijlstra
  2010-01-05 23:35         ` Linus Torvalds
  2 siblings, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05  9:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Paul E. McKenney, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar

On Mon, 2010-01-04 at 19:13 -0800, Linus Torvalds wrote:
> 
> Protecting the vma isn't enough. You need to protect the whole FS stack 
> with rcu. Probably by moving _all_ of "free_vma()" into the RCU path 
> (which means that the whole file/inode gets de-allocated at that later RCU 
> point, rather than synchronously). Not just the actual kfree. 

Right, looking at that I found another interesting challenge: fput() can
sleep, and I suspect that even with call_srcu() its callbacks would have
to be atomic.

While looking at that code, I found the placement of might_sleep() a tad
confusing; I'd expect it to be in fput(), since that is the regular entry
point (everything except AIO, which does crazy things).

---
 fs/file_table.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 69652c5..6070c32 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -196,6 +196,8 @@ EXPORT_SYMBOL(alloc_file);
 
 void fput(struct file *file)
 {
+	might_sleep();
+
 	if (atomic_long_dec_and_test(&file->f_count))
 		__fput(file);
 }
@@ -236,8 +238,6 @@ void __fput(struct file *file)
 	struct vfsmount *mnt = file->f_path.mnt;
 	struct inode *inode = dentry->d_inode;
 
-	might_sleep();
-
 	fsnotify_close(file);
 	/*
 	 * The function eventpoll_release() should be the first called



^ permalink raw reply related	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
  2010-01-05  0:25   ` KAMEZAWA Hiroyuki
@ 2010-01-05 13:45   ` Arjan van de Ven
  2010-01-05 14:15     ` Andi Kleen
  2010-01-05 15:17     ` Christoph Lameter
  1 sibling, 2 replies; 121+ messages in thread
From: Arjan van de Ven @ 2010-01-05 13:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Peter Zijlstra, KAMEZAWA Hiroyuki,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, 04 Jan 2010 19:24:35 +0100
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Generic speculative fault handler, tries to service a pagefault
> without holding mmap_sem.


While I appreciate the goal of reducing contention on this lock...
wouldn't step one be to remove the page zeroing from under this lock?
That's by far (easily by 10x, I would guess) the most expensive thing
done under the lock, and I would expect a first-order reduction in
contention just from not doing the zeroing of a page under the lock...
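
Concretely, the reshuffle might look something like this sketch (just the
shape of the idea, not the actual fault path; alloc_page() and
clear_user_highpage() are the usual page allocation helpers):

	struct page *page = alloc_page(GFP_HIGHUSER);	/* may sleep */
	if (!page)
		return VM_FAULT_OOM;
	/* the expensive part, now done without any locks held */
	clear_user_highpage(page, address);

	down_read(&mm->mmap_sem);
	/* ... find_vma(), take the pte lock, revalidate, set_pte_at() ... */
	up_read(&mm->mmap_sem);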


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 13:45   ` Arjan van de Ven
@ 2010-01-05 14:15     ` Andi Kleen
  2010-01-05 15:17     ` Christoph Lameter
  1 sibling, 0 replies; 121+ messages in thread
From: Andi Kleen @ 2010-01-05 14:15 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

Arjan van de Ven <arjan@infradead.org> writes:

> On Mon, 04 Jan 2010 19:24:35 +0100
> Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
>> Generic speculative fault handler, tries to service a pagefault
>> without holding mmap_sem.
>
>
> While I appreciate the goal of reducing contention on this lock...
> wouldn't step one be to remove the page zeroing from under this lock?
> That's by far (easily by 10x, I would guess) the most expensive thing
> done under the lock, and I would expect a first-order reduction in
> contention just from not doing the zeroing of a page under the lock...

The bouncing of the shared cachelines hurts too.

I suspect fixing all of this properly will need some deeper changes.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  5:30           ` KAMEZAWA Hiroyuki
  2010-01-05  7:39             ` KAMEZAWA Hiroyuki
@ 2010-01-05 15:14             ` Christoph Lameter
  1 sibling, 0 replies; 121+ messages in thread
From: Christoph Lameter @ 2010-01-05 15:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Linus Torvalds, Minchan Kim, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar

On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:

> I'd like to hear use cases of really heavy users, too. Christoph ?

A typical use case is a highly parallel, memory-intensive process
(simulations, for example). Those are configured with the number of
supported hardware threads in mind, to max out the performance of the
underlying hardware. On startup they spawn N threads, and then each
thread begins initializing its own memory (to get proper locality it has
to be done this way; you also want concurrency during this expensive
operation).

The larger the number of threads, the more contention on the cacheline
containing mmap_sem and the other cacheline containing the rss counters.
In extreme cases we had to wait 30 minutes to an hour for the cacheline
bouncing to complete (startup of a big HPC app on IA64 with 1k threads).


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 13:45   ` Arjan van de Ven
  2010-01-05 14:15     ` Andi Kleen
@ 2010-01-05 15:17     ` Christoph Lameter
  2010-01-06  3:22       ` Arjan van de Ven
  1 sibling, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-05 15:17 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Tue, 5 Jan 2010, Arjan van de Ven wrote:

> While I appreciate the goal of reducing contention on this lock...
> wouldn't step one be to remove the page zeroing from under this lock?
> That's by far (easily by 10x, I would guess) the most expensive thing
> done under the lock, and I would expect a first-order reduction in
> contention just from not doing the zeroing of a page under the lock...

The main issue is cacheline bouncing. mmap_sem is a rw semaphore and is
only held for read during a fault.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  7:39             ` KAMEZAWA Hiroyuki
@ 2010-01-05 15:26               ` Linus Torvalds
  2010-01-05 16:14                 ` Linus Torvalds
  2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
  0 siblings, 2 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 15:26 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> #
> # Overhead          Command             Shared Object  Symbol
> # ........  ...............  ........................  ......
> #
>     43.23%  multi-fault-all  [kernel]                  [k] smp_invalidate_interrupt
>     16.27%  multi-fault-all  [kernel]                  [k] flush_tlb_others_ipi
>     11.55%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsave    <========(*)
>      6.23%  multi-fault-all  [kernel]                  [k] intel_pmu_enable_all
>      2.17%  multi-fault-all  [kernel]                  [k] _raw_spin_unlock_irqrestore

Hmm.. The default rwsem implementation shouldn't have any spin-locks in 
the fast-path. And your profile doesn't seem to have any scheduler 
footprint, so I wonder what is going on.

Oh.

Lookie here:

 - arch/x86/Kconfig.cpu:

	config X86_XADD
		def_bool y
		depends on X86_32 && !M386

 - arch/x86/Kconfig:

	config RWSEM_GENERIC_SPINLOCK
	        def_bool !X86_XADD

	config RWSEM_XCHGADD_ALGORITHM
	        def_bool X86_XADD   

it looks like X86_XADD only gets enabled on 32-bit builds. Which means 
that x86-64 in turn seems to end up always using the slower "generic 
spinlock" version.

Are you sure this isn't the reason why your profiles are horrible?

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  8:57       ` Peter Zijlstra
@ 2010-01-05 15:34         ` Linus Torvalds
  2010-01-05 15:40           ` Al Viro
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 15:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, Paul E. McKenney, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Peter Zijlstra wrote:
> 
> If it were only unmount it would be rather easy to fix by putting that
> RCU synchronization in unmount; unmount does a lot of synchronous work
> anyway. But I suspect there are more cases where that non-busy state
> matters (though I'd need to educate myself on filesystems/vfs to come up
> with any).

unmount may well be the only really huge piece.

The only other effects of delaying closing a file I can see are

 - the ETXTBUSY thing, but we don't need to delay _that_ part, so this may 
   be a non-issue.

 - the actual freeing of the data on disk (ie people may expect that the 
   last close really frees up the space on the filesystem). However, this 
   is _such_ a subtle semantic thing that maybe nobody cares.

It's perhaps worth noting that I think Nick's VFS scalability patches did 
at least _some_ of the "struct filp" freeing in RCU context too, so this 
whole "vfs delays things in RCU" is not a new thing.

But I think that in Nick's case it was strictly just the freeing of the 
inode/dentry data structure (because he needed to traverse the dentry list 
locklessly - he didn't then _use_ the results locklessly). So the actual 
filesystem operations didn't get deferred, and as a result it didn't have 
this particular semantic nightmare.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 15:34         ` Linus Torvalds
@ 2010-01-05 15:40           ` Al Viro
  2010-01-05 16:10             ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Al Viro @ 2010-01-05 15:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Paul E. McKenney,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar

On Tue, Jan 05, 2010 at 07:34:02AM -0800, Linus Torvalds wrote:

> The only other effects of delaying closing a file I can see are
> 
>  - the ETXTBUSY thing, but we don't need to delay _that_ part, so this may 
>    be a non-issue.
> 
>  - the actual freeing of the data on disk (ie people may expect that the 
>    last close really frees up the space on the filesystem). However, this 
>    is _such_ a subtle semantic thing that maybe nobody cares.

- a bunch of fs operations done from RCU callbacks.  Including severely
blocking ones.  As in "for minutes" in the most severe cases, and with
large numbers of objects involved.  Can get very unpleasant...


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 4/8] mm: RCU free vmas
  2010-01-05  8:28     ` Peter Zijlstra
@ 2010-01-05 16:05       ` Paul E. McKenney
  0 siblings, 0 replies; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-05 16:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Tue, Jan 05, 2010 at 09:28:36AM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-04 at 18:43 -0800, Paul E. McKenney wrote:
> > On Mon, Jan 04, 2010 at 07:24:33PM +0100, Peter Zijlstra wrote:
> > > TODO:
> > >  - should be SRCU, lack of call_srcu()
> > > 
> > > In order to allow speculative vma lookups, RCU free the struct
> > > vm_area_struct.
> > > 
> > > We use two means of detecting a vma is still valid:
> > >  - firstly, we set RB_CLEAR_NODE once we remove a vma from the tree.
> > >  - secondly, we check the vma sequence number.
> > > 
> > > These two things combined will guarantee that 1) the vma is still
> > > present and two, it still covers the same range from when we looked it
> > > up.
> > 
> > OK, I think I see what you are up to here.  I could get you a very crude
> > throw-away call_srcu() fairly quickly.  I don't yet have a good estimate
> > of how long it will take me to merge SRCU into the treercu infrastructure,
> > but am getting there.
> > 
> > So, which release are you thinking in terms of?
> 
> I'm not thinking of any release yet; it's very early and, as Linus has
> pointed out, I seem to have forgotten a rather big piece of the
> picture :/
> 
> So I need to try and fix this glaring hole before we can continue.

OK, then I will think in terms of merging SRCU into treercu before
providing any sort of call_srcu().

							Thanx, Paul


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 15:40           ` Al Viro
@ 2010-01-05 16:10             ` Linus Torvalds
  2010-01-06 15:41               ` Peter Zijlstra
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 16:10 UTC (permalink / raw)
  To: Al Viro
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Paul E. McKenney,
	linux-kernel, linux-mm, minchan.kim, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Al Viro wrote:
> 
> - a bunch of fs operations done from RCU callbacks.  Including severely
> blocking ones.

Yeah, you're right (and Peter also pointed out the might_sleep). That is 
likely to be the really fundamental issue. 

You _can_ handle it (make the RCU callback just schedule the work instead 
of doing it directly), but it does sound really nasty. I suspect we should 
explore just about any other approach over this one. 
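
For the record, the "schedule the work" variant would look roughly like
the sketch below; the free_work and rcu_head members of vm_area_struct
are hypothetical additions here, not existing fields:

static void vma_free_work(struct work_struct *work)
{
	struct vm_area_struct *vma =
		container_of(work, struct vm_area_struct, free_work);

	if (vma->vm_file)
		fput(vma->vm_file);	/* may sleep; fine in process context */
	kmem_cache_free(vm_area_cachep, vma);
}

static void vma_free_rcu(struct rcu_head *head)
{
	struct vm_area_struct *vma =
		container_of(head, struct vm_area_struct, rcu_head);

	/* RCU callbacks run in softirq context, so punt to a workqueue */
	INIT_WORK(&vma->free_work, vma_free_work);
	schedule_work(&vma->free_work);
}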

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 15:26               ` Linus Torvalds
@ 2010-01-05 16:14                 ` Linus Torvalds
  2010-01-05 17:25                   ` Andi Kleen
  2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 16:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Tue, 5 Jan 2010, Linus Torvalds wrote:
> 
> Lookie here:
> 
>  - arch/x86/Kconfig.cpu:
> 
> 	config X86_XADD
> 		def_bool y
> 		depends on X86_32 && !M386
> 
>  - arch/x86/Kconfig:
> 
> 	config RWSEM_GENERIC_SPINLOCK
> 	        def_bool !X86_XADD
> 
> 	config RWSEM_XCHGADD_ALGORITHM
> 	        def_bool X86_XADD   
> 
> it looks like X86_XADD only gets enabled on 32-bit builds. Which means 
> that x86-64 in turn seems to end up always using the slower "generic 
> spinlock" version.

Sadly, it's not as easy as just changing the X86_XADD "depends on" to say 
"X86_64 || !M386" instead. That just results in

	kernel/built-in.o: In function `up_read':
	(.text+0x2d8e5): undefined reference to `call_rwsem_wake'

etc, because the x86-64 code has obviously never seen the optimized 
call-paths, and they need the asm wrappers for full semantics.

Oh well. Somebody who is bored might look at trying to make the wrapper 
code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the 
successful rwsem cases much faster.

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 16:14                 ` Linus Torvalds
@ 2010-01-05 17:25                   ` Andi Kleen
  2010-01-05 17:47                     ` Christoph Lameter
  2010-01-05 17:55                     ` Linus Torvalds
  0 siblings, 2 replies; 121+ messages in thread
From: Andi Kleen @ 2010-01-05 17:25 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, linux-kernel, linux-mm, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar

Linus Torvalds <torvalds@linux-foundation.org> writes:
>
> etc, because the x86-64 code has obviously never seen the optimized 
> call-paths, and they need the asm wrappers for full semantics.

IIRC Andrea ran benchmarks at some point and it didn't make too much
difference on the systems back then (the K8 era), given that K8 has fast
atomics.

> Oh well. Somebody who is bored might look at trying to make the wrapper 
> code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the 
> successful rwsem cases much faster.

Maybe, maybe not.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 17:25                   ` Andi Kleen
@ 2010-01-05 17:47                     ` Christoph Lameter
  2010-01-05 18:00                       ` Andi Kleen
  2010-01-05 17:55                     ` Linus Torvalds
  1 sibling, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-05 17:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Linus Torvalds, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, 5 Jan 2010, Andi Kleen wrote:

> > Oh well. Somebody who is bored might look at trying to make the wrapper
> > code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
> > successful rwsem cases much faster.
>
> Maybe, maybe not.

What we saw in the past was that the cost of acquiring exclusive
access to the contended cachelines containing mmap_sem and rss counters
was most important.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 17:25                   ` Andi Kleen
  2010-01-05 17:47                     ` Christoph Lameter
@ 2010-01-05 17:55                     ` Linus Torvalds
  2010-01-05 18:13                       ` Christoph Lameter
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 17:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, linux-kernel, linux-mm, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Andi Kleen wrote:
> 
> > Oh well. Somebody who is bored might look at trying to make the wrapper 
> > code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the 
> > successful rwsem cases much faster.
> 
> Maybe, maybe not.

If there is actual contention on the lock, but mainly just readers (which 
is what the profile indicates: since there is no scheduler footprint, the 
actual writer-vs-reader case is probably very rare), then the xadd is 
likely to be _much_ faster than the spinlock.

Sure, the cacheline is going to bounce regardless (since it's a shared 
per-mm data structure), but the spinlock is going to bounce wildly 
back-and-forth between everybody who _tries_ to get it, while the regular 
xadd is going to bounce just once per actual successful xadd.

So a spinlock is as cheap as an atomic when there is no contention (which 
is the common single-thread case - the real cost of both lock and atomic 
is simply the fact that CPU serialization is expensive), but when there is 
actual lock contention, I bet the atomic xadd is going to be shown to be 
superior.

Remember: we commonly claim that 'spin_unlock' is basically free on x86 - 
and that's true, but it is _only_ true for the uncontended state. 

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 17:47                     ` Christoph Lameter
@ 2010-01-05 18:00                       ` Andi Kleen
  0 siblings, 0 replies; 121+ messages in thread
From: Andi Kleen @ 2010-01-05 18:00 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Linus Torvalds, KAMEZAWA Hiroyuki, Minchan Kim,
	Peter Zijlstra, Paul E. McKenney, Peter Zijlstra, linux-kernel,
	linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, Jan 05, 2010 at 11:47:49AM -0600, Christoph Lameter wrote:
> On Tue, 5 Jan 2010, Andi Kleen wrote:
> 
> > > Oh well. Somebody who is bored might look at trying to make the wrapper
> > > code in arch/x86/lib/semaphore_32.S work on x86-64 too. It should make the
> > > successful rwsem cases much faster.
> >
> > Maybe, maybe not.
> 
> What we saw in the past was that the cost of acquiring exclusive
> access to the contended cachelines containing mmap_sem and rss counters
> was most important.

That cost should be about the same between the spinlock and the atomic version.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 17:55                     ` Linus Torvalds
@ 2010-01-05 18:13                       ` Christoph Lameter
  2010-01-05 18:25                         ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-05 18:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, 5 Jan 2010, Linus Torvalds wrote:

> So a spinlock is as cheap as an atomic when there is no contention (which
> is the common single-thread case - the real cost of both lock and atomic
> is simply the fact that CPU serialization is expensive), but when there is
> actual lock contention, I bet the atomic xadd is going to be shown to be
> superior.
>
> Remember: we commonly claim that 'spin_unlock' is basically free on x86 -
> and that's true, but it is _only_ true for the uncontended state.

It's also free if the MESI algorithm has been tuned in such a way that the
exclusive cacheline that was just acquired is not immediately released
after a single access.

If the critical section protected by the spinlock is small then the
delay will keep the cacheline exclusive until we hit the unlock. This
is the case here as far as I can tell.



^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:13                       ` Christoph Lameter
@ 2010-01-05 18:25                         ` Linus Torvalds
  2010-01-05 18:46                           ` Christoph Lameter
                                             ` (2 more replies)
  0 siblings, 3 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 18:25 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Christoph Lameter wrote:
> 
> If the critical section protected by the spinlock is small then the
> delay will keep the cacheline exclusive until we hit the unlock. This
> is the case here as far as I can tell.

I hope somebody can time it. Because I think the idle reads on all the 
(unsuccessful) spinlocks will kill it.

Think of it this way: under heavy contention, you'll see a lot of people 
waiting for the spinlocks and one of them succeeds at writing it, reading 
the line. So you get an O(n^2) bus traffic access pattern. In contrast, 
with an xadd, you get O(n) behavior - everybody does _one_ acquire-for- 
write bus access.

Remember: the critical section is small, but since you're contending on 
the spinlock, that doesn't much _help_. The readers are all hitting the 
lock (and you can try to solve the O(n^2) issue with back-off, but quite 
frankly, anybody who does that has basically already lost - I'm personally 
convinced you should never do lock backoff, and instead look at what you 
did wrong at a higher level instead).

				Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:25                         ` Linus Torvalds
@ 2010-01-05 18:46                           ` Christoph Lameter
  2010-01-05 18:56                             ` Linus Torvalds
  2010-01-05 18:55                           ` Paul E. McKenney
  2010-01-05 20:29                           ` Peter Zijlstra
  2 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-05 18:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, 5 Jan 2010, Linus Torvalds wrote:

> I hope somebody can time it. Because I think the idle reads on all the
> (unsuccessful) spinlocks will kill it.
>
> Think of it this way: under heavy contention, you'll see a lot of people
> waiting for the spinlocks and one of them succeeds at writing it, reading
> the line. So you get an O(n^2) bus traffic access pattern. In contrast,
> with an xadd, you get O(n) behavior - everybody does _one_ acquire-for-
> write bus access.

There will not be much spinning. The cacheline will be held exclusively by
one processor. A request by other processors for shared access to the
cacheline will effectively stop the execution on those processors until
the cacheline is available.

Access to main memory is mediated by the cache controller anyway. As
long as the cacheline has not been invalidated it will be reused. Maybe
Andi can give us more details on the exact behavior on recent Intel
processors?

One thing that could change with the removal of the spinlock is that the
initial request for the cacheline will not be for read anymore but for
write. This saves us the upgrade from read / shared state to exclusive access.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:25                         ` Linus Torvalds
  2010-01-05 18:46                           ` Christoph Lameter
@ 2010-01-05 18:55                           ` Paul E. McKenney
  2010-01-05 19:08                             ` Linus Torvalds
  2010-01-05 20:29                           ` Peter Zijlstra
  2 siblings, 1 reply; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-05 18:55 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim,
	Peter Zijlstra, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, Jan 05, 2010 at 10:25:43AM -0800, Linus Torvalds wrote:
> 
> 
> On Tue, 5 Jan 2010, Christoph Lameter wrote:
> > 
> > If the critical section protected by the spinlock is small then the
> > delay will keep the cacheline exclusive until we hit the unlock. This
> > is the case here as far as I can tell.
> 
> I hope somebody can time it. Because I think the idle reads on all the 
> (unsuccessful) spinlocks will kill it.

But on many systems, it does take some time for the idle reads to make
their way to the CPU that just acquired the lock.  My (admittedly dated)
experience is that the CPU acquiring the lock has a few bus clocks
before the cache line containing the lock gets snatched away.

> Think of it this way: under heavy contention, you'll see a lot of people 
> waiting for the spinlocks and one of them succeeds at writing it, reading 
> the line. So you get an O(n^2) bus traffic access pattern. In contrast, 
> with an xadd, you get O(n) behavior - everybody does _one_ acquire-for- 
> write bus access.

xadd (and xchg) certainly are nicer where they apply!

> Remember: the critical section is small, but since you're contending on 
> the spinlock, that doesn't much _help_. The readers are all hitting the 
> lock (and you can try to solve the O(n^2) issue with back-off, but quite 
> frankly, anybody who does that has basically already lost - I'm personally 
> convinced you should never do lock backoff, and instead look at what you 
> did wrong at a higher level instead).

Music to my ears!  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:46                           ` Christoph Lameter
@ 2010-01-05 18:56                             ` Linus Torvalds
  2010-01-05 19:15                               ` Christoph Lameter
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 18:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Christoph Lameter wrote:
> 
> There will not be much spinning. The cacheline will be held exclusively by
> one processor. A request by other processors for shared access to the
> cacheline will effectively stop the execution on those processors until
> the cacheline is available.

AT WHICH POINT THEY ALL RACE TO GET IT TO SHARED MODE, only to then have 
one of them actually see that it got a ticket!

Don't you see the problem? The spinlock (with ticket locks) essentially 
does the "xadd" atomic increment anyway, but then it _waits_ for it. All 
totally pointless, and all just making for problems, and wasting CPU time 
and causing more cross-node traffic.

In contrast, a native xadd-based rwsem will basically do the atomic 
increment (that ticket spinlocks also do) and then just go on with its 
life. Never waiting for all the other CPU's to also do their ticket 
spinlocks.

The "short critical region" you talk about is totally meaningless, since 
the contention will mean that everybody is waiting and hitting that 
cacheline for a long time regardless - they'll be waiting for O(n) 
_other_ CPU's to get the lock first!

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:55                           ` Paul E. McKenney
@ 2010-01-05 19:08                             ` Linus Torvalds
  2010-01-05 19:23                               ` Paul E. McKenney
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 19:08 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim,
	Peter Zijlstra, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Paul E. McKenney wrote:
> 
> But on many systems, it does take some time for the idle reads to make
> their way to the CPU that just acquired the lock.

Yes. But the point is that there is lots of them.

So think of it this way: every time _one_ CPU acquires a lock (and 
then releases it), _all_ CPU's will read the new value. Imagine the 
cross-socket traffic.

In contrast, doing just a single xadd (which replaces the whole 
"spin_lock+non-atomics+spin_unlock"), every times _once_ CPU cquires a 
lock, that's it. The other CPU's arent' all waiting in line for the lock 
to be released, and reading the cacheline to see if it's their turn.

Sure, after they got the lock they'll all eventually end up reading from 
that cacheline that contains 'struct mm_struct', but that's something we 
could even think about trying to minimize by putting the mmap_sem as far 
away from the other fields as possible.

Now, it's very possible that if you have a broadcast model of cache 
coherency, none of this much matters and you end up with almost all the 
same bus traffic anyway. But I do think it could matter a lot.

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:56                             ` Linus Torvalds
@ 2010-01-05 19:15                               ` Christoph Lameter
  2010-01-05 19:28                                 ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-05 19:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, 5 Jan 2010, Linus Torvalds wrote:

> Don't you see the problem? The spinlock (with ticket locks) essentially
> does the "xadd" atomic increment anyway, but then it _waits_ for it. All
> totally pointless, and all just making for problems, and wasting CPU time
> and causing more cross-node traffic.

For the xadd to work it first would have to acquire the cacheline
exclusively. That can only occur on a single processor, and with a
suitable minimum hold time for exclusive cachelines everything should be
fine. Once the critical section is done (unlock complete), the next
processor (which had been stalled waiting to access the cacheline) can
acquire the lock.

The wait state is the processor being stopped due to not being able to
access the cacheline. Not the processor spinning in the xadd loop. That
only occurs if the critical section is longer than the timeout.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 19:08                             ` Linus Torvalds
@ 2010-01-05 19:23                               ` Paul E. McKenney
  0 siblings, 0 replies; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-05 19:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim,
	Peter Zijlstra, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, Jan 05, 2010 at 11:08:46AM -0800, Linus Torvalds wrote:
> 
> 
> On Tue, 5 Jan 2010, Paul E. McKenney wrote:
> > 
> > But on many systems, it does take some time for the idle reads to make
> > their way to the CPU that just acquired the lock.
> 
> Yes. But the point is that there is lots of them.
> 
> So think of it this way: every time _one_ CPU acquires a lock (and 
> then releases it), _all_ CPU's will read the new value. Imagine the 
> cross-socket traffic.

Yep, been there, and with five-microsecond cross-"socket" latencies.
Of course, the CPU clock frequencies were a bit slower back then.

> In contrast, doing just a single xadd (which replaces the whole 
> "spin_lock+non-atomics+spin_unlock"), every times _once_ CPU cquires a 
> lock, that's it. The other CPU's arent' all waiting in line for the lock 
> to be released, and reading the cacheline to see if it's their turn.
> 
> Sure, after they got the lock they'll all eventually end up reading from 
> that cacheline that contains 'struct mm_struct', but that's something we 
> could even think about trying to minimize by putting the mmap_sem as far 
> away from the other fields as possible.
> 
> Now, it's very possible that if you have a broadcast model of cache 
> coherency, none of this much matters and you end up with almost all the 
> same bus traffic anyway. But I do think it could matter a lot.

I have seen systems that work both ways.  If the CPU has enough time
between getting the cache line in unlocked state to lock it, do the
modification, and release the lock before the first in the flurry of
reads arrives, then performance will be just fine.  Each reader will
see the cache line with an unlocked lock and life will be good.

On the other hand, as you say, if the first in the flurry of reads arrives
before the CPU has managed to make its way through the critical section,
then your n^2 nightmare comes true.

If the critical section operates only on the cache line containing the
lock, and if the critical section has only a handful of instructions,
I bet you win on almost all platforms.  But, like you, I would still
prefer the xadd (or xchg or whatever) to the lock, where feasible.
But you cannot expect all systems to see a major performance boost from
switching from locks to xadds.  Often, all you get is sounder sleep.
Which is valuable in its own right.  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 19:15                               ` Christoph Lameter
@ 2010-01-05 19:28                                 ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 19:28 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim, Peter Zijlstra,
	Paul E. McKenney, Peter Zijlstra, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Christoph Lameter wrote:
> 
> The wait state is the processor being stopped due to not being able to
> access the cacheline. Not the processor spinning in the xadd loop. That
> only occurs if the critical section is longer than the timeout.

You don't know what you're talking about, do you?

Just go and read the source code.

The process is spinning currently in the spin_lock loop. Here, I'll quote 
it to you:

                LOCK_PREFIX "xaddw %w0, %1\n"
                "1:\t"
                "cmpb %h0, %b0\n\t"
                "je 2f\n\t"
                "rep ; nop\n\t"
                "movb %1, %b0\n\t"
                /* don't need lfence here, because loads are in-order */
                "jmp 1b\n"

note the loop that spins - reading the thing over and over - waiting for 
_that_ CPU to be the owner of the xadd ticket?

That's the one you have now, only because x86-64 uses the STUPID FALLBACK 
CODE for the rwsemaphores!

In contrast, look at what the non-stupid rwsemaphore code does (which 
triggers on x86-32):

                     LOCK_PREFIX "  incl      (%%eax)\n\t"
                     /* adds 0x00000001, returns the old value */
                     "  jns        1f\n"
                     "  call call_rwsem_down_read_failed\n"

(that's a "down_read()", which happens to be the op we care most about. 
See? That's a single locked "inc" (it avoids the xadd on the read side 
because of how we've biased things). In particular, notice how this means 
that we do NOT have fifty million CPU's all trying to read the same 
location while one writes to it successfully.

Spot the difference?

Putting it another way: which of these scenarios do you think 
should result in less cross-node traffic:

 - multiple CPU's that - one by one - get the cacheline for exclusive 
   access.

 - multiple CPU's that - one by one - get the cacheline for exclusive 
   access, while other CPU's are all trying to read the same cacheline at 
   the same time, over and over again, in a loop.

See the shared part? See the difference? If you look at just a single lock 
acquire, it boils down to these two scenarios

 - one CPU gets the cacheline exclusively

 - one CPU gets the cacheline exclusively while <n> other CPU's are all 
   trying to read the old and the new value.

It really is that simple.

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 18:25                         ` Linus Torvalds
  2010-01-05 18:46                           ` Christoph Lameter
  2010-01-05 18:55                           ` Paul E. McKenney
@ 2010-01-05 20:29                           ` Peter Zijlstra
  2010-01-05 20:46                             ` Linus Torvalds
  2010-01-05 23:29                             ` Paul E. McKenney
  2 siblings, 2 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-05 20:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar

On Tue, 2010-01-05 at 10:25 -0800, Linus Torvalds wrote:
> The readers are all hitting the 
> lock (and you can try to solve the O(n^2) issue with back-off, but quite 
> frankly, anybody who does that has basically already lost 

/me sneaks in a reference to local spinning spinlocks just to have them
mentioned.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 20:29                           ` Peter Zijlstra
@ 2010-01-05 20:46                             ` Linus Torvalds
  2010-01-05 21:00                               ` Linus Torvalds
  2010-01-05 23:29                             ` Paul E. McKenney
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 20:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Peter Zijlstra wrote:
>
> On Tue, 2010-01-05 at 10:25 -0800, Linus Torvalds wrote:
> > The readers are all hitting the 
> > lock (and you can try to solve the O(n^2) issue with back-off, but quite 
> > frankly, anybody who does that has basically already lost 
> 
> /me sneaks in a reference to local spinning spinlocks just to have them
> mentioned.

Were you in on the discussion with (I think) Andy Glew about that? I don't 
think he ever came up with an actual workable solution, but he tried. He 
worked for Intel at the time (maybe still does), and was looking at 
architectural improvements. I think you need a "compare-and-exchange-2- 
separate-words" instruction to make it work (not "cmpxchg8/16b" - 
literally two _different_ words).

I think m68k can do it. Even then it's a lot more expensive than a regular 
lock for any reasonable common case.
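
For reference, the semantics of such a two-word compare-and-exchange,
written as plain (non-atomic) C pseudocode; the whole body would have to
execute as one atomic transaction, which is exactly what the instruction
provides and this C code does not:

bool dcas(unsigned long *a, unsigned long old_a, unsigned long new_a,
	  unsigned long *b, unsigned long old_b, unsigned long new_b)
{
	if (*a != old_a || *b != old_b)
		return false;
	*a = new_a;
	*b = new_b;
	return true;
}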

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 20:46                             ` Linus Torvalds
@ 2010-01-05 21:00                               ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 21:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Linus Torvalds wrote:
>
> I think you need a "compare-and-exchange-2-separate-words" instruction 
> to make it work (not "cmpxchg8/16b" - literally two _different_ words).

Btw, I might be misremembering - Andy was looking at various lockless 
algorithms too. Maybe the problem was the non-local space requirement. 
There were several spin-lock variants that would be improved if we could 
pass a cookie from the 'lock' to the 'unlock'.

In fact, even the ticket locks would be improved by that, since we could
then possibly do the unlock as a plain 'store' rather than an 'add',
keeping the next-owner cookie in a register across the critical section
rather than unlocking by incrementing it in the nasty lock cacheline.
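
A minimal user-space sketch of that cookie idea, assuming C11 atomics
(illustrative only -- not the kernel's actual ticket lock):

#include <stdatomic.h>

struct ticket_lock {
	atomic_uint next;	/* ticket dispenser */
	atomic_uint owner;	/* ticket currently being served */
};

/* lock() hands back a cookie (my ticket + 1) that the caller keeps in
 * a register across the critical section */
static unsigned ticket_lock(struct ticket_lock *l)
{
	unsigned t = atomic_fetch_add_explicit(&l->next, 1,
					       memory_order_relaxed);
	while (atomic_load_explicit(&l->owner, memory_order_acquire) != t)
		;	/* spin until our ticket comes up */
	return t + 1;	/* the next-owner cookie */
}

/* unlock() becomes a plain store of a precomputed value instead of an
 * atomic increment on the contended cacheline */
static void ticket_unlock(struct ticket_lock *l, unsigned cookie)
{
	atomic_store_explicit(&l->owner, cookie, memory_order_release);
}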

		Linus


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 20:29                           ` Peter Zijlstra
  2010-01-05 20:46                             ` Linus Torvalds
@ 2010-01-05 23:29                             ` Paul E. McKenney
  1 sibling, 0 replies; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-05 23:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Christoph Lameter, Andi Kleen, KAMEZAWA Hiroyuki,
	Minchan Kim, linux-kernel, linux-mm, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, Jan 05, 2010 at 09:29:16PM +0100, Peter Zijlstra wrote:
> On Tue, 2010-01-05 at 10:25 -0800, Linus Torvalds wrote:
> > The readers are all hitting the
> > lock (and you can try to solve the O(n^2) issue with back-off, but quite
> > frankly, anybody who does that has basically already lost
> 
> /me sneaks in a reference to local spinning spinlocks just to have them
> mentioned.

Been there, done that.  More than once.  One of them remains in
production use.

The trick is to use a normal spinlock at low levels of contention, and
switch to a more elaborate structure if contention becomes a problem --
if a task spins for too long on the normal spinlock, it sets a bit
in the normal spinlock word.  A separate structure allowed tasks to
spin on their own lock word, and also arranged to hand the lock off
to requestors on the same NUMA node as the task releasing the lock in
order to reduce the average cache-miss latency, but also bounding the
resulting unfairness.  It also avoided handing locks off to tasks whose
local spins were interrupted, the idea being that if contention is high,
the probability of being interrupted while spinning is higher than that
of being interrupted while holding the lock (since the time spent spinning
is much greater than the time spent actually holding the lock).

The race conditions were extremely challenging, so, in most cases,
designing for low contention seems -much- more productive.  ;-)
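
A very rough sketch of that two-phase shape, with all of the hard races
elided (queue_and_spin_locally() is only a named placeholder for the
elaborate structure described above):

#include <stdatomic.h>

#define CONTENDED	0x80000000u
#define SPIN_LIMIT	1000

/* placeholder: queue on a per-task word, spin locally, and do the
 * NUMA-aware, bounded-unfairness handoff on release */
extern void queue_and_spin_locally(atomic_uint *lock);

void hybrid_lock(atomic_uint *lock)
{
	for (int tries = 0; tries < SPIN_LIMIT; tries++) {
		unsigned expected = 0;
		if (atomic_compare_exchange_weak(lock, &expected, 1))
			return;		/* fast path: plain spinlock */
	}
	/* spun too long: flag the switch to the elaborate structure */
	atomic_fetch_or(lock, CONTENDED);
	queue_and_spin_locally(lock);
}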

							Thanx, Paul


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05  9:37       ` Peter Zijlstra
@ 2010-01-05 23:35         ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-05 23:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, Paul E. McKenney, linux-kernel, linux-mm,
	minchan.kim, cl, hugh.dickins, Nick Piggin, Ingo Molnar



On Tue, 5 Jan 2010, Peter Zijlstra wrote:
> 
> While looking at that code, I found the placement of might_sleep() a tad
> confusing, I'd expect that to be in fput() since that is the regular
> entry point (all except AIO, which does crazy things).

I think it's the crazy AIO thing that is one of the reasons for doing it 
in __fput. If it's just in fput() it covers the _normal_ cases better, but 
I doubt that those were the primary worry ;). 

		Linus


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 15:26               ` Linus Torvalds
  2010-01-05 16:14                 ` Linus Torvalds
@ 2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
  2010-01-06  1:37                   ` Linus Torvalds
  1 sibling, 1 reply; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-06  0:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, 5 Jan 2010 07:26:31 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 5 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > #
> > # Overhead          Command             Shared Object  Symbol
> > # ........  ...............  ........................  ......
> > #
> >     43.23%  multi-fault-all  [kernel]                  [k] smp_invalidate_interrupt
> >     16.27%  multi-fault-all  [kernel]                  [k] flush_tlb_others_ipi
> >     11.55%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsave    <========(*)
> >      6.23%  multi-fault-all  [kernel]                  [k] intel_pmu_enable_all
> >      2.17%  multi-fault-all  [kernel]                  [k] _raw_spin_unlock_irqrestore
> 
> Hmm.. The default rwsem implementation shouldn't have any spin-locks in 
> the fast-path. And your profile doesn't seem to have any scheduler 
> footprint, so I wonder what is going on.
> 
> Oh.
> 
> Lookie here:
> 
>  - arch/x86/Kconfig.cpu:
> 
> 	config X86_XADD
> 		def_bool y
> 		depends on X86_32 && !M386
> 
>  - arch/x86/Kconfig:
> 
> 	config RWSEM_GENERIC_SPINLOCK
> 	        def_bool !X86_XADD
> 
> 	config RWSEM_XCHGADD_ALGORITHM
> 	        def_bool X86_XADD   
> 
> it looks like X86_XADD only gets enabled on 32-bit builds. Which means 
> that x86-64 in turn seems to end up always using the slower "generic 
> spinlock" version.
> 
> Are you sure this isn't the reason why your profiles are horrible?
> 
I think this is the first reason, but I haven't rewritten rwsem itself and
tested it yet, sorry.

This is a profile from another test.
==
2.6.33-rc2's score of the same test program is here.

    75.42%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsav
            |
            --- _raw_spin_lock_irqsave
               |
               |--49.13%-- __down_read_trylock
               |          down_read_trylock
               |          do_page_fault
               |          page_fault
               |          0x400950
               |          |
               |           --100.00%-- (nil)
               |
               |--46.92%-- __up_read
               |          up_read
               |          |
               |          |--99.99%-- do_page_fault
               |          |          page_fault
               |          |          0x400950
               |          |          (nil)
               |           --0.01%-- [...]
==
yes, the spinlock is from rwsem.

The reason I tried "skipping rwsem" is that I would rather avoid the lock
than rewrite the lock itself, considering the impact of the patch....


Thanks,
-Kame





* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
@ 2010-01-06  1:37                   ` Linus Torvalds
  2010-01-06  2:52                     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-06  1:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> I think this is the first reason, but I haven't rewritten rwsem itself and
> tested it yet, sorry.

Here's a totally untested patch! It may or may not work. It builds for me, 
but that may be some cosmic accident. I _think_ I got the callee-clobbered 
register set right, but somebody should check the comment in the new 
rwsem_64.S, and double-check that the code actually matches what I tried 
to do.

I had to change the inline asm to get the register sizes right too, so for 
all I know this screws up x86-32 too.

In other words: UNTESTED! It may molest your pets and drink all your beer. 
You have been warned.

(Is a 16-bit rwsem count even enough for x86-64? Possibly not. That's a 
separate issue entirely, though).
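
For reference, the layout behind that worry, as the 32-bit rwsem.h of the
era packed it (each active reader or writer adds ACTIVE_BIAS, so the
active count only has 16 bits to play with):

#define RWSEM_UNLOCKED_VALUE		0x00000000
#define RWSEM_ACTIVE_BIAS		0x00000001
#define RWSEM_ACTIVE_MASK		0x0000ffff	/* active count: 16 bits */
#define RWSEM_WAITING_BIAS		(-0x00010000)
#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

/* somewhere around 32k concurrent readers, the active count would
 * overflow into the waiting field -- unlikely, but no longer absurd
 * on big x86-64 boxes */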

		Linus

---
 arch/x86/Kconfig.cpu         |    2 +-
 arch/x86/include/asm/rwsem.h |   18 +++++-----
 arch/x86/lib/Makefile        |    1 +
 arch/x86/lib/rwsem_64.S      |   81 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index f20ddf8..a198293 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT
 
 config X86_XADD
 	def_bool y
-	depends on X86_32 && !M386
+	depends on X86_64 || !M386
 
 config X86_PPRO_FENCE
 	bool "PentiumPro memory ordering errata workaround"
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ca7517d..625baca 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -65,7 +65,7 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
 struct rw_semaphore {
-	signed long		count;
+	__s32			count;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +105,7 @@ do {								\
 static inline void __down_read(struct rw_semaphore *sem)
 {
 	asm volatile("# beginning down_read\n\t"
-		     LOCK_PREFIX "  incl      (%%eax)\n\t"
+		     LOCK_PREFIX "  incl      (%1)\n\t"
 		     /* adds 0x00000001, returns the old value */
 		     "  jns        1f\n"
 		     "  call call_rwsem_down_read_failed\n"
@@ -147,7 +147,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 
 	tmp = RWSEM_ACTIVE_WRITE_BIAS;
 	asm volatile("# beginning down_write\n\t"
-		     LOCK_PREFIX "  xadd      %%edx,(%%eax)\n\t"
+		     LOCK_PREFIX "  xaddl     %%edx,(%2)\n\t"
 		     /* subtract 0x0000ffff, returns the old value */
 		     "  testl     %%edx,%%edx\n\t"
 		     /* was the count 0 before? */
@@ -170,9 +170,9 @@ static inline void __down_write(struct rw_semaphore *sem)
  */
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
-	signed long ret = cmpxchg(&sem->count,
-				  RWSEM_UNLOCKED_VALUE,
-				  RWSEM_ACTIVE_WRITE_BIAS);
+	__s32 ret = cmpxchg(&sem->count,
+			    RWSEM_UNLOCKED_VALUE,
+			    RWSEM_ACTIVE_WRITE_BIAS);
 	if (ret == RWSEM_UNLOCKED_VALUE)
 		return 1;
 	return 0;
@@ -185,7 +185,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 {
 	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
 	asm volatile("# beginning __up_read\n\t"
-		     LOCK_PREFIX "  xadd      %%edx,(%%eax)\n\t"
+		     LOCK_PREFIX "  xaddl     %%edx,(%2)\n\t"
 		     /* subtracts 1, returns the old value */
 		     "  jns        1f\n\t"
 		     "  call call_rwsem_wake\n"
@@ -203,7 +203,7 @@ static inline void __up_write(struct rw_semaphore *sem)
 {
 	asm volatile("# beginning __up_write\n\t"
 		     "  movl      %2,%%edx\n\t"
-		     LOCK_PREFIX "  xaddl     %%edx,(%%eax)\n\t"
+		     LOCK_PREFIX "  xaddl     %%edx,(%1)\n\t"
 		     /* tries to transition
 			0xffff0001 -> 0x00000000 */
 		     "  jz       1f\n"
@@ -221,7 +221,7 @@ static inline void __up_write(struct rw_semaphore *sem)
 static inline void __downgrade_write(struct rw_semaphore *sem)
 {
 	asm volatile("# beginning __downgrade_write\n\t"
-		     LOCK_PREFIX "  addl      %2,(%%eax)\n\t"
+		     LOCK_PREFIX "  addl      %2,(%1)\n\t"
 		     /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
 		     "  jns       1f\n\t"
 		     "  call call_rwsem_downgrade_wake\n"
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754..c802451 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -39,4 +39,5 @@ else
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+	lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
 endif
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 0000000..15acecf
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ *   call_rwsem_down_write_failed
+ *   call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#define save_common_regs \
+	pushq %rdi; \
+	pushq %rsi; \
+	pushq %rcx; \
+	pushq %r8; \
+	pushq %r9; \
+	pushq %r10; \
+	pushq %r11
+
+#define restore_common_regs \
+	popq %r11; \
+	popq %r10; \
+	popq %r9; \
+	popq %r8; \
+	popq %rcx; \
+	popq %rsi; \
+	popq %rdi
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+	save_common_regs
+	pushq %rdx
+	movq %rax,%rdi
+	call rwsem_down_read_failed
+	popq %rdx
+	restore_common_regs
+	ret
+	ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+	save_common_regs
+	movq %rax,%rdi
+	call rwsem_down_write_failed
+	restore_common_regs
+	ret
+	ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+	decw %dx    /* do nothing if still outstanding active readers */
+	jnz 1f
+	save_common_regs
+	movq %rax,%rdi
+	call rwsem_wake
+	restore_common_regs
+1:	ret
+	ENDPROC(call_rwsem_wake)
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_downgrade_wake)
+	save_common_regs
+	pushq %rdx
+	movq %rax,%rdi
+	call rwsem_downgrade_wake
+	popq %rdx
+	restore_common_regs
+	ret
+	ENDPROC(call_rwsem_downgrade_wake)


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  1:37                   ` Linus Torvalds
@ 2010-01-06  2:52                     ` KAMEZAWA Hiroyuki
  2010-01-06  3:27                       ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-06  2:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, 5 Jan 2010 17:37:08 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > I think this is the first reason, but I haven't rewritten rwsem itself and
> > tested it yet, sorry.
> 
> Here's a totally untested patch! It may or may not work. It builds for me, 
> but that may be some cosmic accident. I _think_ I got the callee-clobbered 
> register set right, but somebody should check the comment in the new 
> rwsem_64.S, and double-check that the code actually matches what I tried 
> to do.
> 
> I had to change the inline asm to get the register sizes right too, so for 
> all I know this screws up x86-32 too.
> 
> In other words: UNTESTED! It may molest your pets and drink all your beer. 
> You have been warned.
> 
Thank you for warning ;)
My host boots successfully. Here is the result.


Result of Linus's rwsem XADD patch.

Test:
	while (1) {
		touch memory
		barrier
		fork()->exit() if cpu==0
		barrier
	}

# Samples: 1121655736712
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    50.26%  multi-fault-all  [kernel]                  [k] smp_invalidate_interrup
    15.94%  multi-fault-all  [kernel]                  [k] flush_tlb_others_ipi
     6.50%  multi-fault-all  [kernel]                  [k] intel_pmu_enable_all
     3.17%  multi-fault-all  [kernel]                  [k] down_read_trylock
     2.08%  multi-fault-all  [kernel]                  [k] do_wp_page
     1.69%  multi-fault-all  [kernel]                  [k] page_fault
     1.63%  multi-fault-all  ./multi-fault-all-fork    [.] worker
     1.53%  multi-fault-all  [kernel]                  [k] up_read
     1.35%  multi-fault-all  [kernel]                  [k] do_page_fault
     1.24%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     1.10%  multi-fault-all  [kernel]                  [k] flush_tlb_page
     0.96%  multi-fault-all  [kernel]                  [k] invalidate_interrupt0
     0.92%  multi-fault-all  [kernel]                  [k] invalidate_interrupt3
     0.90%  multi-fault-all  [kernel]                  [k] invalidate_interrupt2


Test:
	while (1) {
		touch memory
		barrier
		madvice DONTNEED to locally touched memory.
		barrier
	}


# Samples: 1335012531823
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    32.17%  multi-fault-all  [kernel]                  [k] clear_page_c
     9.60%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     8.14%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsave
     6.23%  multi-fault-all  [kernel]                  [k] down_read_trylock
     4.98%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irq
     4.63%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charge
     4.45%  multi-fault-all  [kernel]                  [k] up_read
     3.83%  multi-fault-all  [kernel]                  [k] handle_mm_fault
     3.19%  multi-fault-all  [kernel]                  [k] __rmqueue
     3.05%  multi-fault-all  [kernel]                  [k] __mem_cgroup_commit_cha
     2.39%  multi-fault-all  [kernel]                  [k] bad_range
     1.78%  multi-fault-all  [kernel]                  [k] page_fault
     1.74%  multi-fault-all  [kernel]                  [k] mem_cgroup_charge_commo
     1.71%  multi-fault-all  [kernel]                  [k] lookup_page_cgroup

Then, the result is much improved by the XADD rwsem.

In the above profile, rwsem is still there.
But page-faults/sec is good. I hope some "big" machine users join the test.
(I hope for 4 sockets, at least..)


Here is the performance counter result of the DONTNEED test, counting the
number of page faults in 60 sec. So a bigger number of page faults is better.

[XADD rwsem]
[root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8

 Performance counter stats for './multi-fault-all 8' (5 runs):

       41950863  page-faults                ( +-   1.355% )
      502983592  cache-misses               ( +-   0.628% )

   60.002682206  seconds time elapsed   ( +-   0.000% )

[my patch]
[root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8

 Performance counter stats for './multi-fault-all 8' (5 runs):

       35835485  page-faults                ( +-   0.257% )
      511445661  cache-misses               ( +-   0.770% )

   60.004243198  seconds time elapsed   ( +-   0.002% )

Ah....xadd-rwsem seems to be faster than my patch ;)
Maybe my patch adds some big overhead (see below).

Then, on my host, I can get enough page-fault throughput by modifying rwsem.


Just out of interest, here is the profile with my patch.

    24.69%  multi-fault-all  [kernel]                  [k] clear_page_c
    20.26%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     8.59%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irq
     4.88%  multi-fault-all  [kernel]                  [k] page_add_new_anon_rmap
     4.33%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsave
     4.27%  multi-fault-all  [kernel]                  [k] vma_put
     3.55%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charge
     3.36%  multi-fault-all  [kernel]                  [k] find_vma_speculative
     2.90%  multi-fault-all  [kernel]                  [k] handle_mm_fault
     2.77%  multi-fault-all  [kernel]                  [k] __rmqueue
     2.49%  multi-fault-all  [kernel]                  [k] bad_range

Hmm...spinlock contention is twice as big.....????


  20.46%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
            |
            --- _raw_spin_lock
               |
               |--81.42%-- free_pcppages_bulk
               |          free_hot_cold_page
               |          __pagevec_free
               |          release_pages
               |          free_pages_and_swap_cache
               |          |
               |          |--99.57%-- unmap_vmas
               |          |          zap_page_range
               |          |          sys_madvise
               |          |          system_call_fastpath
               |          |          0x3f6b0e2cf7
               |           --0.43%-- [...]
               |
               |--17.86%-- get_page_from_freelist
               |          __alloc_pages_nodemask
               |          handle_mm_fault
               |          do_page_fault
               |          page_fault
               |          0x400940
               |          (nil)
                --0.71%-- [...]

This seems to be the page allocator lock. Hmm...why is this so big..


Thanks,
-Kame


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 15:17     ` Christoph Lameter
@ 2010-01-06  3:22       ` Arjan van de Ven
  2010-01-07 16:11         ` Christoph Lameter
  0 siblings, 1 reply; 121+ messages in thread
From: Arjan van de Ven @ 2010-01-06  3:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Tue, 5 Jan 2010 09:17:11 -0600 (CST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Tue, 5 Jan 2010, Arjan van de Ven wrote:
> 
> > while I appreciate the goal of reducing contention on this lock...
> > wouldn't step one be to remove the page zeroing from under this
> > lock? that's by far (easily by 10x I would guess) the most
> > expensive thing that's done under the lock, and I would expect a
> > first order of contention reduction just by having the zeroing of a
> > page not done under the lock...
> 
> The main issue is cacheline bouncing. mmap sem is a rw semaphore and
> only held for read during a fault.

depends on the workload; on a many-threads-java workload, you also get
it for write quite a bit (lots of malloc/frees in userspace in addition
to pagefaults).. at which point you do end up serializing on the
zeroing.

There's some real life real big workloads that show this pretty badly;
so far the workaround is to have glibc batch up a lot of the free()s..
but that's just pushing it a little further out.

> 


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  2:52                     ` KAMEZAWA Hiroyuki
@ 2010-01-06  3:27                       ` Linus Torvalds
  2010-01-06  3:56                         ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-06  3:27 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
> My host boots successfully. Here is the result.

Hey, looks good. It does have that 3% trylock overhead:

      3.17%  multi-fault-all  [kernel]                  [k] down_read_trylock

but that doesn't seem excessive.

Of course, your other load with MADV_DONTNEED seems to be horrible, and 
has some nasty spinlock issues, but that looks like a separate deal (I 
assume that load is just very hard on the pgtable lock).

That said, profiles are hard to compare performance with - the main thing 
that matters for performance is not how the profile looks, but how it 
actually performs. So:

> Then, the result is much improved by the XADD rwsem.
> 
> In the above profile, rwsem is still there.
> But page-faults/sec is good. I hope some "big" machine users join the test.

That "page-fault/sec" number is ultimately the only thing that matters. 

> Here is the performance counter result of the DONTNEED test, counting the
> number of page faults in 60 sec. So a bigger number of page faults is better.
> 
> [XADD rwsem]
> [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> 
>  Performance counter stats for './multi-fault-all 8' (5 runs):
> 
>        41950863  page-faults                ( +-   1.355% )
>       502983592  cache-misses               ( +-   0.628% )
> 
>    60.002682206  seconds time elapsed   ( +-   0.000% )
> 
> [my patch]
> [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> 
>  Performance counter stats for './multi-fault-all 8' (5 runs):
> 
>        35835485  page-faults                ( +-   0.257% )
>       511445661  cache-misses               ( +-   0.770% )
> 
>    60.004243198  seconds time elapsed   ( +-   0.002% )
> 
> Ah....xadd-rwsem seems to be faster than my patch ;)

Hey, that sounds great. NOTE! My patch really could be improved. In 
particular, I suspect that on x86-64, we should take advantage of the 
64-bit counter, and use a different RW_BIAS. That way we're not limited to 
32k readers, which _could_ otherwise be a problem.

So consider my rwsem patch to be purely preliminary. Now that you've 
tested it, I feel a lot better about it being basically correct, but it 
has room for improvement.

				Linus


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  3:27                       ` Linus Torvalds
@ 2010-01-06  3:56                         ` KAMEZAWA Hiroyuki
  2010-01-06  4:20                           ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-06  3:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, 5 Jan 2010 19:27:07 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> > My host boots successfully. Here is the result.
> 
> Hey, looks good. It does have that 3% trylock overhead:
> 
>       3.17%  multi-fault-all  [kernel]                  [k] down_read_trylock
> 
> but that doesn't seem excessive.
> 
> Of course, your other load with MADV_DONTNEED seems to be horrible, and 
> has some nasty spinlock issues, but that looks like a separate deal (I 
> assume that load is just very hard on the pgtable lock).
> 
It's zone->lock, I guess. My test program avoids the pgtable lock problem.


> That said, profiles are hard to compare performance with - the main thing 
> that matters for performance is not how the profile looks, but how it 
> actually performs. So:
> 
> > Then, the result is much improved by the XADD rwsem.
> > 
> > In the above profile, rwsem is still there.
> > But page-faults/sec is good. I hope some "big" machine users join the test.
> 
> That "page-fault/sec" number is ultimately the only thing that matters. 
> 
yes.

> > Here is the performance counter result of the DONTNEED test, counting the
> > number of page faults in 60 sec. So a bigger number of page faults is better.
> > 
> > [XADD rwsem]
> > [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> > 
> >  Performance counter stats for './multi-fault-all 8' (5 runs):
> > 
> >        41950863  page-faults                ( +-   1.355% )
> >       502983592  cache-misses               ( +-   0.628% )
> > 
> >    60.002682206  seconds time elapsed   ( +-   0.000% )
> > 
> > [my patch]
> > [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> > 
> >  Performance counter stats for './multi-fault-all 8' (5 runs):
> > 
> >        35835485  page-faults                ( +-   0.257% )
> >       511445661  cache-misses               ( +-   0.770% )
> > 
> >    60.004243198  seconds time elapsed   ( +-   0.002% )
> > 
> > Ah....xadd-rwsem seems to be faster than my patch ;)
> 
> Hey, that sounds great. NOTE! My patch really could be improved. In 
> particular, I suspect that on x86-64, we should take advantage of the 
> 64-bit counter, and use a different RW_BIAS. That way we're not limited to 
> 32k readers, which _could_ otherwise be a problem.
> 
> So consider my rwsem patch to be purely preliminary. Now that you've 
> tested it, I feel a lot better about it being basically correct, but it 
> has room for improvement.
> 

I'd like to stop updating my patch and wait and see how this issue goes.
Anyway, a test on a big machine would be appreciated because I can test only
on a 2-socket host.

Thanks,
-Kame



 


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  3:56                         ` KAMEZAWA Hiroyuki
@ 2010-01-06  4:20                           ` Linus Torvalds
  2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
  2010-01-08 16:53                             ` Peter Zijlstra
  0 siblings, 2 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-06  4:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > 
> > Of course, your other load with MADV_DONTNEED seems to be horrible, and 
> > has some nasty spinlock issues, but that looks like a separate deal (I 
> > assume that load is just very hard on the pgtable lock).
> 
> It's zone->lock, I guess. My test program avoids the pgtable lock problem.

Yeah, I should have looked more at your callchain. That's nasty. Much 
worse than the per-mm lock. I thought the page buffering would avoid the 
zone lock becoming a huge problem, but clearly not in this case.

		Linus


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  4:20                           ` Linus Torvalds
@ 2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
  2010-01-06  7:49                               ` Minchan Kim
  2010-01-06  9:39                               ` Linus Torvalds
  2010-01-08 16:53                             ` Peter Zijlstra
  1 sibling, 2 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-06  7:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 4341 bytes --]

On Tue, 5 Jan 2010 20:20:56 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > > 
> > > Of course, your other load with MADV_DONTNEED seems to be horrible, and 
> > > has some nasty spinlock issues, but that looks like a separate deal (I 
> > > assume that load is just very hard on the pgtable lock).
> > 
> > It's zone->lock, I guess. My test program avoids the pgtable lock problem.
> 
> Yeah, I should have looked more at your callchain. That's nasty. Much 
> worse than the per-mm lock. I thought the page buffering would avoid the 
> zone lock becoming a huge problem, but clearly not in this case.
> 
For my mental peace, I rewrote the test program as

  while () {
	touch memory
	barrier
	madvise DONTNEED all range by cpu 0
	barrier
  }

and serialized the madvise().

Then zone->lock disappears and I don't see a big difference between the XADD
rwsem and my tricky patch. I think I got a reasonable result, and fixing
rwsem is the sane way.

The next target will be clear_page()? hehe.
What catches my eye is the cost of memcg... (>_<

Thank you all, 
-Kame
==
[XADD rwsem]
[root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8

 Performance counter stats for './multi-fault-all 8' (5 runs):

       33029186  page-faults                ( +-   0.146% )
      348698659  cache-misses               ( +-   0.149% )

   60.002876268  seconds time elapsed   ( +-   0.001% )

# Samples: 815596419603
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    41.51%  multi-fault-all  [kernel]                  [k] clear_page_c
     9.08%  multi-fault-all  [kernel]                  [k] down_read_trylock
     6.23%  multi-fault-all  [kernel]                  [k] up_read
     6.17%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charg
     4.76%  multi-fault-all  [kernel]                  [k] handle_mm_fault
     3.77%  multi-fault-all  [kernel]                  [k] __mem_cgroup_commit_ch
     3.62%  multi-fault-all  [kernel]                  [k] __rmqueue
     2.30%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     2.30%  multi-fault-all  [kernel]                  [k] page_fault
     2.12%  multi-fault-all  [kernel]                  [k] mem_cgroup_charge_comm
     2.05%  multi-fault-all  [kernel]                  [k] bad_range
     1.78%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irq
     1.53%  multi-fault-all  [kernel]                  [k] lookup_page_cgroup
     1.44%  multi-fault-all  [kernel]                  [k] __mem_cgroup_uncharge_
     1.41%  multi-fault-all  ./multi-fault-all         [.] worker
     1.30%  multi-fault-all  [kernel]                  [k] get_page_from_freelist
     1.06%  multi-fault-all  [kernel]                  [k] page_remove_rmap



[async page fault]
[root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8

 Performance counter stats for './multi-fault-all 8' (5 runs):

       33345089  page-faults                ( +-   0.555% )
      357660074  cache-misses               ( +-   1.438% )

   60.003711279  seconds time elapsed   ( +-   0.002% )


    40.94%  multi-fault-all  [kernel]                  [k] clear_page_c
     6.96%  multi-fault-all  [kernel]                  [k] vma_put
     6.82%  multi-fault-all  [kernel]                  [k] page_add_new_anon_rmap
     5.86%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charg
     4.40%  multi-fault-all  [kernel]                  [k] __rmqueue
     4.14%  multi-fault-all  [kernel]                  [k] find_vma_speculative
     3.97%  multi-fault-all  [kernel]                  [k] handle_mm_fault
     3.52%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
     3.46%  multi-fault-all  [kernel]                  [k] __mem_cgroup_commit_ch
     2.23%  multi-fault-all  [kernel]                  [k] bad_range
     2.16%  multi-fault-all  [kernel]                  [k] mem_cgroup_charge_comm
     1.96%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irq
     1.75%  multi-fault-all  [kernel]                  [k] mem_cgroup_add_lru_lis
     1.73%  multi-fault-all  [kernel]                  [k] page_fault

[-- Attachment #2: multi-fault-all.c --]
[-- Type: text/x-csrc, Size: 1877 bytes --]

/*
 * multi-fault.c :: causes 60 secs of parallel page faults in multiple threads.
 * % gcc -O2 -o multi-fault multi-fault.c -lpthread
 * % multi-fault <number of cpus>
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>	/* atoi() */
#include <unistd.h>	/* sleep() */
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>

#define NR_THREADS	8
pthread_t threads[NR_THREADS];
/*
 * To avoid contention on the page table lock, the FAULT area is
 * sparse.  If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define MMAP_LENGTH	(8 * 1024 * 1024)
#define FAULT_LENGTH	(2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE	4096

pthread_barrier_t barrier;
int name[NR_THREADS];

void segv_handler(int sig)
{
	sleep(100);
}

int num;
void *worker(void *data)
{
	cpu_set_t set;
	int i, cpu;

	cpu = *(int *)data;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	while (1) {
		char *c;
		char *start = mmap_area[cpu];
		char *end = mmap_area[cpu] + FAULT_LENGTH;
		pthread_barrier_wait(&barrier);
		//printf("fault into %p-%p\n",start, end);

		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;

		pthread_barrier_wait(&barrier);
		for (i = 0; cpu==0 && i < num; i++)
			madvise(mmap_area[i], FAULT_LENGTH, MADV_DONTNEED);
		pthread_barrier_wait(&barrier);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int i, ret;

	if (argc < 2)
		return 0;

	num = atoi(argv[1]);
	if (num > NR_THREADS)
		num = NR_THREADS;	/* don't overflow threads[] and mmap_area[] */
	pthread_barrier_init(&barrier, NULL, num);

	mmap_area[0] = mmap(NULL, MMAP_LENGTH * num, PROT_WRITE|PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	for (i = 1; i < num; i++) {
		mmap_area[i] = mmap_area[i - 1]+ MMAP_LENGTH;
	}

	for (i = 0; i < num; ++i) {
		name[i] = i;
		ret = pthread_create(&threads[i], NULL, worker, &name[i]);
		if (ret < 0) {
			perror("pthread create");
			return 0;
		}
	}
	sleep(60);
	return 0;
}


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
@ 2010-01-06  7:49                               ` Minchan Kim
  2010-01-06  9:39                               ` Linus Torvalds
  1 sibling, 0 replies; 121+ messages in thread
From: Minchan Kim @ 2010-01-06  7:49 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Linus Torvalds, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

[-- Attachment #1: Type: text/plain; charset=UTF-8, Size: 5400 bytes --]

At last your patient effort has solved the problem,
although the fix came from outside your patch series.

Thanks for the very patient work and testing up to now, Kame. :)
I learned a lot of things from this thread.

Thanks, all.

On Wed, Jan 6, 2010 at 4:06 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 5 Jan 2010 20:20:56 -0800 (PST)
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>>
>>
>> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>> > >
>> > > Of course, your other load with MADV_DONTNEED seems to be horrible, and
>> > > has some nasty spinlock issues, but that looks like a separate deal (I
>> > > assume that load is just very hard on the pgtable lock).
>> >
>> > It's zone->lock, I guess. My test program avoids the pgtable lock problem.
>>
>> Yeah, I should have looked more at your callchain. That's nasty. Much
>> worse than the per-mm lock. I thought the page buffering would avoid the
>> zone lock becoming a huge problem, but clearly not in this case.
>>
> For my mental peace, I rewrote the test program as
>
>  while () {
>        touch memory
>        barrier
>        madvise DONTNEED all range by cpu 0
>        barrier
>  }
>
> and serialized the madvise().
>
> Then zone->lock disappears and I don't see a big difference between the XADD
> rwsem and my tricky patch. I think I got a reasonable result, and fixing
> rwsem is the sane way.
>
> The next target will be clear_page()? hehe.
> What catches my eye is the cost of memcg... (>_<
>
> Thank you all,
> -Kame
> ==
> [XADD rwsem]
> [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
>
>  Performance counter stats for './multi-fault-all 8' (5 runs):
>
>       33029186  page-faults                ( +-   0.146% )
>      348698659  cache-misses               ( +-   0.149% )
>
>   60.002876268  seconds time elapsed   ( +-   0.001% )
>
> # Samples: 815596419603
> #
> # Overhead          Command             Shared Object  Symbol
> # ........  ...............  ........................  ......
> #
>    41.51%  multi-fault-all  [kernel]                  [k] clear_page_c
>     9.08%  multi-fault-all  [kernel]                  [k] down_read_trylock
>     6.23%  multi-fault-all  [kernel]                  [k] up_read
>     6.17%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charg
>     4.76%  multi-fault-all  [kernel]                  [k] handle_mm_fault
>     3.77%  multi-fault-all  [kernel]                  [k] __mem_cgroup_commit_ch
>     3.62%  multi-fault-all  [kernel]                  [k] __rmqueue
>     2.30%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
>     2.30%  multi-fault-all  [kernel]                  [k] page_fault
>     2.12%  multi-fault-all  [kernel]                  [k] mem_cgroup_charge_comm
>     2.05%  multi-fault-all  [kernel]                  [k] bad_range
>     1.78%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irq
>     1.53%  multi-fault-all  [kernel]                  [k] lookup_page_cgroup
>     1.44%  multi-fault-all  [kernel]                  [k] __mem_cgroup_uncharge_
>     1.41%  multi-fault-all  ./multi-fault-all         [.] worker
>     1.30%  multi-fault-all  [kernel]                  [k] get_page_from_freelist
>     1.06%  multi-fault-all  [kernel]                  [k] page_remove_rmap
>
>
>
> [async page fault]
> [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
>
>  Performance counter stats for './multi-fault-all 8' (5 runs):
>
>       33345089  page-faults                ( +-   0.555% )
>      357660074  cache-misses               ( +-   1.438% )
>
>   60.003711279  seconds time elapsed   ( +-   0.002% )
>
>
>    40.94%  multi-fault-all  [kernel]                  [k] clear_page_c
>     6.96%  multi-fault-all  [kernel]                  [k] vma_put
>     6.82%  multi-fault-all  [kernel]                  [k] page_add_new_anon_rmap
>     5.86%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charg
>     4.40%  multi-fault-all  [kernel]                  [k] __rmqueue
>     4.14%  multi-fault-all  [kernel]                  [k] find_vma_speculative
>     3.97%  multi-fault-all  [kernel]                  [k] handle_mm_fault
>     3.52%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
>     3.46%  multi-fault-all  [kernel]                  [k] __mem_cgroup_commit_ch
>     2.23%  multi-fault-all  [kernel]                  [k] bad_range
>     2.16%  multi-fault-all  [kernel]                  [k] mem_cgroup_charge_comm
>     1.96%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irq
>     1.75%  multi-fault-all  [kernel]                  [k] mem_cgroup_add_lru_lis
>     1.73%  multi-fault-all  [kernel]                  [k] page_fault
>



-- 
Kind regards,
Minchan Kim

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
  2010-01-06  7:49                               ` Minchan Kim
@ 2010-01-06  9:39                               ` Linus Torvalds
  2010-01-07  1:00                                 ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-06  9:39 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar



On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>
>      9.08%  multi-fault-all  [kernel]                  [k] down_read_trylock

Btw, the "trylock" implementation of rwsemaphores doesn't look very good
on x86, even after teaching x86-64 to use the xadd versions of rwsems.

The reason? It sadly does a "load/inc/cmpxchg" sequence to do an "add if 
there are no writers", and that may be the obvious model, but it sucks 
from a cache access standpoint.

Why? Because it starts the sequence with a plain read. So if it doesn't 
have the cacheline, it will likely get it first in non-exclusive mode, and 
then the cmpxchg a few instructions later will need to turn it into an 
exclusive ownership.

So now that trylock may well end up doing two bus accesses rather than 
one.

It's possible that an OoO x86 CPU might notice the "read followed by 
r-m-w" pattern and just turn it into an exclusive cache fetch immediately, 
but I don't think they are quite that smart. But who knows? 

Anyway, for the above reason it might actually be better to _avoid_ the 
load entirely, and just make __down_read_trylock() assume the rwsem starts 
out unlocked - replace the initial memory load with just loading a 
constant.

That way, it will do the cmpxchg first, and if it wasn't unlocked and had 
other readers active, it will end up doing an extra cmpxchg, but still 
hopefully avoid the extra bus cycles.
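
A user-space C sketch of the two shapes being compared (simplified model
only, not the kernel's actual asm: count >= 0 means no writer, and each
reader adds one):

#include <stdatomic.h>

/* shape 1: read first.  The initial load pulls the line in shared
 * state; the cmpxchg then has to upgrade it to exclusive -- possibly
 * two bus transactions. */
int trylock_read_first(atomic_long *count)
{
	long old = atomic_load(count);
	do {
		if (old < 0)
			return 0;	/* writer active */
	} while (!atomic_compare_exchange_weak(count, &old, old + 1));
	return 1;
}

/* shape 2: assume unlocked.  Start from the constant and cmpxchg
 * immediately, so the very first access asks for the line exclusive.
 * If the guess was wrong, the failed cmpxchg has already fetched the
 * real value into 'old' and we simply retry. */
int trylock_assume_unlocked(atomic_long *count)
{
	long old = 0;	/* RWSEM_UNLOCKED_VALUE */
	do {
		if (old < 0)
			return 0;
	} while (!atomic_compare_exchange_weak(count, &old, old + 1));
	return 1;
}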

So it might be worth testing this trivial patch on top of my other one.

UNTESTED. But the patch is small and simple, so maybe it works anyway. It 
would be interesting to hear if it makes any difference. Better? Worse? Or 
is it a "There's no difference at all, Linus. You're on drugs with that 
whole initial bus cycle thing"

		Linus

---
 arch/x86/include/asm/rwsem.h |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 625baca..275c0a1 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -123,7 +123,6 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
 	__s32 result, tmp;
 	asm volatile("# beginning __down_read_trylock\n\t"
-		     "  movl      %0,%1\n\t"
 		     "1:\n\t"
 		     "  movl	     %1,%2\n\t"
 		     "  addl      %3,%2\n\t"
@@ -133,7 +132,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 		     "2:\n\t"
 		     "# ending __down_read_trylock\n\t"
 		     : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
-		     : "i" (RWSEM_ACTIVE_READ_BIAS)
+		     : "i" (RWSEM_ACTIVE_READ_BIAS), "1" (RWSEM_UNLOCKED_VALUE)
 		     : "memory", "cc");
 	return result >= 0 ? 1 : 0;
 }


* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-05 16:10             ` Linus Torvalds
@ 2010-01-06 15:41               ` Peter Zijlstra
  0 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-06 15:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, KAMEZAWA Hiroyuki, Paul E. McKenney, linux-kernel,
	linux-mm, minchan.kim, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Tue, 2010-01-05 at 08:10 -0800, Linus Torvalds wrote:

> You _can_ handle it (make the RCU callback just schedule the work instead 
> of doing it directly), but it does sound really nasty. I suspect we should 
> explore just about any other approach over this one. 

Agreed, scheduling work also has the bonus that sync rcu doesn't quite
do what you expect it to etc..

Anyway, the best I could come up with is something like the below. I
tried to come up with something that would iterate mm->cpu_vm_mask, but
since faults can schedule, that doesn't work out.

Iterating the full thread group seems like a mighty expensive thing to
do in light of these massive thread freaks like Java and Google.

Will ponder things more...

---
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1277,6 +1277,7 @@ struct task_struct {
 	struct plist_node pushable_tasks;
 
 	struct mm_struct *mm, *active_mm;
+	struct vm_area_struct *fault_vma;
 
 /* task state */
 	int exit_state;
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -3157,6 +3157,7 @@ int handle_speculative_fault(struct mm_s
 		goto out_unmap;
 
 	entry = *pte;
+	rcu_assign_pointer(current->fault_vma, vma);
 
 	if (read_seqcount_retry(&vma->vm_sequence, seq))
 		goto out_unmap;
@@ -3167,6 +3168,7 @@ int handle_speculative_fault(struct mm_s
 	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
 
 out_unlock:
+	rcu_assign_pointer(current->fault_vma, NULL);
 	rcu_read_unlock();
 	return ret;
 
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -235,6 +235,27 @@ static void free_vma(struct vm_area_stru
 	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
 }
 
+static void sync_vma(struct vm_area_struct *vma)
+{
+	struct task_struct *t;
+	int block = 0;
+
+	if (!vma->vm_file)
+		return;
+
+	rcu_read_lock();
+	for (t = current; (t = next_thread(t)) != current; ) {
+		if (rcu_dereference(t->fault_vma) == vma) {
+			block = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (block)
+		synchronize_rcu();
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -243,6 +264,7 @@ static struct vm_area_struct *remove_vma
 	struct vm_area_struct *next = vma->vm_next;
 
 	might_sleep();
+	sync_vma(vma);
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file) {



* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  9:39                               ` Linus Torvalds
@ 2010-01-07  1:00                                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-07  1:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Minchan Kim, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar

On Wed, 6 Jan 2010 01:39:17 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> >
> >      9.08%  multi-fault-all  [kernel]                  [k] down_read_trylock
<snip>
> That way, it will do the cmpxchg first, and if it wasn't unlocked and had 
> other readers active, it will end up doing an extra cmpxchg, but still 
> hopefully avoid the extra bus cycles.
> 
> So it might be worth testing this trivial patch on top of my other one.
> 
Test: on 8-core/2-socket x86-64
  while () {
	touch memory
	barrier
	madvise DONTNEED all range by cpu 0
	barrier
  }

<Before> (cut from my post)
> [root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8
> 
>  Performance counter stats for './multi-fault-all 8' (5 runs):
> 
>        33029186  page-faults                ( +-   0.146% )
>       348698659  cache-misses               ( +-   0.149% )
> 
>    60.002876268  seconds time elapsed   ( +-   0.001% )
>     41.51%  multi-fault-all  [kernel]                  [k] clear_page_c
>      9.08%  multi-fault-all  [kernel]                  [k] down_read_trylock
>      6.23%  multi-fault-all  [kernel]                  [k] up_read
>      6.17%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_charg


<After>
[root@bluextal memory]#  /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all 8

 Performance counter stats for './multi-fault-all 8' (5 runs):

       33782787  page-faults                ( +-   2.650% )
      332753197  cache-misses               ( +-   0.477% )

   60.003984337  seconds time elapsed   ( +-   0.004% )

# Samples: 1014408915089
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    44.42%  multi-fault-all  [kernel]                  [k] clear_page_c
     7.73%  multi-fault-all  [kernel]                  [k] down_read_trylock
     6.65%  multi-fault-all  [kernel]                  [k] __mem_cgroup_try_char
     6.15%  multi-fault-all  [kernel]                  [k] up_read
     4.87%  multi-fault-all  [kernel]                  [k] handle_mm_fault
     3.70%  multi-fault-all  [kernel]                  [k] __rmqueue
     3.69%  multi-fault-all  [kernel]                  [k] __mem_cgroup_commit_c
     2.35%  multi-fault-all  [kernel]                  [k] bad_range


Yes, it seems slightly improved, at least on this test,
but the page-fault-throughput score is within the error range.


Thanks,
-Kame




* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  3:22       ` Arjan van de Ven
@ 2010-01-07 16:11         ` Christoph Lameter
  2010-01-07 16:19           ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-07 16:11 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Tue, 5 Jan 2010, Arjan van de Ven wrote:

> On Tue, 5 Jan 2010 09:17:11 -0600 (CST)
> Christoph Lameter <cl@linux-foundation.org> wrote:
>
> > On Tue, 5 Jan 2010, Arjan van de Ven wrote:
> >
> > > while I appreciate the goal of reducing contention on this lock...
> > > wouldn't step one be to remove the page zeroing from under this
> > > lock? that's by far (easily by 10x I would guess) the most
> > > expensive thing that's done under the lock, and I would expect a
> > > first order of contention reduction just by having the zeroing of a
> > > page not done under the lock...
> >
> > The main issue is cacheline bouncing. mmap sem is a rw semaphore and
> > only held for read during a fault.
>
> depends on the workload; on a many-threads-java workload, you also get
> it for write quite a bit (lots of malloc/frees in userspace in addition
> to pagefaults).. at which point you do end up serializing on the
> zeroing.
>
> There's some real life real big workloads that show this pretty badly;
> so far the workaround is to have glibc batch up a lot of the free()s..
> but that's just pushing it a little further out.

Again mmap_sem is a rwsem and only a read lock is held. Zeroing in
do_anonymous_page can occur concurrently on multiple processors in the
same address space. The pte lock is intentionally taken *after* zeroing to
allow concurrent zeroing to occur.
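
Schematically, the ordering in do_anonymous_page that this refers to is
(a simplified paraphrase of mm/memory.c, not the verbatim code):

	/* allocate and zero the page with only mmap_sem held for read */
	page = alloc_zeroed_user_highpage_movable(vma, address);
	entry = mk_pte(page, vma->vm_page_prot);
	...
	/* only now take the pte lock and check we didn't race */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte_none(*page_table))
		goto release;	/* somebody else faulted it in first */
	page_add_new_anon_rmap(page, vma, address);
	set_pte_at(mm, address, page_table, entry);
	pte_unmap_unlock(page_table, ptl);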



^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 16:11         ` Christoph Lameter
@ 2010-01-07 16:19           ` Linus Torvalds
  2010-01-07 16:31             ` Linus Torvalds
                               ` (3 more replies)
  0 siblings, 4 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 16:19 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Arjan van de Ven, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Christoph Lameter wrote:
> >
> > depends on the workload; on a many-threads-java workload, you also get
> > it for write quite a bit (lots of malloc/frees in userspace in addition
> > to pagefaults).. at which point you do end up serializing on the
> > zeroing.
> >
> > There's some real life real big workloads that show this pretty badly;
> > so far the workaround is to have glibc batch up a lot of the free()s..
> > but that's just pushing it a little further out.
> 
> Again mmap_sem is a rwsem and only a read lock is held. Zeroing in
> do_anonymous_page can occur concurrently on multiple processors in the
> same address space. The pte lock is intentionally taken *after* zeroing to
> allow concurrent zeroing to occur.

You're missing what Arjan said - the java workload does a lot of memory 
allocations too, causing mmap/munmap.

So now some paths are indeed holding it for writing (or need to wait for 
it to become writable). And the fairness of rwsems quite possibly then 
impacts throughput a _lot_..

(Side note: I wonder if we should wake up _all_ readers when we wake up 
any. Right now, we wake up all readers - but only until we hit a writer. 
Which is the _fair_ thing to do, but it does mean that we can end up in 
horrible patterns of alternating readers/writers, when it could be much 
better to just say "release the hounds" and let all pending readers go 
after a writer has had its turn).

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 16:19           ` Linus Torvalds
@ 2010-01-07 16:31             ` Linus Torvalds
  2010-01-07 16:34             ` Paul E. McKenney
                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 16:31 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Arjan van de Ven, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Linus Torvalds wrote:
> 
> (Side note: I wonder if we should wake up _all_ readers when we wake up 
> any. Right now, we wake up all readers - but only until we hit a writer. 
> Which is the _fair_ thing to do, but it does mean that we can end up in 
> horrible patterns of alternating readers/writers, when it could be much 
> better to just say "release the hounds" and let all pending readers go 
> after a writer has had its turn).

Btw, this would still be "mostly fair" in the sense that you couldn't 
starve writers. Any writer on the list is still guaranteed to be woken up 
next, because now it will be at the front of the queue.

So it would be starvation-proof - new readers that come in _after_ we've 
woken up all the old ones would not get to pass the writers. It might be 
interesting to test, if somebody has a problematic threaded workload with 
lots of page faults and allocations mixed.
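
Policy-wise that would be something like the sketch below (a
simplified waiter list for illustration, not the actual lib/rwsem
code):

struct rwsem_waiter {
	struct list_head list;
	struct task_struct *task;
	int write;			/* reader or writer? */
};

/* "release the hounds": wake every queued reader, skipping (but not
 * stopping at) writers, so writers keep their place in the queue */
static void wake_all_readers(struct list_head *wait_list)
{
	struct rwsem_waiter *w, *tmp;

	list_for_each_entry_safe(w, tmp, wait_list, list) {
		if (w->write)
			continue;
		list_del(&w->list);
		wake_up_process(w->task);
	}
}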

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 16:19           ` Linus Torvalds
  2010-01-07 16:31             ` Linus Torvalds
@ 2010-01-07 16:34             ` Paul E. McKenney
  2010-01-07 16:36             ` Christoph Lameter
  2010-01-07 17:22             ` Peter Zijlstra
  3 siblings, 0 replies; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-07 16:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Arjan van de Ven, Peter Zijlstra,
	Peter Zijlstra, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, Jan 07, 2010 at 08:19:56AM -0800, Linus Torvalds wrote:
> 
> 
> On Thu, 7 Jan 2010, Christoph Lameter wrote:
> > >
> > > depends on the workload; on a many-threads-java workload, you also get
> > > it for write quite a bit (lots of malloc/frees in userspace in addition
> > > to pagefaults).. at which point you do end up serializing on the
> > > zeroing.
> > >
> > > There's some real life real big workloads that show this pretty badly;
> > > so far the workaround is to have glibc batch up a lot of the free()s..
> > > but that's just pushing it a little further out.
> > 
> > Again mmap_sem is a rwsem and only a read lock is held. Zeroing in
> > do_anonymous_page can occur concurrently on multiple processors in the
> > same address space. The pte lock is intentionally taken *after* zeroing to
> > allow concurrent zeroing to occur.
> 
> You're missing what Arjan said - the java workload does a lot of memory 
> allocations too, causing mmap/munmap.
> 
> So now some paths are indeed holding it for writing (or need to wait for 
> it to become writable). And the fairness of rwsems quite possibly then 
> impacts throughput a _lot_..
> 
> (Side note: I wonder if we should wake up _all_ readers when we wake up 
> any. Right now, we wake up all readers - but only until we hit a writer. 
> Which is the _fair_ thing to do, but it does mean that we can end up in 
> horrible patterns of alternating readers/writers, when it could be much 
> better to just say "release the hounds" and let all pending readers go 
> after a writer has had its turn).

This can indeed work well in many cases.  The situation where it can
get you in trouble is where there are many more readers than CPUs (or
disk spindles or whatever it is that limits the amount of effective
parallelism readers can attain).  In this case, releasing more readers
than can run in parallel will delay the writers for no good reason.

So one strategy is to release readers, but no more than the number of
CPUs (or whatever the limit is).  More complicated strategies are out
there, but there is a limit to how much of the scheduler one should
involve in lock-granting decisions.
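
In the same illustrative style (reusing the simplified waiter list
from the sketch earlier in the thread), the capped variant would be:

/* wake queued readers, but no more than can actually run in parallel */
static void wake_some_readers(struct list_head *wait_list)
{
	struct rwsem_waiter *w, *tmp;
	int budget = num_online_cpus();

	list_for_each_entry_safe(w, tmp, wait_list, list) {
		if (w->write)
			continue;
		if (budget-- <= 0)
			break;
		list_del(&w->list);
		wake_up_process(w->task);
	}
}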

							Thanx, Paul


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 16:19           ` Linus Torvalds
  2010-01-07 16:31             ` Linus Torvalds
  2010-01-07 16:34             ` Paul E. McKenney
@ 2010-01-07 16:36             ` Christoph Lameter
  2010-01-08  4:49               ` Arjan van de Ven
  2010-01-07 17:22             ` Peter Zijlstra
  3 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-07 16:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Arjan van de Ven, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 7 Jan 2010, Linus Torvalds wrote:

> You're missing what Arjan said - the java workload does a lot of memory
> allocations too, causing mmap/munmap.

Well isn't that tunable on the app level? Get bigger chunks of memory in
order to reduce the frequency of mmap operations? If you want concurrency
of faults then mmap_sem write locking currently needs to be limited.

> So now some paths are indeed holding it for writing (or need to wait for
> it to become writable). And the fairness of rwsems quite possibly then
> impacts throughput a _lot_..

Very true. Doing range locking (maybe using the split pte lock
boundaries, shifting some state from mm_struct into vmas) may be a way to
avoid holding mmap_sem for write in that case.

> (Side note: I wonder if we should wake up _all_ readers when we wake up
> any. Right now, we wake up all readers - but only until we hit a writer.
> Which is the _fair_ thing to do, but it does mean that we can end up in
> horrible patterns of alternating readers/writers, when it could be much
> better to just say "release the hounds" and let all pending readers go
> after a writer has had its turn).

Having a cycle with concurrent readers followed by a cycle of serialized
writers may be best under heavy load. The writers need to be limited in
frequency, otherwise they will starve the readers.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 16:19           ` Linus Torvalds
                               ` (2 preceding siblings ...)
  2010-01-07 16:36             ` Christoph Lameter
@ 2010-01-07 17:22             ` Peter Zijlstra
  2010-01-07 17:36               ` Linus Torvalds
  3 siblings, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-07 17:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 2010-01-07 at 08:19 -0800, Linus Torvalds wrote:
> 
> So now some paths are indeed holding it for writing (or need to wait for 
> it to become writable). And the fairness of rwsems quite possibly then 
> impacts throughput a _lot_.. 

Right, supposing we can make this speculative fault stuff work, then we
can basically reduce the mmap_sem usage in fault to:

  - allocating new page tables
  - extending the growable vmas

And do everything else without holding it, including zeroing and IO.




^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 17:22             ` Peter Zijlstra
@ 2010-01-07 17:36               ` Linus Torvalds
  2010-01-07 17:49                 ` Linus Torvalds
  2010-01-07 23:51                 ` Rik van Riel
  0 siblings, 2 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 17:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> 
> Right, supposing we can make this speculative fault stuff work, then we
> can basically reduce the mmap_sem usage in fault to:
> 
>   - allocating new page tables
>   - extending the growable vmas
> 
> And do everything else without holding it, including zeroing and IO.

Well, I have yet to hear a realistic scenario of _how_ to do it all 
speculatively in the first place, at least not without horribly subtle 
complexity issues. So I'd really rather see how far we can possibly get by 
just improving mmap_sem.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 17:36               ` Linus Torvalds
@ 2010-01-07 17:49                 ` Linus Torvalds
  2010-01-07 18:00                   ` Peter Zijlstra
  2010-01-07 18:44                   ` Linus Torvalds
  2010-01-07 23:51                 ` Rik van Riel
  1 sibling, 2 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 17:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Linus Torvalds wrote:
> 
> Well, I have yet to hear a realistic scenario of _how_ to do it all 
> speculatively in the first place, at least not without horribly subtle 
> complexity issues. So I'd really rather see how far we can possibly get by 
> just improving mmap_sem.

For an example of this: it's entirely possible that one avenue of mmap_sem 
improvement would be to look at the _writer_ side, and see how that can be 
improved. 

An example of where we've done that is in madvise(): we used to always 
take it for writing (because _some_ madvise versions needed the exclusive 
access). And suddenly some operations got way more scalable, and work in 
the presense of concurrent page faults.

And quite frankly, I'd _much_ rather look at that kind of simple and 
logically fairly straightforward solutions, instead of doing the whole 
speculative page fault work.

For example: there's no real reason why we take mmap_sem for writing when 
extending an existing vma. And while 'brk()' is a very old-fashioned way of 
doing memory management, it's still quite common. So rather than looking 
at subtle lockless algorithms, why not look at doing the common cases of 
an extending brk? Make that one take the mmap_sem for _reading_, and then 
do the extending of the brk area with a simple cmpxchg or something?

And "extending brk" is actually a lot more common than shrinking it, and 
is common for exactly the kind of workloads that are often nasty right now 
(threaded allocators with lots and lots of smallish allocations).

The thing is, I can pretty much _guarantee_ that the speculative page 
fault is going to end up doing a lot of nasty stuff that still needs 
almost-global locking, and it's likely to be more complicated and slower 
for the single-threaded case (you end up needing refcounts, a new "local" 
lock or something).

Sure, moving to a per-vma lock can help, but it doesn't help a lot. It 
doesn't help AT ALL for the single-threaded case, and for the 
multi-threaded case I will bet you that a _lot_ of cases will have one 
very hot vma - the regular data vma that gets shared for normal malloc() 
etc. 

So I'm personally rather doubtful about the whole speculative work. It's a 
fair amount of complexity without any really obvious upside. Yes, the 
mmap_sem can be very annoying, but nobody can really honestly claim that 
we've really optimized it all that much.

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 17:49                 ` Linus Torvalds
@ 2010-01-07 18:00                   ` Peter Zijlstra
  2010-01-07 18:15                     ` Linus Torvalds
  2010-01-07 18:44                   ` Linus Torvalds
  1 sibling, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-07 18:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 2010-01-07 at 09:49 -0800, Linus Torvalds wrote:
> 
> The thing is, I can pretty much _guarantee_ that the speculative page 
> fault is going to end up doing a lot of nasty stuff that still needs 
> almost-global locking, and it's likely to be more complicated and slower 
> for the single-threaded case (you end up needing refcounts, a new "local" 
> lock or something).

Well, with that sync_vma() thing I posted the other day all the
speculative page fault needs is a write to current->fault_vma, the ptl
and an O(nr_threads) loop on unmap() for file vmas -- aside from writing
the pte itself of course.





^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 18:00                   ` Peter Zijlstra
@ 2010-01-07 18:15                     ` Linus Torvalds
  2010-01-07 21:49                       ` Peter Zijlstra
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 18:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> 
> Well, with that sync_vma() thing I posted the other day all the
> speculative page fault needs is a write to current->fault_vma, the ptl
> and an O(nr_threads) loop on unmap() for file vmas -- aside from writing
> the pte itself of course.

How do you handle the race of

	fault handler:			munmap

	look up vma without locks

	.. interrupt happens or
	  something delays things ..
					remove the vma from the list
					sync_vma()

	current->fault_vma = vma

etc?

Maybe I'm missing something, but quite frankly, the above looks pretty 
damn fundamental. Something needs to take a lock. If you get rid of the 
mmap_sem, you need to replace it with another lock.

There's no sane way to "look up vma and atomically mark it as busy" 
without locks. You can do it with extra work, ie something like

	look up vma without locks
	current->fault_vma = vma
	smp_mb();
	check that the vma is still on the list

(with the appropriate barriers on the munmap side too, of course) where you 
avoid the lock by basically turning it into an ordering problem, but it 
ends up being pretty much as expensive anyway for single threads.

And for lots and lots of threads, you now made that munmap be pretty 
expensive.
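
Spelled out for both sides, that ordering scheme is roughly the
following (every helper name here is hypothetical; the point is only
where the barriers sit):

	/* fault side */
	vma = find_vma_nolock(mm, address);	/* lockless lookup */
	current->fault_vma = vma;		/* advertise we're using it */
	smp_mb();
	if (vma_is_unlinked(vma))		/* munmap got there first */
		goto retry_with_mmap_sem;

	/* munmap side */
	unlink_vma(mm, vma);			/* remove from the list */
	smp_mb();
	for_each_mm_thread(mm, t) {		/* the O(nr_threads) wait */
		while (t->fault_vma == vma)
			cpu_relax();
	}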

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 17:49                 ` Linus Torvalds
  2010-01-07 18:00                   ` Peter Zijlstra
@ 2010-01-07 18:44                   ` Linus Torvalds
  2010-01-07 19:20                     ` Paul E. McKenney
                                       ` (2 more replies)
  1 sibling, 3 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 18:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Linus Torvalds wrote:
> 
> For example: there's no real reason why we take mmap_sem for writing when 
> extending an existing vma. And while 'brk()' is a very old-fashioned way of 
> doing memory management, it's still quite common. So rather than looking 
> at subtle lockless algorithms, why not look at doing the common cases of 
> an extending brk? Make that one take the mmap_sem for _reading_, and then 
> do the extending of the brk area with a simple cmpxchg or something?

I didn't use cmpxchg, because we actually want to update both 
'current->brk' _and_ the vma->vm_end atomically, so here's a totally 
untested patch that uses the page_table_lock spinlock for it instead (it 
could be a new spinlock, not worth it).

It's also totally untested and might be horribly broken. But you get the 
idea.

We could probably do things like this in regular mmap() too for the 
"extend a mmap" case. brk() is just especially simple.

		Linus

---
 mm/mmap.c |   89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index ee22989..3d07e5f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -242,23 +242,14 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-SYSCALL_DEFINE1(brk, unsigned long, brk)
+static long slow_brk(unsigned long brk)
 {
 	unsigned long rlim, retval;
 	unsigned long newbrk, oldbrk;
 	struct mm_struct *mm = current->mm;
-	unsigned long min_brk;
 
 	down_write(&mm->mmap_sem);
 
-#ifdef CONFIG_COMPAT_BRK
-	min_brk = mm->end_code;
-#else
-	min_brk = mm->start_brk;
-#endif
-	if (brk < min_brk)
-		goto out;
-
 	/*
 	 * Check against rlimit here. If this check is done later after the test
 	 * of oldbrk with newbrk then it can escape the test and let the data
@@ -297,6 +288,84 @@ out:
 	return retval;
 }
 
+/*
+ * NOTE NOTE NOTE!
+ *
+ * We do all this speculatively with just the read lock held.
+ * If anything goes wrong, we release the read lock, and punt
+ * to the traditional write-locked version.
+ *
+ * Here "goes wrong" includes:
+ *  - having to unmap the area and actually shrink it
+ *  - the final cmpxchg doesn't succeed
+ *  - the old brk area wasn't a simple anonymous vma
+ *  etc etc
+ */
+static struct vm_area_struct *ok_to_extend_brk(struct mm_struct *mm, unsigned long cur_brk, unsigned long brk)
+{
+	struct vm_area_struct *vma, *nextvma;
+
+	nextvma = find_vma_prev(mm, cur_brk, &vma);
+	if (vma) {
+		/*
+		 * Needs to be an anonymous vma that ends at the current brk,
+		 * and with no following vma that would start before the
+		 * new brk
+		 */
+		if (vma->vm_file || cur_brk != vma->vm_end || (nextvma && brk > nextvma->vm_start))
+			vma = NULL;
+	}
+	return vma;
+}
+
+SYSCALL_DEFINE1(brk, unsigned long, brk)
+{
+	unsigned long cur_brk, min_brk;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	down_read(&mm->mmap_sem);
+	cur_brk = mm->brk;
+#ifdef CONFIG_COMPAT_BRK
+	min_brk = mm->end_code;
+#else
+	min_brk = mm->start_brk;
+#endif
+	if (brk < min_brk)
+		goto out;
+
+	brk = PAGE_ALIGN(brk);
+	cur_brk = PAGE_ALIGN(cur_brk);
+
+	if (brk < cur_brk)
+		goto slow_case;
+	if (brk == cur_brk)
+		goto out;
+
+	vma = ok_to_extend_brk(mm, cur_brk, brk);
+	if (!vma)
+		goto slow_case;
+
+	spin_lock(&mm->page_table_lock);
+	if (vma->vm_end == cur_brk) {
+		vma->vm_end = brk;
+		mm->brk = brk;
+		cur_brk = brk;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (cur_brk != brk)
+		goto slow_case;
+
+out:
+	up_read(&mm->mmap_sem);
+	return cur_brk;
+
+slow_case:
+	up_read(&mm->mmap_sem);
+	return slow_brk(brk);
+}
+
 #ifdef DEBUG_MM_RB
 static int browse_rb(struct rb_root *root)
 {


^ permalink raw reply related	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 18:44                   ` Linus Torvalds
@ 2010-01-07 19:20                     ` Paul E. McKenney
  2010-01-07 20:06                       ` Linus Torvalds
  2010-01-07 19:24                     ` Christoph Lameter
  2010-01-07 21:44                     ` Peter Zijlstra
  2 siblings, 1 reply; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-07 19:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, Jan 07, 2010 at 10:44:13AM -0800, Linus Torvalds wrote:
> 
> 
> On Thu, 7 Jan 2010, Linus Torvalds wrote:
> > 
> > For example: there's no real reason why we take mmap_sem for writing when 
> > extending an existing vma. And while 'brk()' is a very old-fashioned way of 
> > doing memory management, it's still quite common. So rather than looking 
> > at subtle lockless algorithms, why not look at doing the common cases of 
> > an extending brk? Make that one take the mmap_sem for _reading_, and then 
> > do the extending of the brk area with a simple cmpxchg or something?
> 
> I didn't use cmpxchg, because we actually want to update both 
> 'current->brk' _and_ the vma->vm_end atomically, so here's a totally 
> untested patch that uses the page_table_lock spinlock for it instead (it 
> could be a new spinlock, not worth it).
> 
> It's also totally untested and might be horribly broken. But you get the 
> idea.
> 
> We could probably do things like this in regular mmap() too for the 
> "extend a mmap" case. brk() is just especially simple.

One question on the final test...

> 		Linus
> 
> ---
>  mm/mmap.c |   89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 files changed, 79 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index ee22989..3d07e5f 100644

[ . . . ]

> +	if (!vma)
> +		goto slow_case;
> +
> +	spin_lock(&mm->page_table_lock);
> +	if (vma->vm_end == cur_brk) {
> +		vma->vm_end = brk;
> +		mm->brk = brk;
> +		cur_brk = brk;
> +	}
> +	spin_unlock(&mm->page_table_lock);
> +
> +	if (cur_brk != brk)

Can this be "if (cur_brk < brk)"?  Seems like it should, given the
earlier tests, but I don't claim to understand the VM code.

							Thanx, Paul

> +		goto slow_case;
> +
> +out:
> +	up_read(&mm->mmap_sem);
> +	return cur_brk;
> +
> +slow_case:
> +	up_read(&mm->mmap_sem);
> +	return slow_brk(brk);
> +}
> +
>  #ifdef DEBUG_MM_RB
>  static int browse_rb(struct rb_root *root)
>  {


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 18:44                   ` Linus Torvalds
  2010-01-07 19:20                     ` Paul E. McKenney
@ 2010-01-07 19:24                     ` Christoph Lameter
  2010-01-07 20:08                       ` Linus Torvalds
  2010-01-07 21:44                     ` Peter Zijlstra
  2 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-07 19:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 7 Jan 2010, Linus Torvalds wrote:

> +	if (brk < cur_brk)
> +		goto slow_case;
> +	if (brk == cur_brk)
> +		goto out;
> +
> +	vma = ok_to_extend_brk(mm, cur_brk, brk);
> +	if (!vma)
> +		goto slow_case;
> +
> +	spin_lock(&mm->page_table_lock);


page_table_lock used to serialize multiple fast brks?

CONFIG_SPLIT_PTLOCK implies that code will not use this lock in fault
handling. So no serialization with faults.

Also the current code assumes vm_end and so on to be stable if mmap_sem is
held. F.e. find_vma() from do_fault is now running while vm_end may be changing
under it.



^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 19:20                     ` Paul E. McKenney
@ 2010-01-07 20:06                       ` Linus Torvalds
  2010-01-07 20:25                         ` Paul E. McKenney
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 20:06 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Paul E. McKenney wrote:
> > +
> > +	spin_lock(&mm->page_table_lock);
> > +	if (vma->vm_end == cur_brk) {
> > +		vma->vm_end = brk;
> > +		mm->brk = brk;
> > +		cur_brk = brk;
> > +	}
> > +	spin_unlock(&mm->page_table_lock);
> > +
> > +	if (cur_brk != brk)
> 
> Can this be "if (cur_brk < brk)"?  Seems like it should, given the
> earlier tests, but I don't claim to understand the VM code.

It's really just a flag, to test whether the final check (inside the 
spinlock) succeeded, or whether we perhaps raced with _another_ brk() call 
that also had the mmap_sem for reading.

We know that cur_brk was different from brk before - because otherwise 
we'd have just returned early (or done the slow case). So testing whether 
it's different afterwards really only tests whether that 

	cur_brk = brk;

statement was executed or not.

I could have used a separate flag called "success" or something. 

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 19:24                     ` Christoph Lameter
@ 2010-01-07 20:08                       ` Linus Torvalds
  2010-01-07 20:13                         ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 20:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Christoph Lameter wrote:
> 
> page_table_lock used to serialize multiple fast brks?
> 
> CONFIG_SPLIT_PTLOCK implies that code will not use this lock in fault
> handling. So no serialization with faults.

Correct. The faults we do not need to serialize with. It doesn't matter 
whether they see the old or the new end. 

> Also the current code assumes vm_end and so on to be stable if mmap_sem is
> held. F.e. find_vma() from do_fault is now running while vm_end may be changing
> under it.

Again, it doesn't matter. Old or new - if some other thread looks up the 
vma, either is fine.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 20:08                       ` Linus Torvalds
@ 2010-01-07 20:13                         ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 20:13 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Linus Torvalds wrote:
> 
> Again, it doesn't matter. Old or new - if some other thread looks up the 
> vma, either is fine.

Btw, don't get me wrong. That patch may compile (I checked it), but I am 
not in any way claiming that it is anything else than a total throw-away 
"this is something we could look at doing" suggestion.

For example, I'm not at all wedded to using 'mm->page_table_lock': I in 
fact wanted to use a per-vma lock, but I picked a lock we already had. The 
fact that picking a lock we already had also means that it serializes page 
table updates (sometimes) is actually a downside, not a good thing. 

So the patch was meant to get people thinking about alternatives, rather 
than anything else.

The point being that there are things we can play with on mmap_sem, that 
don't involve getting rid of it - just being a bit more aggressive in how 
we use it.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 20:06                       ` Linus Torvalds
@ 2010-01-07 20:25                         ` Paul E. McKenney
  0 siblings, 0 replies; 121+ messages in thread
From: Paul E. McKenney @ 2010-01-07 20:25 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, Jan 07, 2010 at 12:06:48PM -0800, Linus Torvalds wrote:
> 
> 
> On Thu, 7 Jan 2010, Paul E. McKenney wrote:
> > > +
> > > +	spin_lock(&mm->page_table_lock);
> > > +	if (vma->vm_end == cur_brk) {
> > > +		vma->vm_end = brk;
> > > +		mm->brk = brk;
> > > +		cur_brk = brk;
> > > +	}
> > > +	spin_unlock(&mm->page_table_lock);
> > > +
> > > +	if (cur_brk != brk)
> > 
> > Can this be "if (cur_brk < brk)"?  Seems like it should, given the
> > earlier tests, but I don't claim to understand the VM code.
> 
> It's really just a flag, to test whether the final check (inside the 
> spinlock) succeeded, or whether we perhaps raced with _another_ brk() call 
> that also had the mm_sem for reading.
> 
> We know that cur_brk was different from brk before - because otherwise 
> we'd have just returned early (or done the slow case). So testing whether 
> it's different afterwards really only tests whether that 
> 
> 	cur_brk = brk;
> 
> statement was executed or not.
> 
> I could have used a separate flag called "success" or something. 

I was (perhaps confusedly) thinking of a pair of threads both trying
to sbrk() at the same time.  One of them wins by acquiring the
->page_table_lock first.  Then the other acquires the spinlock, but
sees vma->vm_end != cur_brk.  But if the first one extended the region
at least as far as the second intended to, the second one's work is done.

Of course, we can debate the sanity of an application that actually does
concurrent sbrk() calls.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 18:44                   ` Linus Torvalds
  2010-01-07 19:20                     ` Paul E. McKenney
  2010-01-07 19:24                     ` Christoph Lameter
@ 2010-01-07 21:44                     ` Peter Zijlstra
  2010-01-07 22:33                       ` Linus Torvalds
  2 siblings, 1 reply; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-07 21:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 2010-01-07 at 10:44 -0800, Linus Torvalds wrote:
> 
> On Thu, 7 Jan 2010, Linus Torvalds wrote:
> > 
> > For example: there's no real reason why we take mmap_sem for writing when 
> > extending an existing vma. And while 'brk()' is a very old-fashioned way of 
> > doing memory management, it's still quite common. So rather than looking 
> > at subtle lockless algorithms, why not look at doing the common cases of 
> > an extending brk? Make that one take the mmap_sem for _reading_, and then 
> > do the extending of the brk area with a simple cmpxchg or something?
> 
> I didn't use cmpxchg, because we actually want to update both 
> 'current->brk' _and_ the vma->vm_end atomically, so here's a totally 
> untested patch that uses the page_table_lock spinlock for it instead (it 
> could be a new spinlock, not worth it).
> 
> It's also totally untested and might be horribly broken. But you get the 
> idea.
> 
> We could probably do things like this in regular mmap() too for the 
> "extend a mmap" case. brk() is just especially simple.

I haven't yet looked at the patch, but isn't expand_stack() kinda like
what you want? That serializes using anon_vma_lock().


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 18:15                     ` Linus Torvalds
@ 2010-01-07 21:49                       ` Peter Zijlstra
  0 siblings, 0 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-07 21:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 2010-01-07 at 10:15 -0800, Linus Torvalds wrote:
> 
> On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> > 
> > Well, with that sync_vma() thing I posted the other day all the
> > speculative page fault needs is a write to current->fault_vma, the ptl
> > and an O(nr_threads) loop on unmap() for file vmas -- aside from writing
> > the pte itself of course.
> 
> How do you handle the race of
> 
> 	fault handler:			munmap
> 
> 	look up vma without locks
> 
> 	.. interrupt happens or
> 	  something delays things ..
> 					remove the vma from the list
> 					sync_vma()
> 
> 	current->fault_vma = vma
> 
> etc?
> 
> Maybe I'm missing something, but quite frankly, the above looks pretty 
> damn fundamental. Something needs to take a lock. If you get rid of the 
> mmap_sem, you need to replace it with another lock.
> 
> There's no sane way to "look up vma and atomically mark it as busy" 
> without locks. You can do it with extra work, ie something like
> 
> 	look up vma without locks
> 	current->fault_vma = vma
> 	smp_mb();
> 	check that the vma is still on the list
> 
> (with the appropriate barriers on the munmap side too, of course) where you 
> avoid the lock by basically turning it into an ordering problem, but it 
> ends up being pretty much as expensive anyway for single threads.

As expensive for single threads is just fine with me, as long as we get
cheaper with lots of threads.

I basically did the above revalidation with seqlocks: we look up the vma,
mark it busy and revalidate the vma sequence count.

> And for lots and lots of threads, you now made that munmap be pretty 
> expensive.

Yeah, I really hate that part too, trying hard to come up with something
smarter but my brain isn't yet co-operating :-)
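
For reference, that revalidation has roughly this shape (a sketch with
approximate names, not the exact patch):

	struct vm_area_struct *vma;
	unsigned seq;

again:
	vma = find_vma_nolock(mm, address);	/* hypothetical lockless lookup */
	seq = read_seqcount_begin(&vma->vm_sequence);
	current->fault_vma = vma;		/* mark busy */
	smp_mb();
	if (read_seqcount_retry(&vma->vm_sequence, seq)) {
		current->fault_vma = NULL;	/* vma mutated under us */
		goto again;
	}
	/* ... handle the fault against a now-stable vma ... */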


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 21:44                     ` Peter Zijlstra
@ 2010-01-07 22:33                       ` Linus Torvalds
  2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-07 22:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Arjan van de Ven, Paul E. McKenney,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Peter Zijlstra wrote:
>
> I haven't yet looked at the patch, but isn't expand_stack() kinda like
> what you want? That serializes using anon_vma_lock().

Yeah, that sounds like the right thing to do.  It is the same operation, 
after all (and has the same effects, especially for the special case of 
upwards-growing stacks).

So basically the idea is to extend that stack expansion to brk(), and 
possibly mmap() in general.

Doing the same for munmap() (or shrinking things in general, which you can 
do with brk but not with the stack) is quite a bit harder. As can be seen 
by the fact that all the problems with the speculative approach are in the 
unmap cases.

But the good news is that shrinking mappings is _much_ less common than 
growing them. Many memory allocators never shrink at all, or shrink only 
when they hit certain big chunks. In a lot of cases, the only time you 
shrink a mapping ends up being at the final exit, which doesn't have any 
locking issues anyway, since even if we take the mmap_sem lock for 
writing, there aren't going to be any readers possibly left.

And a lot of growing mmaps end up just extending an old one. No, not 
always, but I suspect that if we really put some effort into it, we could 
probably make the write-lock frequency go down by something like an order 
of magnitude on many loads.

Not all loads, no. Some loads will do a lot of file mmap's, or use 
MAP_FIXED and/or mprotect to split vma's on purpose. But that is certainly 
not likely to be the common case.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 17:36               ` Linus Torvalds
  2010-01-07 17:49                 ` Linus Torvalds
@ 2010-01-07 23:51                 ` Rik van Riel
  1 sibling, 0 replies; 121+ messages in thread
From: Rik van Riel @ 2010-01-07 23:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	Paul E. McKenney, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar

On 01/07/2010 12:36 PM, Linus Torvalds wrote:
> On Thu, 7 Jan 2010, Peter Zijlstra wrote:
>>
>> Right, supposing we can make this speculative fault stuff work, then we
>> can basically reduce the mmap_sem usage in fault to:
>>
>>    - allocating new page tables
>>    - extending the growable vmas
>>
>> And do everything else without holding it, including zeroing and IO.
>
> Well, I have yet to hear a realistic scenario of _how_ to do it all
> speculatively in the first place, at least not without horribly subtle
> complexity issues. So I'd really rather see how far we can possibly get by
> just improving mmap_sem.

I would like to second this sentiment.  I am trying to make the
anon_vma and rmap bits more scalable for multi-process server
workloads and it is quite worrying how complex locking already
is.

-- 
All rights reversed.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 22:33                       ` Linus Torvalds
@ 2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
  2010-01-08  0:25                           ` KAMEZAWA Hiroyuki
  2010-01-08  0:39                           ` Linus Torvalds
  0 siblings, 2 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-08  0:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	Paul E. McKenney, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 7 Jan 2010 14:33:50 -0800 (PST)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> >
> > I haven't yet looked at the patch, but isn't expand_stack() kinda like
> > what you want? That serializes using anon_vma_lock().
> 
> Yeah, that sounds like the right thing to do.  It is the same operation, 
> after all (and has the same effects, especially for the special case of 
> upwards-growing stacks).
> 
> So basically the idea is to extend that stack expansion to brk(), and 
> possibly mmap() in general.
> 
Hmm, do_brk() sometimes unmaps conflicting mappings. Isn't that a problem?
Stack expansion just fails with SEGV if it hits other mmaps....

Thanks,
-Kame






^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
@ 2010-01-08  0:25                           ` KAMEZAWA Hiroyuki
  2010-01-08  0:39                           ` Linus Torvalds
  1 sibling, 0 replies; 121+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-08  0:25 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Linus Torvalds, Peter Zijlstra, Christoph Lameter,
	Arjan van de Ven, Paul E. McKenney, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, 8 Jan 2010 09:23:33 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 7 Jan 2010 14:33:50 -0800 (PST)
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> > 
> > 
> > On Thu, 7 Jan 2010, Peter Zijlstra wrote:
> > >
> > > I haven't yet looked at the patch, but isn't expand_stack() kinda like
> > > what you want? That serializes using anon_vma_lock().
> > 
> > Yeah, that sounds like the right thing to do.  It is the same operation, 
> > after all (and has the same effects, especially for the special case of 
> > upwards-growing stacks).
> > 
> > So basically the idea is to extend that stack expansion to brk(), and 
> > possibly mmap() in general.
> > 
> Hmm, do_brk() sometimes unmaps conflicting mappings. Isn't that a problem?
> Stack expansion just fails with SEGV if it hits other mmaps....
Sorry, please ignore the last line...

Sorry for noise.
-Kame


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
  2010-01-08  0:25                           ` KAMEZAWA Hiroyuki
@ 2010-01-08  0:39                           ` Linus Torvalds
  2010-01-08  0:41                             ` Linus Torvalds
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08  0:39 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	Paul E. McKenney, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, KAMEZAWA Hiroyuki wrote:
> 
> Hmm, do_brk() sometimes unmaps conflicting mappings. Isn't that a problem?

No. For two reasons:

 - sys_brk() doesn't actually do that (see the "find_vma_intersection()" 
   call). I'm not sure why do_brk() does, but it might have to do with 
   execve().

 - the patch I sent out just falls back to the old code if it finds 
   something fishy, so it will do whatever do_brk() does regardless.

(Yes, brk() does unmap the old brk for the _shrinking_ case, of course. 
Again, the patch I sent just falls back to the old behavior in that case)

		Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08  0:39                           ` Linus Torvalds
@ 2010-01-08  0:41                             ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08  0:41 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Peter Zijlstra, Christoph Lameter, Arjan van de Ven,
	Paul E. McKenney, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Linus Torvalds wrote:
> 
>  - the patch I sent out just falls back to the old code if it finds 
>    something fishy, so it will do whatever do_brk() does regardless.

Btw, I'd like to state it again - the patch I sent out was not ready to be 
applied. I'm pretty sure it should check things like certain vm_flags 
too (VM_LOCKED etc), and fall back for those cases as well.
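
For instance (illustrative, certainly not an exhaustive set of flags):

	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB | VM_GROWSDOWN))
		goto slow_case;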

So the patch was more meant to illustrate the _concept_ than meant to 
necessarily be taken seriously as-is.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-07 16:36             ` Christoph Lameter
@ 2010-01-08  4:49               ` Arjan van de Ven
  2010-01-08  5:00                 ` Linus Torvalds
  2010-01-08 15:51                 ` Christoph Lameter
  0 siblings, 2 replies; 121+ messages in thread
From: Arjan van de Ven @ 2010-01-08  4:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Linus Torvalds, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 7 Jan 2010 10:36:52 -0600 (CST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 7 Jan 2010, Linus Torvalds wrote:
> 
> > You're missing what Arjan said - the jav workload does a lot of
> > memory allocations too, causing mmap/munmap.
> 
> Well isn't that tunable on the app level? Get bigger chunks of memory
> in order to reduce the frequency of mmap operations? If you want
> concurrency of faults then mmap_sem write locking currently needs to
> be limited.

if an app has to change because our kernel sucks (for no good reason),
"change the app" really is the lame type of answer.


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08  4:49               ` Arjan van de Ven
@ 2010-01-08  5:00                 ` Linus Torvalds
  2010-01-08 15:51                 ` Christoph Lameter
  1 sibling, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08  5:00 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Christoph Lameter, Peter Zijlstra, Paul E. McKenney,
	Peter Zijlstra, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	minchan.kim, hugh.dickins, Nick Piggin, Ingo Molnar



On Thu, 7 Jan 2010, Arjan van de Ven wrote:
> 
> if an app has to change because our kernel sucks (for no good reason),
> "change the app" really is the lame type of answer.

Well, in all fairness, I doubt other kernels do any better. So changing 
the app is likely to help in general, and thus be at least part of the 
right solution.

But as outlined, we _can_ almost certainly do better on many simple and 
common cases, so changing the kernel - in addition to fixing app memory 
allocation patterns - sounds like a good avenue.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08  4:49               ` Arjan van de Ven
  2010-01-08  5:00                 ` Linus Torvalds
@ 2010-01-08 15:51                 ` Christoph Lameter
  2010-01-09 15:55                   ` Arjan van de Ven
  1 sibling, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-08 15:51 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Linus Torvalds, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Thu, 7 Jan 2010, Arjan van de Ven wrote:

> if an app has to change because our kernel sucks (for no good reason),
> "change the app" really is the lame type of answer.

We are changing apps all the time here to reduce the number of system
calls. Any system call usually involves context switching, scheduling
activity, etc. Those are evil effects if you want the processor for
computation and are sensitive to cpu caching effects. It is good to
reduce the number of system calls as much as possible.

System calls are best placed so that each one affects the largest range
of memory possible in a given context, and they should be avoided in
loops.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-06  4:20                           ` Linus Torvalds
  2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
@ 2010-01-08 16:53                             ` Peter Zijlstra
  2010-01-08 17:22                               ` Linus Torvalds
  2010-01-09 14:47                               ` Ed Tomlinson
  1 sibling, 2 replies; 121+ messages in thread
From: Peter Zijlstra @ 2010-01-08 16:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney, linux-kernel,
	linux-mm, cl, hugh.dickins, Nick Piggin, Ingo Molnar

On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
> 
> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > > 
> > > Of course, your other load with MADV_DONTNEED seems to be horrible, and 
> > > has some nasty spinlock issues, but that looks like a separate deal (I 
> > > assume that load is just very hard on the pgtable lock).
> > 
> > It's zone->lock, I guess. My test program avoids the pgtable lock problem.
> 
> Yeah, I should have looked more at your callchain. That's nasty. Much 
> worse than the per-mm lock. I thought the page buffering would avoid the 
> zone lock becoming a huge problem, but clearly not in this case.

Right, so I ran some numbers on a multi-socket (2) machine as well:

                               pf/min

-tip                          56398626
-tip + xadd                  174753190
-tip + speculative           189274319
-tip + xadd + speculative    200174641

[ variance is around 0.5% for this workload, ran most of these numbers
with --repeat 5 ]

At both the xadd/speculative point the workload is dominated by the
zone->lock, the xadd+speculative removes some of the contention, and
removing the various RSS counters could yield another few percent
according to the profiles, but then we're pretty much there.

One way around those RSS counters is to track them per task; a quick grep
shows it's only the oom-killer and proc that use them.

A quick hack removing them gets us: 203158058

So from a throughput point of view, the whole speculative fault thing might
not be interesting until the rest of the VM gets a lift to go along with it.

From a blocking-on-mmap_sem point of view, I think Linus is right in that we
should first consider things like dropping mmap_sem around IO and page
zeroing, and generally look at reducing hold times and such.

So while I think it's quite feasible to do these speculative faults, it
appears we're not quite ready for them.

Maybe I can get -rt to carry it for a while; there we have to reduce
mmap_sem to a mutex, which hurts a lot.



^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 16:53                             ` Peter Zijlstra
@ 2010-01-08 17:22                               ` Linus Torvalds
  2010-01-08 17:43                                 ` Christoph Lameter
  2010-01-09 14:47                               ` Ed Tomlinson
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 17:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney, linux-kernel,
	linux-mm, cl, hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Peter Zijlstra wrote:

> On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
> > 
> > Yeah, I should have looked more at your callchain. That's nasty. Much 
> > worse than the per-mm lock. I thought the page buffering would avoid the 
> > zone lock becoming a huge problem, but clearly not in this case.
> 
> Right, so I ran some numbers on a multi-socket (2) machine as well:
> 
>                                pf/min
> 
> -tip                          56398626
> -tip + xadd                  174753190
> -tip + speculative           189274319
> -tip + xadd + speculative    200174641
> 
> [ variance is around 0.5% for this workload, ran most of these numbers
> with --repeat 5 ]

That's a huge jump. It's clear that the spinlock-based rwsems simply
suck.  The speculation gets rid of some additional mmap_sem contention, 
but at least for two sockets it looks like the rwsem implementation was 
the biggest problem by far.
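
For concreteness, a much-simplified userspace sketch of the two reader
paths being compared (C11 atomics; this is an illustration of the idea,
not the kernel's actual rwsem code, and writer handling and waiter
queues are omitted entirely):

#include <stdatomic.h>
#include <stdbool.h>

/* Spinlock-based fallback: every down_read() serializes on a single
 * lock word, so concurrent readers bounce that cacheline around in
 * exclusive mode. */
struct rwsem_spin {
	atomic_flag lock;
	long readers;
};

static void down_read_spin(struct rwsem_spin *s)
{
	while (atomic_flag_test_and_set_explicit(&s->lock,
						 memory_order_acquire))
		;			/* all readers contend here */
	s->readers++;
	atomic_flag_clear_explicit(&s->lock, memory_order_release);
}

/* xadd-style fast path: taking the lock for read is one atomic add;
 * in this simplified scheme a negative count means a writer is active
 * and the caller must fall back to an (omitted) slow path. */
struct rwsem_xadd {
	atomic_long count;
};

static bool down_read_xadd(struct rwsem_xadd *s)
{
	return atomic_fetch_add_explicit(&s->count, 1,
					 memory_order_acquire) >= 0;
}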

> At both the xadd/speculative point the workload is dominated by the
> zone->lock, the xadd+speculative removes some of the contention, and
> removing the various RSS counters could yield another few percent
> according to the profiles, but then we're pretty much there.

I don't know if worrying about a few percent is worth it. "Perfect is the 
enemy of good", and the workload is pretty dang artificial with the whole 
"remove pages and re-fault them as fast as you can".

So the benchmark is pointless and extreme, and I think it's not worth 
worrying too much about details. Especially when compared to the
*three-fold* jump from just the fairly trivial rwsem implementation change
(with speculation on top of it then adding another 15% improvement - 
nothing to sneeze at, but it's still in a different class).

Of course, larger numbers of sockets will likely change the situation, but 
at the same time I do suspect that workloads designed for hundreds of 
cores will need to try to behave better than that benchmark anyway ;)

> One way around those RSS counters is to track them per task; a quick grep
> shows it's only the oom-killer and proc that use them.
> 
> A quick hack removing them gets us: 203158058

Yeah, well.. After that 200% and 15% improvement, a 1.5% improvement on a 
totally artificial benchmark looks less interesting.

Because let's face it - if your workload does several million page faults 
per second, you're just doing something fundamentally _wrong_.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 17:22                               ` Linus Torvalds
@ 2010-01-08 17:43                                 ` Christoph Lameter
  2010-01-08 17:52                                   ` Linus Torvalds
                                                     ` (2 more replies)
  0 siblings, 3 replies; 121+ messages in thread
From: Christoph Lameter @ 2010-01-08 17:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, 8 Jan 2010, Linus Torvalds wrote:

> That's a huge jump. It's clear that the spinlock-based rwsems simply
> suck.  The speculation gets rid of some additional mmap_sem contention,
> but at least for two sockets it looks like the rwsem implementation was
> the biggest problem by far.

I'd say that the ticket lock sucks for short critical sections vs. a
simple spinlock since it forces the cacheline into shared mode.
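
For readers following along, a minimal userspace sketch of the two lock
types (C11 atomics; an illustration of the idea, not the kernel's
implementation):

#include <stdatomic.h>

/* Simple test-and-set spinlock: unfair, and waiters do hammer the line
 * with RMWs, but an uncontended acquire/release keeps the lock word
 * exclusive in the owning cpu's cache. */
struct tas_lock {
	atomic_flag locked;
};

static void tas_acquire(struct tas_lock *l)
{
	while (atomic_flag_test_and_set_explicit(&l->locked,
						 memory_order_acquire))
		;
}

static void tas_release(struct tas_lock *l)
{
	atomic_flag_clear_explicit(&l->locked, memory_order_release);
}

/* Ticket lock: FIFO-fair, but every waiter polls 'owner', pulling the
 * cacheline into shared mode, and each release then invalidates all of
 * those shared copies again. */
struct ticket_lock {
	atomic_uint next;	/* next ticket to hand out */
	atomic_uint owner;	/* ticket currently being served */
};

static void ticket_acquire(struct ticket_lock *l)
{
	unsigned int me = atomic_fetch_add_explicit(&l->next, 1,
						    memory_order_relaxed);
	while (atomic_load_explicit(&l->owner, memory_order_acquire) != me)
		;		/* spin until our turn comes up */
}

static void ticket_release(struct ticket_lock *l)
{
	atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
}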

> Of course, larger numbers of sockets will likely change the situation, but
> at the same time I do suspect that workloads designed for hundreds of
> cores will need to try to behave better than that benchmark anyway ;)

Can we at least consider a typical standard business server, dual quad
core hyperthreaded with 16 "cpus"? Cacheline contention will increase
significantly there.

> Because let's face it - if your workload does several million page faults
> per second, you're just doing something fundamentally _wrong_.

You may just want to get your app running, and it's trying to initialize
its memory in parallel on all threads. Nothing wrong with that.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 17:43                                 ` Christoph Lameter
@ 2010-01-08 17:52                                   ` Linus Torvalds
  2010-01-08 18:33                                     ` Christoph Lameter
  2010-01-08 18:46                                   ` Andi Kleen
  2010-01-08 21:36                                   ` Linus Torvalds
  2 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 17:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Christoph Lameter wrote:
> 
> Can we at least consider a typical standard business server, dual quad
> core hyperthreaded with 16 "cpus"? Cacheline contention will increase
> significantly there.

I bet it won't be a problem. It's when things go cross-socket that they 
suck. So 16 cpu's across two sockets I wouldn't worry about.

> > Because let's face it - if your workload does several million page faults
> > per second, you're just doing something fundamentally _wrong_.
> 
> You may just want to get your app running, and it's trying to initialize
> its memory in parallel on all threads. Nothing wrong with that.

Umm. That's going to be limited by the memset/memcpy, not the rwlock, I 
bet.

The benchmark in question literally did a single byte write to each page 
in order to show just the kernel component. That really isn't realistic 
for any real load.
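
A guess at the shape of such a microbenchmark (a sketch, not the actual
test program from this thread; sizes and pass counts are arbitrary):
touch one byte per page to take the fault, then drop the pages so the
next pass faults them in again.

#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 4096 * page;	/* hypothetical working set */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	for (int pass = 0; pass < 100; pass++) {
		/* One byte per page: almost pure page-fault work. */
		for (size_t off = 0; off < len; off += page)
			buf[off] = 1;
		/* Throw the pages away so they fault again next pass. */
		madvise(buf, len, MADV_DONTNEED);
	}
	munmap(buf, len);
	return 0;
}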

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 17:52                                   ` Linus Torvalds
@ 2010-01-08 18:33                                     ` Christoph Lameter
  0 siblings, 0 replies; 121+ messages in thread
From: Christoph Lameter @ 2010-01-08 18:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, 8 Jan 2010, Linus Torvalds wrote:

> I bet it won't be a problem. It's when things go cross-socket that they
> suck. So 16 cpu's across two sockets I wouldn't worry about.
>
> > > Because let's face it - if your workload does several million page faults
> > > per second, you're just doing something fundamentally _wrong_.
> >
> > You may just want to get your app running, and it's trying to initialize
> > its memory in parallel on all threads. Nothing wrong with that.
>
> Umm. That's going to be limited by the memset/memcpy, not the rwlock, I
> bet.

That may be true for a system with 2 threads. As the number of threads
increases, so does the cacheline contention. In larger systems the
memset/memcpy is negligible.

> The benchmark in question literally did a single byte write to each page
> in order to show just the kernel component. That really isn't realistic
> for any real load.

Each anon fault also includes zeroing the page before it's ready to be
written to. The cachelines will be hot after a fault, and initialization of
any variables in the page will be fast due to that warming-up effect.



^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 17:43                                 ` Christoph Lameter
  2010-01-08 17:52                                   ` Linus Torvalds
@ 2010-01-08 18:46                                   ` Andi Kleen
  2010-01-08 18:56                                     ` Christoph Lameter
  2010-01-08 21:36                                   ` Linus Torvalds
  2 siblings, 1 reply; 121+ messages in thread
From: Andi Kleen @ 2010-01-08 18:46 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Linus Torvalds, Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar

Christoph Lameter <cl@linux-foundation.org> writes:

> Can we at least consider a typical standard business server, dual quad
> core hyperthreaded with 16 "cpus"? Cacheline contention will increase
> significantly there.

This year's standard server will be more like 24-64 "cpus"

Cheating with locks usually doesn't work anymore at these sizes.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 18:46                                   ` Andi Kleen
@ 2010-01-08 18:56                                     ` Christoph Lameter
  2010-01-08 19:10                                       ` Andi Kleen
  2010-01-08 19:11                                       ` Linus Torvalds
  0 siblings, 2 replies; 121+ messages in thread
From: Christoph Lameter @ 2010-01-08 18:56 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Linus Torvalds, Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar

On Fri, 8 Jan 2010, Andi Kleen wrote:

> This year's standard server will be more like 24-64 "cpus"

What will it be? 2 or 4 sockets?

> Cheating with locks usually doesn't work anymore at these sizes.

Cheating means?


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 18:56                                     ` Christoph Lameter
@ 2010-01-08 19:10                                       ` Andi Kleen
  2010-01-08 19:11                                       ` Linus Torvalds
  1 sibling, 0 replies; 121+ messages in thread
From: Andi Kleen @ 2010-01-08 19:10 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Linus Torvalds, Peter Zijlstra, KAMEZAWA Hiroyuki,
	Minchan Kim, Paul E. McKenney, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, Jan 08, 2010 at 12:56:08PM -0600, Christoph Lameter wrote:
> On Fri, 8 Jan 2010, Andi Kleen wrote:
> 
> > This year's standard server will be more like 24-64 "cpus"
> 
> What will it be? 2 or 4 sockets?

2-4 sockets.
 
> > Cheating with locks usually doesn't work anymore at these sizes.
> 
> Cheating means?

Not really fixing the underlying data structure problem.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 18:56                                     ` Christoph Lameter
  2010-01-08 19:10                                       ` Andi Kleen
@ 2010-01-08 19:11                                       ` Linus Torvalds
  2010-01-08 19:28                                         ` Andi Kleen
  1 sibling, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 19:11 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, hugh.dickins,
	Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Christoph Lameter wrote:

> On Fri, 8 Jan 2010, Andi Kleen wrote:
> 
> > This year's standard server will be more like 24-64 "cpus"
> 
> What will it be? 2 or 4 sockets?

I think we can be pretty safe in saying that two sockets is going to be 
overwhelmingly the more common case.

It's simply physics and form factor. It's hard to put four powerful CPUs
on one board in any normal form-factor, so when you go from 2->4 sockets, 
you almost inevitably have to go to rather fancier form-factors (or 
low-power sockets designed for socket-density rather than multi-core 
density, which is kind of against the point these days).

So often you end up with CPU daughter-cards etc, which involve a lot more
design and cost, and no longer fit in standard desktop enclosures for 
people who want stand-alone servers etc (or even in rack setups if you 
want local disks too etc).

Think about it this way: just four sockets and associated per-socket RAM 
DIMMs (never mind anything else) take up a _lot_ of space. And you can
only make your boards so big before they start having purely mechanical
issues due to flexing etc.

Which is why I suspect that two sockets will be the bulk of the server 
space for the foreseeable future. It's been true before, and multiple
memory channels per socket to feed all those cores are just making it even 
more so.

				Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 19:11                                       ` Linus Torvalds
@ 2010-01-08 19:28                                         ` Andi Kleen
  2010-01-08 19:39                                           ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Andi Kleen @ 2010-01-08 19:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Lameter, Andi Kleen, Peter Zijlstra, KAMEZAWA Hiroyuki,
	Minchan Kim, Paul E. McKenney, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, Jan 08, 2010 at 11:11:32AM -0800, Linus Torvalds wrote:
> 
> 
> On Fri, 8 Jan 2010, Christoph Lameter wrote:
> 
> > On Fri, 8 Jan 2010, Andi Kleen wrote:
> > 
> > > This year's standard server will be more like 24-64 "cpus"
> > 
> > What will it be? 2 or 4 sockets?
> 
> I think we can be pretty safe in saying that two sockets is going to be 
> overwhelmingly the more common case.

With 24 CPU threads cheating is very difficult too.

Besides, even the "uncommon" part of a large pie can still be a lot of systems.

-Andi


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 19:28                                         ` Andi Kleen
@ 2010-01-08 19:39                                           ` Linus Torvalds
  2010-01-08 19:42                                             ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 19:39 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Christoph Lameter, Peter Zijlstra, KAMEZAWA Hiroyuki,
	Minchan Kim, Paul E. McKenney, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Andi Kleen wrote:
> 
> With 24 CPU threads cheating is very difficult too.

Stop making value judgements in your word choice, like "cheating".

The fact is, the mmap_sem is per-mm, and works perfectly well. Other 
locking can be vma-specific, but as already mentioned, it's not going to 
_help_, since most of the time even hugely threaded programs will be using 
malloc-like functionality and you have allocations not only across threads,
but in general using the same vma. 

Another fact is simply that you shouldn't write your app so that it needs 
to do millions of page faults per second.

So this whole "cheating" argument of yours is total bullshit. It bears no 
relation to reality.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 19:39                                           ` Linus Torvalds
@ 2010-01-08 19:42                                             ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 19:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Christoph Lameter, Peter Zijlstra, KAMEZAWA Hiroyuki,
	Minchan Kim, Paul E. McKenney, linux-kernel, linux-mm,
	hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Linus Torvalds wrote:
> 
> Another fact is simply that you shouldn't write your app so that it needs 
> to do millions of page faults per second.

Side note: an app with lots of threads, that needs to fault in lots of
pages at startup time, and has performance problems, can - and should -
likely use interfaces that are _designed_ for that.

There are things like madvise(MADV_WILLNEED) etc, which can batch up the
filling of a memory area, as sketched below.
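
A sketch of that batch-fill pattern (the file name and the single
mapping are made up for the illustration): one madvise() call starts
filling the whole area before the worker threads ever touch it.

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("big-data.bin", O_RDONLY);	/* hypothetical input */
	struct stat st;
	char *p;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;

	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Batch fill: ask the kernel to populate the whole mapping up
	 * front, instead of taking a page fault per 4K in every thread. */
	madvise(p, st.st_size, MADV_WILLNEED);

	/* ... hand 'p' to the worker threads here ... */

	munmap(p, st.st_size);
	return 0;
}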

If you're doing a performance-sensitive application with hundreds of 
threads, and hundreds of gigabytes of data, you had better know about 
simple concepts like "batch fill", rather than whine about "oh my, my 
totally special-case app takes a few seconds to start because I'M A 
F*CKING MORON AND DID EVERYTHING WRONG".



			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 17:43                                 ` Christoph Lameter
  2010-01-08 17:52                                   ` Linus Torvalds
  2010-01-08 18:46                                   ` Andi Kleen
@ 2010-01-08 21:36                                   ` Linus Torvalds
  2010-01-08 21:46                                     ` Christoph Lameter
  2 siblings, 1 reply; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 21:36 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Christoph Lameter wrote:
> 
> I'd say that the ticket lock sucks for short critical sections vs. a
> simple spinlock since it forces the cacheline into shared mode.

Btw, I do agree that it's likely made worse by the fairness of the ticket 
locks and the resulting extra traffic of people waiting for their turn. 
Often for absolutely no good reason, since in this case the rwlock itself 
will then be granted for reading in most cases - and there are no 
ordering issues on readers.

We worried about the effects of fair spinlocks when introducing the ticket 
locks, but nobody ever actually had a load that seemed to indicate it made 
much of a difference, and we did have a few cases where starvation was a 
very noticeable problem.

				Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 21:36                                   ` Linus Torvalds
@ 2010-01-08 21:46                                     ` Christoph Lameter
  2010-01-08 22:43                                       ` Linus Torvalds
  0 siblings, 1 reply; 121+ messages in thread
From: Christoph Lameter @ 2010-01-08 21:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, 8 Jan 2010, Linus Torvalds wrote:

> We worried about the effects of fair spinlocks when introducing the ticket
> locks, but nobody ever actually had a load that seemed to indicate it made
> much of a difference, and we did have a few cases where starvation was a
> very noticeable problem.

And I made the point that starvation was a hardware issue due to immature
cacheline handling. Now the software patchup job for the hardware breakage
is causing regressions for everyone.



^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 21:46                                     ` Christoph Lameter
@ 2010-01-08 22:43                                       ` Linus Torvalds
  0 siblings, 0 replies; 121+ messages in thread
From: Linus Torvalds @ 2010-01-08 22:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, hugh.dickins, Nick Piggin, Ingo Molnar



On Fri, 8 Jan 2010, Christoph Lameter wrote:
> 
> And I made the point that starvation was a hardware issue due to immature
> cacheline handling. Now the software patchup job for the hardware breakage
> is causing regressions for everyone.

Well, in all fairness, (a) existing hardware doesn't do a good job, and 
would have a really hard time doing so in general (ie the whole issue of 
on-die vs directly-between-sockets vs between-complex-fabric), and (b) in 
this case, the problem really was that the x86-64 rwsems were badly 
implemented.

The fact that somebody _thought_ that it might be ok to do them with 
spinlocks and had done some limited testing without ever hitting the 
problem spot (probably never having tested any amount of contention at 
all) is immaterial. We should have had real native rwsemaphores for 
x86-64, and complaining about the fallback sucking under load is kind of 
pointless.

			Linus


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 16:53                             ` Peter Zijlstra
  2010-01-08 17:22                               ` Linus Torvalds
@ 2010-01-09 14:47                               ` Ed Tomlinson
  2010-01-10  5:27                                 ` Nitin Gupta
  1 sibling, 1 reply; 121+ messages in thread
From: Ed Tomlinson @ 2010-01-09 14:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, KAMEZAWA Hiroyuki, Minchan Kim, Paul E. McKenney,
	linux-kernel, linux-mm, cl, hugh.dickins, Nick Piggin,
	Ingo Molnar, Nitin Gupta

On Friday 08 January 2010 11:53:30 Peter Zijlstra wrote:
> On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
> > 
> > On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
> > > > 
> > > > Of course, your other load with MADV_DONTNEED seems to be horrible, and 
> > > > has some nasty spinlock issues, but that looks like a separate deal (I 
> > > > assume that load is just very hard on the pgtable lock).
> > > 
> > > It's zone->lock, I guess. My test program avoids the pgtable lock problem.
> > 
> > Yeah, I should have looked more at your callchain. That's nasty. Much 
> > worse than the per-mm lock. I thought the page buffering would avoid the 
> > zone lock becoming a huge problem, but clearly not in this case.
> 
> Right, so I ran some numbers on a multi-socket (2) machine as well:
> 
>                                pf/min
> 
> -tip                          56398626
> -tip + xadd                  174753190
> -tip + speculative           189274319
> -tip + xadd + speculative    200174641

Has anyone tried these patches with ramzswap?  Nitin, do they help with the locking
issues you mentioned?

Thanks,
Ed

 
> [ variance is around 0.5% for this workload, ran most of these numbers
> with --repeat 5 ]
> 
> At both the xadd/speculative point the workload is dominated by the
> zone->lock, the xadd+speculative removes some of the contention, and
> removing the various RSS counters could yield another few percent
> according to the profiles, but then we're pretty much there.
> 
> One way around those RSS counters is to track them per task; a quick grep
> shows it's only the oom-killer and proc that use them.
> 
> A quick hack removing them gets us: 203158058
> 
> So from a throughput point of view, the whole speculative fault thing might
> not be interesting until the rest of the VM gets a lift to go along with it.
> 
> From a blocking-on-mmap_sem point of view, I think Linus is right in that we
> should first consider things like dropping mmap_sem around IO and page
> zeroing, and generally look at reducing hold times and such.
> 
> So while I think it's quite feasible to do these speculative faults, it
> appears we're not quite ready for them.
> 
> Maybe I can get -rt to carry it for a while; there we have to reduce
> mmap_sem to a mutex, which hurts a lot.
> 
> 


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-08 15:51                 ` Christoph Lameter
@ 2010-01-09 15:55                   ` Arjan van de Ven
  0 siblings, 0 replies; 121+ messages in thread
From: Arjan van de Ven @ 2010-01-09 15:55 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Linus Torvalds, Peter Zijlstra, Paul E. McKenney, Peter Zijlstra,
	KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim,
	hugh.dickins, Nick Piggin, Ingo Molnar

On Fri, 8 Jan 2010 09:51:49 -0600 (CST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 7 Jan 2010, Arjan van de Ven wrote:
> 
> > if an app has to change because our kernel sucks (for no good
> > reason), "change the app" really is the lame type of answer.
> 
> We are changing apps all of the time here to reduce the number of
> system calls. 

It's fine to fix an app for something fundamental.
It sucks to need to fix an app because we decided to clear a page
inside a lock rather than outside it.


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org


^ permalink raw reply	[flat|nested] 121+ messages in thread

* Re: [RFC][PATCH 6/8] mm: handle_speculative_fault()
  2010-01-09 14:47                               ` Ed Tomlinson
@ 2010-01-10  5:27                                 ` Nitin Gupta
  0 siblings, 0 replies; 121+ messages in thread
From: Nitin Gupta @ 2010-01-10  5:27 UTC (permalink / raw)
  To: Ed Tomlinson
  Cc: Peter Zijlstra, Linus Torvalds, KAMEZAWA Hiroyuki, Minchan Kim,
	Paul E. McKenney, linux-kernel, linux-mm, cl, hugh.dickins,
	Nick Piggin, Ingo Molnar

On 01/09/2010 08:17 PM, Ed Tomlinson wrote:
> On Friday 08 January 2010 11:53:30 Peter Zijlstra wrote:
>> On Tue, 2010-01-05 at 20:20 -0800, Linus Torvalds wrote:
>>>
>>> On Wed, 6 Jan 2010, KAMEZAWA Hiroyuki wrote:
>>>>>
>>>>> Of course, your other load with MADV_DONTNEED seems to be horrible, and 
>>>>> has some nasty spinlock issues, but that looks like a separate deal (I 
>>>>> assume that load is just very hard on the pgtable lock).
>>>>
>>>> It's zone->lock, I guess. My test program avoids the pgtable lock problem.
>>>
>>> Yeah, I should have looked more at your callchain. That's nasty. Much 
>>> worse than the per-mm lock. I thought the page buffering would avoid the 
>>> zone lock becoming a huge problem, but clearly not in this case.
>>
>> Right, so I ran some numbers on a multi-socket (2) machine as well:
>>
>>                                pf/min
>>
>> -tip                          56398626
>> -tip + xadd                  174753190
>> -tip + speculative           189274319
>> -tip + xadd + speculative    200174641
> 
> Has anyone tried these patches with ramzswap?  Nitin do they help with the locking
> issues you mentioned?
> 

The locking problem with ramzswap seems completely unrelated to what is being discussed here.

Thanks,
Nitin


^ permalink raw reply	[flat|nested] 121+ messages in thread

end of thread, other threads:[~2010-01-10  5:29 UTC | newest]

Thread overview: 121+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 1/8] mm: Remove pte reference from fault path Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 3/8] mm: Add vma sequence count Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 4/8] mm: RCU free vmas Peter Zijlstra
2010-01-05  2:43   ` Paul E. McKenney
2010-01-05  8:28     ` Peter Zijlstra
2010-01-05 16:05       ` Paul E. McKenney
2010-01-04 18:24 ` [RFC][PATCH 5/8] mm: Speculative pte_map_lock() Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
2010-01-05  0:25   ` KAMEZAWA Hiroyuki
2010-01-05  3:13     ` Linus Torvalds
2010-01-05  8:17       ` Peter Zijlstra
2010-01-05  8:57       ` Peter Zijlstra
2010-01-05 15:34         ` Linus Torvalds
2010-01-05 15:40           ` Al Viro
2010-01-05 16:10             ` Linus Torvalds
2010-01-06 15:41               ` Peter Zijlstra
2010-01-05  9:37       ` Peter Zijlstra
2010-01-05 23:35         ` Linus Torvalds
2010-01-05  4:29     ` Minchan Kim
2010-01-05  4:43       ` KAMEZAWA Hiroyuki
2010-01-05  5:10         ` Linus Torvalds
2010-01-05  5:30           ` KAMEZAWA Hiroyuki
2010-01-05  7:39             ` KAMEZAWA Hiroyuki
2010-01-05 15:26               ` Linus Torvalds
2010-01-05 16:14                 ` Linus Torvalds
2010-01-05 17:25                   ` Andi Kleen
2010-01-05 17:47                     ` Christoph Lameter
2010-01-05 18:00                       ` Andi Kleen
2010-01-05 17:55                     ` Linus Torvalds
2010-01-05 18:13                       ` Christoph Lameter
2010-01-05 18:25                         ` Linus Torvalds
2010-01-05 18:46                           ` Christoph Lameter
2010-01-05 18:56                             ` Linus Torvalds
2010-01-05 19:15                               ` Christoph Lameter
2010-01-05 19:28                                 ` Linus Torvalds
2010-01-05 18:55                           ` Paul E. McKenney
2010-01-05 19:08                             ` Linus Torvalds
2010-01-05 19:23                               ` Paul E. McKenney
2010-01-05 20:29                           ` Peter Zijlstra
2010-01-05 20:46                             ` Linus Torvalds
2010-01-05 21:00                               ` Linus Torvalds
2010-01-05 23:29                             ` Paul E. McKenney
2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
2010-01-06  1:37                   ` Linus Torvalds
2010-01-06  2:52                     ` KAMEZAWA Hiroyuki
2010-01-06  3:27                       ` Linus Torvalds
2010-01-06  3:56                         ` KAMEZAWA Hiroyuki
2010-01-06  4:20                           ` Linus Torvalds
2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
2010-01-06  7:49                               ` Minchan Kim
2010-01-06  9:39                               ` Linus Torvalds
2010-01-07  1:00                                 ` KAMEZAWA Hiroyuki
2010-01-08 16:53                             ` Peter Zijlstra
2010-01-08 17:22                               ` Linus Torvalds
2010-01-08 17:43                                 ` Christoph Lameter
2010-01-08 17:52                                   ` Linus Torvalds
2010-01-08 18:33                                     ` Christoph Lameter
2010-01-08 18:46                                   ` Andi Kleen
2010-01-08 18:56                                     ` Christoph Lameter
2010-01-08 19:10                                       ` Andi Kleen
2010-01-08 19:11                                       ` Linus Torvalds
2010-01-08 19:28                                         ` Andi Kleen
2010-01-08 19:39                                           ` Linus Torvalds
2010-01-08 19:42                                             ` Linus Torvalds
2010-01-08 21:36                                   ` Linus Torvalds
2010-01-08 21:46                                     ` Christoph Lameter
2010-01-08 22:43                                       ` Linus Torvalds
2010-01-09 14:47                               ` Ed Tomlinson
2010-01-10  5:27                                 ` Nitin Gupta
2010-01-05 15:14             ` Christoph Lameter
2010-01-05  8:18           ` Peter Zijlstra
2010-01-05  6:00         ` Minchan Kim
2010-01-05  4:48       ` Linus Torvalds
2010-01-05  6:09         ` Minchan Kim
2010-01-05  6:09           ` KAMEZAWA Hiroyuki
2010-01-05  6:24             ` Minchan Kim
2010-01-05  8:35           ` Peter Zijlstra
2010-01-05 13:45   ` Arjan van de Ven
2010-01-05 14:15     ` Andi Kleen
2010-01-05 15:17     ` Christoph Lameter
2010-01-06  3:22       ` Arjan van de Ven
2010-01-07 16:11         ` Christoph Lameter
2010-01-07 16:19           ` Linus Torvalds
2010-01-07 16:31             ` Linus Torvalds
2010-01-07 16:34             ` Paul E. McKenney
2010-01-07 16:36             ` Christoph Lameter
2010-01-08  4:49               ` Arjan van de Ven
2010-01-08  5:00                 ` Linus Torvalds
2010-01-08 15:51                 ` Christoph Lameter
2010-01-09 15:55                   ` Arjan van de Ven
2010-01-07 17:22             ` Peter Zijlstra
2010-01-07 17:36               ` Linus Torvalds
2010-01-07 17:49                 ` Linus Torvalds
2010-01-07 18:00                   ` Peter Zijlstra
2010-01-07 18:15                     ` Linus Torvalds
2010-01-07 21:49                       ` Peter Zijlstra
2010-01-07 18:44                   ` Linus Torvalds
2010-01-07 19:20                     ` Paul E. McKenney
2010-01-07 20:06                       ` Linus Torvalds
2010-01-07 20:25                         ` Paul E. McKenney
2010-01-07 19:24                     ` Christoph Lameter
2010-01-07 20:08                       ` Linus Torvalds
2010-01-07 20:13                         ` Linus Torvalds
2010-01-07 21:44                     ` Peter Zijlstra
2010-01-07 22:33                       ` Linus Torvalds
2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
2010-01-08  0:25                           ` KAMEZAWA Hiroyuki
2010-01-08  0:39                           ` Linus Torvalds
2010-01-08  0:41                             ` Linus Torvalds
2010-01-07 23:51                 ` Rik van Riel
2010-01-04 18:24 ` [RFC][PATCH 7/8] mm,x86: speculative pagefault support Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 8/8] mm: Optimize pte_map_lock() Peter Zijlstra
2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
2010-01-04 21:46   ` Peter Zijlstra
2010-01-04 23:20     ` Rik van Riel
2010-01-04 21:59   ` Christoph Lameter
2010-01-05  0:28     ` KAMEZAWA Hiroyuki
2010-01-05  2:26 ` Minchan Kim
