linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Peter Zijlstra <peterz@infradead.org>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"minchan.kim@gmail.com" <minchan.kim@gmail.com>,
	cl@linux-foundation.org,
	"hugh.dickins" <hugh.dickins@tiscali.co.uk>,
	Nick Piggin <nickpiggin@yahoo.com.au>,
	Ingo Molnar <mingo@elte.hu>,
	Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure
Date: Mon, 04 Jan 2010 19:24:31 +0100	[thread overview]
Message-ID: <20100104182813.270919564@chello.nl> (raw)
In-Reply-To: 20100104182429.833180340@chello.nl

[-- Attachment #1: mm-foo-6.patch --]
[-- Type: text/plain, Size: 9926 bytes --]

Replace pte_offset_map_lock() usage in the pagefault path with
pte_map_lock(), which can fail when called with FAULT_FLAG_SPECULATIVE
set in flags. When it fails we return VM_FAULT_RETRY, meaning the
fault needs to be retried (or redone with mmap_sem held). In this
patch pte_map_lock() is still a plain wrapper around
pte_offset_map_lock() that always succeeds.

This patch adds FAULT_FLAG_SPECULATIVE, VM_FAULT_RETRY and the
error paths.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h |    2 
 mm/memory.c        |  119 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 33 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 #define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_SPECULATIVE	0x08
 
 /*
  * This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -711,6 +712,7 @@ static inline int page_mapped(struct pag
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
+#define VM_FAULT_RETRY  0x0400
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1957,6 +1957,14 @@ static inline void cow_user_page(struct 
 		copy_user_highpage(dst, src, va, vma);
 }
 
+static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		pte_t **ptep, spinlock_t **ptl)
+{
+	*ptep = pte_offset_map_lock(mm, pmd, address, ptl);
+	return 1;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1977,7 +1985,7 @@ static inline void cow_user_page(struct 
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+		spinlock_t *ptl, unsigned int flags, pte_t orig_pte)
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
@@ -2009,8 +2017,14 @@ static int do_wp_page(struct mm_struct *
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
 			lock_page(old_page);
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+
+			if (!pte_map_lock(mm, vma, address, pmd, flags,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2052,14 +2066,14 @@ static int do_wp_page(struct mm_struct *
 			if (unlikely(tmp &
 					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				goto unwritable_page;
+				goto err;
 			}
 			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
 				lock_page(old_page);
 				if (!old_page->mapping) {
 					ret = 0; /* retry the fault */
 					unlock_page(old_page);
-					goto unwritable_page;
+					goto err;
 				}
 			} else
 				VM_BUG_ON(!PageLocked(old_page));
@@ -2070,8 +2084,13 @@ static int do_wp_page(struct mm_struct *
 			 * they did, we just return, as we can count on the
 			 * MMU to tell us if they didn't also make it writable.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2103,17 +2122,23 @@ reuse:
 gotten:
 	pte_unmap_unlock(page_table, ptl);
 
-	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
+	if (unlikely(anon_vma_prepare(vma))) {
+		ret = VM_FAULT_OOM;
+		goto err;
+	}
 
 	if (is_zero_pfn(pte_pfn(orig_pte))) {
 		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 		cow_user_page(new_page, old_page, address, vma);
 	}
 	__SetPageUptodate(new_page);
@@ -2128,13 +2153,20 @@ gotten:
 		unlock_page(old_page);
 	}
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
-		goto oom_free_new;
+	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+		ret = VM_FAULT_OOM;
+		goto err_free_new;
+	}
 
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(new_page);
+		ret = VM_FAULT_RETRY;
+		goto err_free_new;
+	}
+
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
@@ -2233,9 +2265,9 @@ unlock:
 			file_update_time(vma->vm_file);
 	}
 	return ret;
-oom_free_new:
+err_free_new:
 	page_cache_release(new_page);
-oom:
+err:
 	if (old_page) {
 		if (page_mkwrite) {
 			unlock_page(old_page);
@@ -2243,10 +2275,6 @@ oom:
 		}
 		page_cache_release(old_page);
 	}
-	return VM_FAULT_OOM;
-
-unwritable_page:
-	page_cache_release(old_page);
 	return ret;
 }
 
@@ -2496,6 +2524,10 @@ static int do_swap_page(struct mm_struct
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
+			if (flags & FAULT_FLAG_SPECULATIVE) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			migration_entry_wait(mm, pmd, address);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
@@ -2516,7 +2548,11 @@ static int do_swap_page(struct mm_struct
 			 * Back out if somebody else faulted in this pte
 			 * while we released the pte lock.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags,
+						&page_table, &ptl)) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2553,7 +2589,11 @@ static int do_swap_page(struct mm_struct
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_nolock;
+	}
+
 	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
 
@@ -2594,7 +2634,7 @@ static int do_swap_page(struct mm_struct
 	unlock_page(page);
 
 	if (flags & FAULT_FLAG_WRITE) {
-		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, flags, pte);
 		if (ret & VM_FAULT_ERROR)
 			ret &= VM_FAULT_ERROR;
 		goto out;
@@ -2607,8 +2647,9 @@ unlock:
 out:
 	return ret;
 out_nomap:
-	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
+out_nolock:
+	mem_cgroup_cancel_charge_swapin(ptr);
 out_page:
 	unlock_page(page);
 out_release:
@@ -2631,7 +2672,9 @@ static int do_anonymous_page(struct mm_s
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
-		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		if (!pte_map_lock(mm, vma, address, pmd, flags,
+					&page_table, &ptl))
+			return VM_FAULT_RETRY;
 		if (!pte_none(*page_table))
 			goto unlock;
 		goto setpte;
@@ -2654,7 +2697,12 @@ static int do_anonymous_page(struct mm_s
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(page);
+		page_cache_release(page);
+		return VM_FAULT_RETRY;
+	}
+
 	if (!pte_none(*page_table))
 		goto release;
 
@@ -2793,7 +2841,10 @@ static int __do_fault(struct mm_struct *
 
 	}
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_uncharge;
+	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -2826,7 +2877,10 @@ static int __do_fault(struct mm_struct *
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
+		pte_unmap_unlock(page_table, ptl);
 	} else {
+		pte_unmap_unlock(page_table, ptl);
+out_uncharge:
 		if (charged)
 			mem_cgroup_uncharge_page(page);
 		if (anon)
@@ -2835,8 +2889,6 @@ static int __do_fault(struct mm_struct *
 			anon = 1; /* no anon but release faulted_page */
 	}
 
-	pte_unmap_unlock(page_table, ptl);
-
 out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
@@ -2945,13 +2997,14 @@ static inline int handle_pte_fault(struc
 					pmd, flags, entry);
 	}
 
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, &pte, &ptl))
+		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+					pte, pmd, ptl, flags, entry);
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2010-01-04 20:49 UTC|newest]

Thread overview: 121+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 1/8] mm: Remove pte reference from fault path Peter Zijlstra
2010-01-04 18:24 ` Peter Zijlstra [this message]
2010-01-04 18:24 ` [RFC][PATCH 3/8] mm: Add vma sequence count Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 4/8] mm: RCU free vmas Peter Zijlstra
2010-01-05  2:43   ` Paul E. McKenney
2010-01-05  8:28     ` Peter Zijlstra
2010-01-05 16:05       ` Paul E. McKenney
2010-01-04 18:24 ` [RFC][PATCH 5/8] mm: Speculative pte_map_lock() Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
2010-01-05  0:25   ` KAMEZAWA Hiroyuki
2010-01-05  3:13     ` Linus Torvalds
2010-01-05  8:17       ` Peter Zijlstra
2010-01-05  8:57       ` Peter Zijlstra
2010-01-05 15:34         ` Linus Torvalds
2010-01-05 15:40           ` Al Viro
2010-01-05 16:10             ` Linus Torvalds
2010-01-06 15:41               ` Peter Zijlstra
2010-01-05  9:37       ` Peter Zijlstra
2010-01-05 23:35         ` Linus Torvalds
2010-01-05  4:29     ` Minchan Kim
2010-01-05  4:43       ` KAMEZAWA Hiroyuki
2010-01-05  5:10         ` Linus Torvalds
2010-01-05  5:30           ` KAMEZAWA Hiroyuki
2010-01-05  7:39             ` KAMEZAWA Hiroyuki
2010-01-05 15:26               ` Linus Torvalds
2010-01-05 16:14                 ` Linus Torvalds
2010-01-05 17:25                   ` Andi Kleen
2010-01-05 17:47                     ` Christoph Lameter
2010-01-05 18:00                       ` Andi Kleen
2010-01-05 17:55                     ` Linus Torvalds
2010-01-05 18:13                       ` Christoph Lameter
2010-01-05 18:25                         ` Linus Torvalds
2010-01-05 18:46                           ` Christoph Lameter
2010-01-05 18:56                             ` Linus Torvalds
2010-01-05 19:15                               ` Christoph Lameter
2010-01-05 19:28                                 ` Linus Torvalds
2010-01-05 18:55                           ` Paul E. McKenney
2010-01-05 19:08                             ` Linus Torvalds
2010-01-05 19:23                               ` Paul E. McKenney
2010-01-05 20:29                           ` Peter Zijlstra
2010-01-05 20:46                             ` Linus Torvalds
2010-01-05 21:00                               ` Linus Torvalds
2010-01-05 23:29                             ` Paul E. McKenney
2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
2010-01-06  1:37                   ` Linus Torvalds
2010-01-06  2:52                     ` KAMEZAWA Hiroyuki
2010-01-06  3:27                       ` Linus Torvalds
2010-01-06  3:56                         ` KAMEZAWA Hiroyuki
2010-01-06  4:20                           ` Linus Torvalds
2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
2010-01-06  7:49                               ` Minchan Kim
2010-01-06  9:39                               ` Linus Torvalds
2010-01-07  1:00                                 ` KAMEZAWA Hiroyuki
2010-01-08 16:53                             ` Peter Zijlstra
2010-01-08 17:22                               ` Linus Torvalds
2010-01-08 17:43                                 ` Christoph Lameter
2010-01-08 17:52                                   ` Linus Torvalds
2010-01-08 18:33                                     ` Christoph Lameter
2010-01-08 18:46                                   ` Andi Kleen
2010-01-08 18:56                                     ` Christoph Lameter
2010-01-08 19:10                                       ` Andi Kleen
2010-01-08 19:11                                       ` Linus Torvalds
2010-01-08 19:28                                         ` Andi Kleen
2010-01-08 19:39                                           ` Linus Torvalds
2010-01-08 19:42                                             ` Linus Torvalds
2010-01-08 21:36                                   ` Linus Torvalds
2010-01-08 21:46                                     ` Christoph Lameter
2010-01-08 22:43                                       ` Linus Torvalds
2010-01-08 22:43                                       ` Linus Torvalds
2010-01-09 14:47                               ` Ed Tomlinson
2010-01-10  5:27                                 ` Nitin Gupta
2010-01-05 15:14             ` Christoph Lameter
2010-01-05  8:18           ` Peter Zijlstra
2010-01-05  6:00         ` Minchan Kim
2010-01-05  4:48       ` Linus Torvalds
2010-01-05  6:09         ` Minchan Kim
2010-01-05  6:09           ` KAMEZAWA Hiroyuki
2010-01-05  6:24             ` Minchan Kim
2010-01-05  8:35           ` Peter Zijlstra
2010-01-05 13:45   ` Arjan van de Ven
2010-01-05 14:15     ` Andi Kleen
2010-01-05 15:17     ` Christoph Lameter
2010-01-06  3:22       ` Arjan van de Ven
2010-01-07 16:11         ` Christoph Lameter
2010-01-07 16:19           ` Linus Torvalds
2010-01-07 16:31             ` Linus Torvalds
2010-01-07 16:34             ` Paul E. McKenney
2010-01-07 16:36             ` Christoph Lameter
2010-01-08  4:49               ` Arjan van de Ven
2010-01-08  5:00                 ` Linus Torvalds
2010-01-08 15:51                 ` Christoph Lameter
2010-01-09 15:55                   ` Arjan van de Ven
2010-01-07 17:22             ` Peter Zijlstra
2010-01-07 17:36               ` Linus Torvalds
2010-01-07 17:49                 ` Linus Torvalds
2010-01-07 18:00                   ` Peter Zijlstra
2010-01-07 18:15                     ` Linus Torvalds
2010-01-07 21:49                       ` Peter Zijlstra
2010-01-07 18:44                   ` Linus Torvalds
2010-01-07 19:20                     ` Paul E. McKenney
2010-01-07 20:06                       ` Linus Torvalds
2010-01-07 20:25                         ` Paul E. McKenney
2010-01-07 19:24                     ` Christoph Lameter
2010-01-07 20:08                       ` Linus Torvalds
2010-01-07 20:13                         ` Linus Torvalds
2010-01-07 21:44                     ` Peter Zijlstra
2010-01-07 22:33                       ` Linus Torvalds
2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
2010-01-08  0:25                           ` KAMEZAWA Hiroyuki
2010-01-08  0:39                           ` Linus Torvalds
2010-01-08  0:41                             ` Linus Torvalds
2010-01-07 23:51                 ` Rik van Riel
2010-01-04 18:24 ` [RFC][PATCH 7/8] mm,x86: speculative pagefault support Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 8/8] mm: Optimize pte_map_lock() Peter Zijlstra
2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
2010-01-04 21:46   ` Peter Zijlstra
2010-01-04 23:20     ` Rik van Riel
2010-01-04 21:59   ` Christoph Lameter
2010-01-05  0:28     ` KAMEZAWA Hiroyuki
2010-01-05  2:26 ` Minchan Kim

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100104182813.270919564@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=cl@linux-foundation.org \
    --cc=hugh.dickins@tiscali.co.uk \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan.kim@gmail.com \
    --cc=mingo@elte.hu \
    --cc=nickpiggin@yahoo.com.au \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=peterz@infradead.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).