From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
Peter Zijlstra <peterz@infradead.org>,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
"linux-mm@kvack.org" <linux-mm@kvack.org>,
"minchan.kim@gmail.com" <minchan.kim@gmail.com>,
cl@linux-foundation.org,
"hugh.dickins" <hugh.dickins@tiscali.co.uk>,
Nick Piggin <nickpiggin@yahoo.com.au>,
Ingo Molnar <mingo@elte.hu>,
Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure
Date: Mon, 04 Jan 2010 19:24:31 +0100
Message-ID: <20100104182813.270919564@chello.nl>
In-Reply-To: <20100104182429.833180340@chello.nl>
Replace the pte_offset_map_lock() calls in the pagefault path with
pte_map_lock(), which can fail when called with FAULT_FLAG_SPECULATIVE
set in .flags. When it fails, we return VM_FAULT_RETRY, meaning the
fault needs to be retried (possibly with mmap_sem held).

This patch adds FAULT_FLAG_SPECULATIVE and VM_FAULT_RETRY, and wires up
the resulting error paths.
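
For reference, each converted call site follows the same pattern (a
minimal sketch, not part of the patch; do_foo_fault() is a made-up
caller, and pte_map_lock() is the helper introduced below, which for
now always succeeds; a later patch in this series teaches it to fail
under FAULT_FLAG_SPECULATIVE):

	static int do_foo_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, unsigned int flags)
	{
		spinlock_t *ptl;
		pte_t *page_table;

		if (!pte_map_lock(mm, vma, address, pmd, flags,
					&page_table, &ptl)) {
			/* only possible when FAULT_FLAG_SPECULATIVE is set */
			return VM_FAULT_RETRY;
		}

		/* ... operate on the pte with ptl held ... */

		pte_unmap_unlock(page_table, ptl);
		return 0;
	}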
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h |    2
 mm/memory.c        |  119 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 33 deletions(-)
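
[ Annotation, not part of the changelog: besides the pte_map_lock()
  conversions, the do_wp_page() hunks below fold the old oom /
  oom_free_new / unwritable_page labels into a single err /
  err_free_new pair, with ret carrying VM_FAULT_OOM, VM_FAULT_RETRY or
  the ->page_mkwrite result.  The resulting exit path, in outline: ]

err_free_new:
	page_cache_release(new_page);
err:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return ret;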
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_SPECULATIVE 0x08 /* Speculative fault, not holding mmap_sem */
/*
* This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -711,6 +712,7 @@ static inline int page_mapped(struct pag
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
+#define VM_FAULT_RETRY 0x0400 /* Speculative fault failed, retry the fault */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1957,6 +1957,14 @@ static inline void cow_user_page(struct
copy_user_highpage(dst, src, va, vma);
}
+static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags,
+ pte_t **ptep, spinlock_t **ptl)
+{
+ *ptep = pte_offset_map_lock(mm, pmd, address, ptl);
+ return 1;
+}
+
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -1977,7 +1985,7 @@ static inline void cow_user_page(struct
*/
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+ spinlock_t *ptl, unsigned int flags, pte_t orig_pte)
{
struct page *old_page, *new_page;
pte_t entry;
@@ -2009,8 +2017,14 @@ static int do_wp_page(struct mm_struct *
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
+
+ if (!pte_map_lock(mm, vma, address, pmd, flags,
+ &page_table, &ptl)) {
+ unlock_page(old_page);
+ ret = VM_FAULT_RETRY;
+ goto err;
+ }
+
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
@@ -2052,14 +2066,14 @@ static int do_wp_page(struct mm_struct *
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
- goto unwritable_page;
+ goto err;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0; /* retry the fault */
unlock_page(old_page);
- goto unwritable_page;
+ goto err;
}
} else
VM_BUG_ON(!PageLocked(old_page));
@@ -2070,8 +2084,13 @@ static int do_wp_page(struct mm_struct *
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags,
+ &page_table, &ptl)) {
+ unlock_page(old_page);
+ ret = VM_FAULT_RETRY;
+ goto err;
+ }
+
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
@@ -2103,17 +2122,23 @@ reuse:
gotten:
pte_unmap_unlock(page_table, ptl);
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto err;
+ }
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);
- if (!new_page)
- goto oom;
+ if (!new_page) {
+ ret = VM_FAULT_OOM;
+ goto err;
+ }
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
- if (!new_page)
- goto oom;
+ if (!new_page) {
+ ret = VM_FAULT_OOM;
+ goto err;
+ }
cow_user_page(new_page, old_page, address, vma);
}
__SetPageUptodate(new_page);
@@ -2128,13 +2153,20 @@ gotten:
unlock_page(old_page);
}
- if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
- goto oom_free_new;
+ if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+ ret = VM_FAULT_OOM;
+ goto err_free_new;
+ }
/*
* Re-check the pte - we dropped the lock
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+ mem_cgroup_uncharge_page(new_page);
+ ret = VM_FAULT_RETRY;
+ goto err_free_new;
+ }
+
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2233,9 +2265,9 @@ unlock:
file_update_time(vma->vm_file);
}
return ret;
-oom_free_new:
+err_free_new:
page_cache_release(new_page);
-oom:
+err:
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
@@ -2243,10 +2275,6 @@ oom:
}
page_cache_release(old_page);
}
- return VM_FAULT_OOM;
-
-unwritable_page:
- page_cache_release(old_page);
return ret;
}
@@ -2496,6 +2524,10 @@ static int do_swap_page(struct mm_struct
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
+ if (flags & FAULT_FLAG_SPECULATIVE) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
migration_entry_wait(mm, pmd, address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
@@ -2516,7 +2548,11 @@ static int do_swap_page(struct mm_struct
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags,
+ &page_table, &ptl)) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2553,7 +2589,11 @@ static int do_swap_page(struct mm_struct
/*
* Back out if somebody else already faulted in this pte.
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+ ret = VM_FAULT_RETRY;
+ goto out_nolock;
+ }
+
if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
@@ -2594,7 +2634,7 @@ static int do_swap_page(struct mm_struct
unlock_page(page);
if (flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, flags, pte);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
@@ -2607,8 +2647,9 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge_swapin(ptr);
pte_unmap_unlock(page_table, ptl);
+out_nolock:
+ mem_cgroup_cancel_charge_swapin(ptr);
out_page:
unlock_page(page);
out_release:
@@ -2631,7 +2672,9 @@ static int do_anonymous_page(struct mm_s
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags,
+ &page_table, &ptl))
+ return VM_FAULT_RETRY;
if (!pte_none(*page_table))
goto unlock;
goto setpte;
@@ -2654,7 +2697,12 @@ static int do_anonymous_page(struct mm_s
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+ mem_cgroup_uncharge_page(page);
+ page_cache_release(page);
+ return VM_FAULT_RETRY;
+ }
+
if (!pte_none(*page_table))
goto release;
@@ -2793,7 +2841,10 @@ static int __do_fault(struct mm_struct *
}
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags, &page_table, &ptl)) {
+ ret = VM_FAULT_RETRY;
+ goto out_uncharge;
+ }
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -2826,7 +2877,10 @@ static int __do_fault(struct mm_struct *
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
+ pte_unmap_unlock(page_table, ptl);
} else {
+ pte_unmap_unlock(page_table, ptl);
+out_uncharge:
if (charged)
mem_cgroup_uncharge_page(page);
if (anon)
@@ -2835,8 +2889,6 @@ static int __do_fault(struct mm_struct *
anon = 1; /* no anon but release faulted_page */
}
- pte_unmap_unlock(page_table, ptl);
-
out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
@@ -2945,13 +2997,14 @@ static inline int handle_pte_fault(struc
pmd, flags, entry);
}
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(mm, vma, address, pmd, flags, &pte, &ptl))
+ return VM_FAULT_RETRY;
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
- pte, pmd, ptl, entry);
+ pte, pmd, ptl, flags, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
--
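
[ For readers following the series: nothing consumes VM_FAULT_RETRY yet.
  Patch 6/8 adds handle_speculative_fault() and patch 7/8 wires it into
  the x86 fault handler; the sketch below only illustrates the intended
  fallback and borrows its names from those later patches, so details
  may differ: ]

	fault = handle_speculative_fault(mm, address, flags);
	if (fault & VM_FAULT_RETRY) {
		/*
		 * The speculative walk raced with a change to the mm;
		 * redo the fault the traditional way, serialized by
		 * mmap_sem.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma(mm, address);
		/* ... the usual good_area checks ... */
		fault = handle_mm_fault(mm, vma, address, flags);
		up_read(&mm->mmap_sem);
	}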