All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] fix swapin race condition
@ 2010-09-03 15:39 Andrea Arcangeli
  2010-09-03 20:02 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Andrea Arcangeli @ 2010-09-03 15:39 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, linux-mm

From: Andrea Arcangeli <aarcange@redhat.com>

The pte_same check is reliable only if the swap entry remains pinned
(by the page lock on swapcache). We've also to ensure the swapcache
isn't removed before we take the lock as try_to_free_swap won't care
about the page pin.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,6 +16,9 @@
 struct stable_node;
 struct mem_cgroup;
 
+struct page *ksm_does_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address);
+
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
@@ -70,19 +73,14 @@ static inline void set_page_stable_node(
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-struct page *ksm_does_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address);
-static inline struct page *ksm_might_need_to_copy(struct page *page,
+static inline int ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address)
 {
 	struct anon_vma *anon_vma = page_anon_vma(page);
 
-	if (!anon_vma ||
-	    (anon_vma->root == vma->anon_vma->root &&
-	     page->index == linear_page_index(vma, address)))
-		return page;
-
-	return ksm_does_need_to_copy(page, vma, address);
+	return anon_vma &&
+		(anon_vma->root != vma->anon_vma->root ||
+		 page->index != linear_page_index(vma, address));
 }
 
 int page_referenced_ksm(struct page *page,
@@ -115,10 +113,10 @@ static inline int ksm_madvise(struct vm_
 	return 0;
 }
 
-static inline struct page *ksm_might_need_to_copy(struct page *page,
+static inline int ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address)
 {
-	return page;
+	return 0;
 }
 
 static inline int page_referenced_ksm(struct page *page,
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1518,8 +1518,6 @@ struct page *ksm_does_need_to_copy(struc
 {
 	struct page *new_page;
 
-	unlock_page(page);	/* any racers will COW it, not modify it */
-
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
@@ -1535,7 +1533,6 @@ struct page *ksm_does_need_to_copy(struc
 			add_page_to_unevictable_list(new_page);
 	}
 
-	page_cache_release(page);
 	return new_page;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2616,7 +2616,7 @@ static int do_swap_page(struct mm_struct
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page;
+	struct page *page, *swapcache = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	struct mem_cgroup *ptr = NULL;
@@ -2671,10 +2671,23 @@ static int do_swap_page(struct mm_struct
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-	page = ksm_might_need_to_copy(page, vma, address);
-	if (!page) {
-		ret = VM_FAULT_OOM;
-		goto out;
+	/*
+	 * Make sure try_to_free_swap didn't release the swapcache
+	 * from under us. The page pin isn't enough to prevent that.
+	 */
+	if (unlikely(!PageSwapCache(page)))
+		goto out_page;
+
+	if (ksm_might_need_to_copy(page, vma, address)) {
+		swapcache = page;
+		page = ksm_does_need_to_copy(page, vma, address);
+
+		if (unlikely(!page)) {
+			ret = VM_FAULT_OOM;
+			page = swapcache;
+			swapcache = NULL;
+			goto out_page;
+		}
 	}
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2725,6 +2738,18 @@ static int do_swap_page(struct mm_struct
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
+	if (swapcache) {
+		/*
+		 * Hold the lock to avoid the swap entry to be reused
+		 * until we take the PT lock for the pte_same() check
+		 * (to avoid false positives from pte_same). For
+		 * further safety release the lock after the swap_free
+		 * so that the swap count won't change under a
+		 * parallel locked swapcache.
+		 */
+		unlock_page(swapcache);
+		page_cache_release(swapcache);
+	}
 
 	if (flags & FAULT_FLAG_WRITE) {
 		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2746,6 +2771,10 @@ out_page:
 	unlock_page(page);
 out_release:
 	page_cache_release(page);
+	if (swapcache) {
+		unlock_page(swapcache);
+		page_cache_release(swapcache);
+	}
 	return ret;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-03 15:39 [PATCH] fix swapin race condition Andrea Arcangeli
@ 2010-09-03 20:02 ` Andrew Morton
  2010-09-04 12:29   ` Andrea Arcangeli
  2010-09-03 23:57 ` Rik van Riel
  2010-09-06  2:35 ` Hugh Dickins
  2 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2010-09-03 20:02 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Hugh Dickins, linux-mm

On Fri, 3 Sep 2010 17:39:58 +0200
Andrea Arcangeli <aarcange@redhat.com> wrote:

> The pte_same check is reliable only if the swap entry remains pinned
> (by the page lock on swapcache). We've also to ensure the swapcache
> isn't removed before we take the lock as try_to_free_swap won't care
> about the page pin.

What were the end-user-observeable effects of this bug?

Do we think the fix should be backported into earlier kernels?

Thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-03 15:39 [PATCH] fix swapin race condition Andrea Arcangeli
  2010-09-03 20:02 ` Andrew Morton
@ 2010-09-03 23:57 ` Rik van Riel
  2010-09-06  2:35 ` Hugh Dickins
  2 siblings, 0 replies; 23+ messages in thread
From: Rik van Riel @ 2010-09-03 23:57 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, Hugh Dickins, linux-mm

On 09/03/2010 11:39 AM, Andrea Arcangeli wrote:
> From: Andrea Arcangeli<aarcange@redhat.com>
>
> The pte_same check is reliable only if the swap entry remains pinned
> (by the page lock on swapcache). We've also to ensure the swapcache
> isn't removed before we take the lock as try_to_free_swap won't care
> about the page pin.
>
> Signed-off-by: Andrea Arcangeli<aarcange@redhat.com>

Reviewed-by: Rik van Riel <riel@redhat.com>

Andrew, one of the possible impacts of this patch is that a
KSM-shared page can point to the anon_vma of another process,
which could exit before the page is freed.

This can leave a page with a pointer to a recycled anon_vma
object, or worse, a pointer to something that is no longer
an anon_vma.

Backporting this patch to -stable is worthwhile, IMHO.

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-03 20:02 ` Andrew Morton
@ 2010-09-04 12:29   ` Andrea Arcangeli
  0 siblings, 0 replies; 23+ messages in thread
From: Andrea Arcangeli @ 2010-09-04 12:29 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, linux-mm

On Fri, Sep 03, 2010 at 01:02:59PM -0700, Andrew Morton wrote:
> On Fri, 3 Sep 2010 17:39:58 +0200
> Andrea Arcangeli <aarcange@redhat.com> wrote:
> 
> > The pte_same check is reliable only if the swap entry remains pinned
> > (by the page lock on swapcache). We've also to ensure the swapcache
> > isn't removed before we take the lock as try_to_free_swap won't care
> > about the page pin.
> 
> What were the end-user-observeable effects of this bug?

I found it by code review only.

> Do we think the fix should be backported into earlier kernels?

Considering I didn't see user-observable effects it's up to you.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-03 15:39 [PATCH] fix swapin race condition Andrea Arcangeli
  2010-09-03 20:02 ` Andrew Morton
  2010-09-03 23:57 ` Rik van Riel
@ 2010-09-06  2:35 ` Hugh Dickins
  2010-09-15 23:02   ` Hugh Dickins
  2 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2010-09-06  2:35 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, Rik van Riel, linux-mm

On Fri, 3 Sep 2010, Andrea Arcangeli wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> The pte_same check is reliable only if the swap entry remains pinned
> (by the page lock on swapcache). We've also to ensure the swapcache
> isn't removed before we take the lock as try_to_free_swap won't care
> about the page pin.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>

Acked-by: Hugh Dickins <hughd@google.com>

Yes, it's a great little find, and long predates the KSM hooks you've
had to adjust.  It does upset me (aesthetically) that the KSM case now
intrudes into do_swap_swap() much more than it used to; but I have not
come up with a better solution, so yes, let's go forward with this.

Thanks,
Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-06  2:35 ` Hugh Dickins
@ 2010-09-15 23:02   ` Hugh Dickins
  2010-09-15 23:42     ` Andrea Arcangeli
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2010-09-15 23:02 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Sun, 5 Sep 2010, Hugh Dickins wrote:
> On Fri, 3 Sep 2010, Andrea Arcangeli wrote:
> > From: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > The pte_same check is reliable only if the swap entry remains pinned
> > (by the page lock on swapcache). We've also to ensure the swapcache
> > isn't removed before we take the lock as try_to_free_swap won't care
> > about the page pin.
> > 
> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> 
> Acked-by: Hugh Dickins <hughd@google.com>
> 
> Yes, it's a great little find, and long predates the KSM hooks you've
> had to adjust.  It does upset me (aesthetically) that the KSM case now
> intrudes into do_swap_swap() much more than it used to; but I have not
> come up with a better solution, so yes, let's go forward with this.

I had an afterthought, something I've not thought through fully, but am
reminded of by Greg's mail for stable: is your patch incomplete?  Just
as it's very unlikely but conceivable the pte_same() test is inadequate,
isn't the PageSwapCache() test you've added to do_swap_page() inadequate?
Doesn't it need a "page_private(page) == entry.val" test too?

Just as it's conceivable that the same swap has got reused (either via
try_to_free_swap or via swapoff+swapon) for a COWed version of the page
in that pte slot meanwhile, isn't it conceivable that the page we hold
while waiting for pagelock, has got freed from swap then reallocated to
elsewhere on swap meanwhile?  Which, together with your scenario (and I
suspect the two unlikelihoods are not actually to be multiplied), would
still lead to the wrong result, unless we add the further test.

I apologize if I'm holding up your stable fix, and just vying against
you in a "world's most unlikely VM race" competition, but I don't at
present see what prevents this variant.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-15 23:02   ` Hugh Dickins
@ 2010-09-15 23:42     ` Andrea Arcangeli
  2010-09-16  0:10       ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Andrea Arcangeli @ 2010-09-15 23:42 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Wed, Sep 15, 2010 at 04:02:24PM -0700, Hugh Dickins wrote:
> I had an afterthought, something I've not thought through fully, but am
> reminded of by Greg's mail for stable: is your patch incomplete?  Just
> as it's very unlikely but conceivable the pte_same() test is inadequate,
> isn't the PageSwapCache() test you've added to do_swap_page() inadequate?
> Doesn't it need a "page_private(page) == entry.val" test too?
> 
> Just as it's conceivable that the same swap has got reused (either via
> try_to_free_swap or via swapoff+swapon) for a COWed version of the page
> in that pte slot meanwhile, isn't it conceivable that the page we hold

Yes, before the fix, the page could be removed from swapcache despite
being pinned, and the cow copy could reuse the same swap entry and be
unmapped again so breaking the pte_same check.

> while waiting for pagelock, has got freed from swap then reallocated to
> elsewhere on swap meanwhile?  Which, together with your scenario (and I
> suspect the two unlikelihoods are not actually to be multiplied), would
> still lead to the wrong result, unless we add the further test.

For this to happen the page would need to be removed from swapcache
and then added back to swapcache to a different swap entry. But if
that happens the "page_table" pointer would be set to a different swap
entry too, so failing the pte_same check. If the swap entry of the
page changes the page_table will change with it, so the pte_same check
will fail in the first place, so at first glance it looks like
checking the page_private isn't necessary and the pte_same check on
the page_table is enough.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-15 23:42     ` Andrea Arcangeli
@ 2010-09-16  0:10       ` Hugh Dickins
  2010-09-16 21:03         ` Andrea Arcangeli
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2010-09-16  0:10 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Thu, 16 Sep 2010, Andrea Arcangeli wrote:
> On Wed, Sep 15, 2010 at 04:02:24PM -0700, Hugh Dickins wrote:
> > I had an afterthought, something I've not thought through fully, but am
> > reminded of by Greg's mail for stable: is your patch incomplete?  Just
> > as it's very unlikely but conceivable the pte_same() test is inadequate,
> > isn't the PageSwapCache() test you've added to do_swap_page() inadequate?
> > Doesn't it need a "page_private(page) == entry.val" test too?
> > 
> > Just as it's conceivable that the same swap has got reused (either via
> > try_to_free_swap or via swapoff+swapon) for a COWed version of the page
> > in that pte slot meanwhile, isn't it conceivable that the page we hold
> 
> Yes, before the fix, the page could be removed from swapcache despite
> being pinned, and the cow copy could reuse the same swap entry and be
> unmapped again so breaking the pte_same check.
> 
> > while waiting for pagelock, has got freed from swap then reallocated to
> > elsewhere on swap meanwhile?  Which, together with your scenario (and I
> > suspect the two unlikelihoods are not actually to be multiplied), would
> > still lead to the wrong result, unless we add the further test.
> 
> For this to happen the page would need to be removed from swapcache
> and then added back to swapcache to a different swap entry. But if
> that happens the "page_table" pointer would be set to a different swap
> entry too, so failing the pte_same check. If the swap entry of the
> page changes the page_table will change with it, so the pte_same check
> will fail in the first place, so at first glance it looks like
> checking the page_private isn't necessary and the pte_same check on
> the page_table is enough.

I agree that if my scenario happened on its own, the pte_same check
would catch it.  But if my scenario happens along with your scenario
(and I'm thinking that the combination is not that much less likely
than either alone), then the PageSwapCache test will succeed and the
pte_same test will succeed, but we're still putting the wrong page into
the pte, since this page is now represented by a different swap entry
(and the page that should be there by our original swap entry).

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-16  0:10       ` Hugh Dickins
@ 2010-09-16 21:03         ` Andrea Arcangeli
  2010-09-16 21:08           ` Andrea Arcangeli
  2010-09-17  2:31           ` Hugh Dickins
  0 siblings, 2 replies; 23+ messages in thread
From: Andrea Arcangeli @ 2010-09-16 21:03 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

Hi Hugh,

On Wed, Sep 15, 2010 at 05:10:36PM -0700, Hugh Dickins wrote:
> I agree that if my scenario happened on its own, the pte_same check
> would catch it.  But if my scenario happens along with your scenario
> (and I'm thinking that the combination is not that much less likely
> than either alone), then the PageSwapCache test will succeed and the
> pte_same test will succeed, but we're still putting the wrong page into
> the pte, since this page is now represented by a different swap entry
> (and the page that should be there by our original swap entry).

If I understood well you're saying that it is possible that this
BUG_ON triggers:

   page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
   BUG_ON(page_private(page) != entry.val && pte_same(page_table, orig_pte));
   if (unlikely(!pte_same(*page_table, orig_pte)))

I still don't get it (that doesn't make me right though).

I'll try to rephrase my argument: if the page was swapped in from
swapcache by swapoff and then swapon runs again and the page is added
to swapcache to a different swap entry, in between the
lookup_swap_cache and the lock_page, the pte_same(*page_table,
orig_pte) in pte_same should always fail in the first place (so
without requiring the page_private(page) != entry.val check).

If the page is found mapped during pte_same the pte_same check will
fail (pte_present first of all). If the page got unmapped and
page_private(page) != entry.val, the "entry" == "orig_pte" will be
different to what we read in *page_table at the above BUG_ON line (the
page has to be unmapped before pte_same check can succeed, but if gets
unmapped the new swap entry will be written in the page_table and it
won't risk to succeed the pte_same check).

If the page wasn't mapped when it was removed from swapcache, it can't
be added to swapcache at all because it was pinned: because only free
pages (during swapin) or mapped pages (during swapout) can be added to
swapcache.

If I'm missing something a trace of the exact scenario would help to
clarify your point.

Thanks!
Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-16 21:03         ` Andrea Arcangeli
@ 2010-09-16 21:08           ` Andrea Arcangeli
  2010-09-17  2:31           ` Hugh Dickins
  1 sibling, 0 replies; 23+ messages in thread
From: Andrea Arcangeli @ 2010-09-16 21:08 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Thu, Sep 16, 2010 at 11:03:49PM +0200, Andrea Arcangeli wrote:
> If I'm missing something a trace of the exact scenario would help to
> clarify your point.

Also supposing I'm right, I wouldn't mind if you add a
VM_BUG_ON(page_private(page) != entry.val) in the "pte_same == true"
path, just to be sure the invariant page_private(page) ==
pte_to_swp_entry(*page_table) doesn't ever break in a unnoticed way in
the future (I think it's unlikely to ever break but a VM_BUG_ON is
zero cost in production and it'll clarify the invariant). If instead
I'm wrong then just ahead fix it and I'll ack :).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-16 21:03         ` Andrea Arcangeli
  2010-09-16 21:08           ` Andrea Arcangeli
@ 2010-09-17  2:31           ` Hugh Dickins
  2010-09-18 13:19             ` Andrea Arcangeli
  1 sibling, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2010-09-17  2:31 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Thu, 16 Sep 2010, Andrea Arcangeli wrote:
> On Wed, Sep 15, 2010 at 05:10:36PM -0700, Hugh Dickins wrote:
> > I agree that if my scenario happened on its own, the pte_same check
> > would catch it.  But if my scenario happens along with your scenario
> > (and I'm thinking that the combination is not that much less likely
> > than either alone), then the PageSwapCache test will succeed and the
> > pte_same test will succeed, but we're still putting the wrong page into
> > the pte, since this page is now represented by a different swap entry
> > (and the page that should be there by our original swap entry).
> 
> If I understood well you're saying that it is possible that this
> BUG_ON triggers:
> 
>    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
>    BUG_ON(page_private(page) != entry.val && pte_same(*page_table, orig_pte));
>    if (unlikely(!pte_same(*page_table, orig_pte)))

Yes, I believe so.

> 
> I still don't get it (that doesn't make me right though).
> 
> I'll try to rephrase my argument: if the page was swapped in from
> swapcache by swapoff and then swapon runs again and the page is added
> to swapcache to a different swap entry, in between the
> lookup_swap_cache and the lock_page, the pte_same(*page_table,
> orig_pte) in pte_same should always fail in the first place (so
> without requiring the page_private(page) != entry.val check).

Usually yes, but more may have happened in between.

> 
> If the page is found mapped during pte_same the pte_same check will
> fail (pte_present first of all). If the page got unmapped and
> page_private(page) != entry.val, the "entry" == "orig_pte" will be
> different to what we read in *page_table at the above BUG_ON line (the
> page has to be unmapped before pte_same check can succeed, but if gets
> unmapped the new swap entry will be written in the page_table and it
> won't risk to succeed the pte_same check).

Usually yes, but not necessarily.

> 
> If the page wasn't mapped when it was removed from swapcache, it can't
> be added to swapcache at all because it was pinned: because only free
> pages (during swapin) or mapped pages (during swapout) can be added to
> swapcache.

Yes, I think that happens to be the case, but does not rule out my
scenario.  Perhaps there's a page_count test that I've overlooked
that makes my scenario impossible, but is_page_cache_freeable()
appears to prevent writeout without affecting swap allocation.

> 
> If I'm missing something a trace of the exact scenario would help to
> clarify your point.

Indeed yes: I was being lazy, hoping to get you to do my thinking
for me (in my defence, and in your praise, I have to say that that
is usually much the quickest strategy :-)  Thank you for the time
you've spent on it, when I should have tried harder.

Here's what I think can happen: you may shame me by shooting it down
immediately, but go ahead!

I've cast it in terms of reuse_swap_page(), but I expect it could be
reformulated to rely on try_to_free_swap() instead, or swapoff+swapon.


A, in do_swap_page(): does page1 = lookup_swap_cache(swap1)
and comes through the lock_page(page1).

B, a racing thread of same process, also faults into do_swap_page():
does page1 = lookup_swap_cache(swap1) and now waits in lock_page(page1),
but for whatever reason is unlucky not to get the lock any time soon.

A carries on through do_swap_page(), a write fault, but cannot reuse
the swap page1 (another reference to swap1).  Unlocks the page1 (but B
doesn't get it yet), does COW in do_wp_page(), page2 now in that pte.

C, perhaps the parent of A+B, comes in and write faults the same swap
page1 into its mm, reuse_swap_page() succeeds this time, swap1 is freed.

kswapd comes in after some time (B still unlucky) and swaps out some
pages from A+B and C: it allocates the original swap1 to page2 in A+B,
and some other swap2 to the original page1 now in C.  But does not
immediately free page1 (actually it couldn't: B holds a reference),
leaving it in swap cache for now.

B at last gets the lock on page1, hooray!  Is PageSwapCache(page1)?
Yes.  Is pte_same(*page_table, orig_pte)?  Yes, because page2 has
now been given the swap1 which page1 used to have.  So B proceeds
to insert page1 into A+B's page_table, though its content now
belongs to C, quite different from what A wrote there.

B ought to have checked that page1's swap was still swap1.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-17  2:31           ` Hugh Dickins
@ 2010-09-18 13:19             ` Andrea Arcangeli
  2010-09-20  2:35               ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Andrea Arcangeli @ 2010-09-18 13:19 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Thu, Sep 16, 2010 at 07:31:57PM -0700, Hugh Dickins wrote:
> Indeed yes: I was being lazy, hoping to get you to do my thinking
> for me (in my defence, and in your praise, I have to say that that
> is usually much the quickest strategy :-)  Thank you for the time
> you've spent on it, when I should have tried harder.
> 
> Here's what I think can happen: you may shame me by shooting it down
> immediately, but go ahead!

Can't shoot it. This definitely helped. My previous scenario only
involved threads, so I was only thinking at threads...

> I've cast it in terms of reuse_swap_page(), but I expect it could be
> reformulated to rely on try_to_free_swap() instead, or swapoff+swapon.

It's actually better to formulate in reuse_swap_page terms as it
doesn't require swapoff to trigger.

> A, in do_swap_page(): does page1 = lookup_swap_cache(swap1)
> and comes through the lock_page(page1).
> 
> B, a racing thread of same process, also faults into do_swap_page():
> does page1 = lookup_swap_cache(swap1) and now waits in lock_page(page1),
> but for whatever reason is unlucky not to get the lock any time soon.
> 
> A carries on through do_swap_page(), a write fault, but cannot reuse
> the swap page1 (another reference to swap1).  Unlocks the page1 (but B
> doesn't get it yet), does COW in do_wp_page(), page2 now in that pte.
> 
> C, perhaps the parent of A+B, comes in and write faults the same swap
> page1 into its mm, reuse_swap_page() succeeds this time, swap1 is freed.

The key is C mm is different from the A/B mm. If C was sharing the
same mm (as in the scenario I was thinking of with threads)
reuse_swap_cache couldn't run in C because the pte_same check would
fail.

> kswapd comes in after some time (B still unlucky) and swaps out some
> pages from A+B and C: it allocates the original swap1 to page2 in A+B,
> and some other swap2 to the original page1 now in C.  But does not
> immediately free page1 (actually it couldn't: B holds a reference),
> leaving it in swap cache for now.
> 
> B at last gets the lock on page1, hooray!  Is PageSwapCache(page1)?
> Yes.  Is pte_same(*page_table, orig_pte)?  Yes, because page2 has
> now been given the swap1 which page1 used to have.  So B proceeds
> to insert page1 into A+B's page_table, though its content now
> belongs to C, quite different from what A wrote there.
> 
> B ought to have checked that page1's swap was still swap1.

I suggest adding the explanation to the patch comment.

Thanks!
Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-09-18 13:19             ` Andrea Arcangeli
@ 2010-09-20  2:35               ` Hugh Dickins
  2010-09-20  2:40                   ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2010-09-20  2:35 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Greg KH, Andrew Morton, Rik van Riel, linux-mm

On Sat, 18 Sep 2010, Andrea Arcangeli wrote:
> On Thu, Sep 16, 2010 at 07:31:57PM -0700, Hugh Dickins wrote:
> > 
> > Here's what I think can happen: you may shame me by shooting it down
> > immediately, but go ahead!
> 
> Can't shoot it. This definitely helped. My previous scenario only
> involved threads, so I was only thinking at threads...

Thanks a lot for going through it.

> > B ought to have checked that page1's swap was still swap1.
> 
> I suggest adding the explanation to the patch comment.

Absolutely: patch to Linus follows.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] mm: further fix swapin race condition
  2010-09-20  2:35               ` Hugh Dickins
@ 2010-09-20  2:40                   ` Hugh Dickins
  0 siblings, 0 replies; 23+ messages in thread
From: Hugh Dickins @ 2010-09-20  2:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Greg KH, Andrew Morton, Rik van Riel, linux-mm,
	linux-kernel

Commit 4969c1192d15afa3389e7ae3302096ff684ba655 "mm: fix swapin race condition"
is now agreed to be incomplete.  There's a race, not very much less likely
than the original race envisaged, in which it is further necessary to check
that the swapcache page's swap has not changed.

Here's the reasoning: cast in terms of reuse_swap_page(), but probably could
be reformulated to rely on try_to_free_swap() instead, or on swapoff+swapon.

A, faults into do_swap_page(): does page1 = lookup_swap_cache(swap1)
and comes through the lock_page(page1).

B, a racing thread of the same process, faults on the same address:
does page1 = lookup_swap_cache(swap1) and now waits in lock_page(page1),
but for whatever reason is unlucky not to get the lock any time soon.

A carries on through do_swap_page(), a write fault, but cannot reuse
the swap page1 (another reference to swap1).  Unlocks the page1 (but B
doesn't get it yet), does COW in do_wp_page(), page2 now in that pte.

C, perhaps the parent of A+B, comes in and write faults the same swap
page1 into its mm, reuse_swap_page() succeeds this time, swap1 is freed.

kswapd comes in after some time (B still unlucky) and swaps out some
pages from A+B and C: it allocates the original swap1 to page2 in A+B,
and some other swap2 to the original page1 now in C.  But does not
immediately free page1 (actually it couldn't: B holds a reference),
leaving it in swap cache for now.

B at last gets the lock on page1, hooray!  Is PageSwapCache(page1)?
Yes.  Is pte_same(*page_table, orig_pte)?  Yes, because page2 has
now been given the swap1 which page1 used to have.  So B proceeds
to insert page1 into A+B's page_table, though its content now
belongs to C, quite different from what A wrote there.

B ought to have checked that page1's swap was still swap1.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
---

 mm/memory.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

--- 2.6.36-rc4/mm/memory.c	2010-09-12 17:34:03.000000000 -0700
+++ linux/mm/memory.c	2010-09-19 18:23:43.000000000 -0700
@@ -2680,10 +2680,12 @@ static int do_swap_page(struct mm_struct
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
 	/*
-	 * Make sure try_to_free_swap didn't release the swapcache
-	 * from under us. The page pin isn't enough to prevent that.
+	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+	 * release the swapcache from under us.  The page pin, and pte_same
+	 * test below, are not enough to exclude that.  Even if it is still
+	 * swapcache, we need to check that the page's swap has not changed.
 	 */
-	if (unlikely(!PageSwapCache(page)))
+	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
 		goto out_page;
 
 	if (ksm_might_need_to_copy(page, vma, address)) {

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] mm: further fix swapin race condition
@ 2010-09-20  2:40                   ` Hugh Dickins
  0 siblings, 0 replies; 23+ messages in thread
From: Hugh Dickins @ 2010-09-20  2:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Greg KH, Andrew Morton, Rik van Riel, linux-mm,
	linux-kernel

Commit 4969c1192d15afa3389e7ae3302096ff684ba655 "mm: fix swapin race condition"
is now agreed to be incomplete.  There's a race, not very much less likely
than the original race envisaged, in which it is further necessary to check
that the swapcache page's swap has not changed.

Here's the reasoning: cast in terms of reuse_swap_page(), but probably could
be reformulated to rely on try_to_free_swap() instead, or on swapoff+swapon.

A, faults into do_swap_page(): does page1 = lookup_swap_cache(swap1)
and comes through the lock_page(page1).

B, a racing thread of the same process, faults on the same address:
does page1 = lookup_swap_cache(swap1) and now waits in lock_page(page1),
but for whatever reason is unlucky not to get the lock any time soon.

A carries on through do_swap_page(), a write fault, but cannot reuse
the swap page1 (another reference to swap1).  Unlocks the page1 (but B
doesn't get it yet), does COW in do_wp_page(), page2 now in that pte.

C, perhaps the parent of A+B, comes in and write faults the same swap
page1 into its mm, reuse_swap_page() succeeds this time, swap1 is freed.

kswapd comes in after some time (B still unlucky) and swaps out some
pages from A+B and C: it allocates the original swap1 to page2 in A+B,
and some other swap2 to the original page1 now in C.  But does not
immediately free page1 (actually it couldn't: B holds a reference),
leaving it in swap cache for now.

B at last gets the lock on page1, hooray!  Is PageSwapCache(page1)?
Yes.  Is pte_same(*page_table, orig_pte)?  Yes, because page2 has
now been given the swap1 which page1 used to have.  So B proceeds
to insert page1 into A+B's page_table, though its content now
belongs to C, quite different from what A wrote there.

B ought to have checked that page1's swap was still swap1.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
---

 mm/memory.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

--- 2.6.36-rc4/mm/memory.c	2010-09-12 17:34:03.000000000 -0700
+++ linux/mm/memory.c	2010-09-19 18:23:43.000000000 -0700
@@ -2680,10 +2680,12 @@ static int do_swap_page(struct mm_struct
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
 	/*
-	 * Make sure try_to_free_swap didn't release the swapcache
-	 * from under us. The page pin isn't enough to prevent that.
+	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+	 * release the swapcache from under us.  The page pin, and pte_same
+	 * test below, are not enough to exclude that.  Even if it is still
+	 * swapcache, we need to check that the page's swap has not changed.
 	 */
-	if (unlikely(!PageSwapCache(page)))
+	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
 		goto out_page;
 
 	if (ksm_might_need_to_copy(page, vma, address)) {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: further fix swapin race condition
  2010-09-20  2:40                   ` Hugh Dickins
@ 2010-09-20  3:09                     ` Rik van Riel
  -1 siblings, 0 replies; 23+ messages in thread
From: Rik van Riel @ 2010-09-20  3:09 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Andrea Arcangeli, Greg KH, Andrew Morton,
	linux-mm, linux-kernel

On 09/19/2010 10:40 PM, Hugh Dickins wrote:
> Commit 4969c1192d15afa3389e7ae3302096ff684ba655 "mm: fix swapin race condition"
> is now agreed to be incomplete.  There's a race, not very much less likely
> than the original race envisaged, in which it is further necessary to check
> that the swapcache page's swap has not changed.

> B ought to have checked that page1's swap was still swap1.
>
> Signed-off-by: Hugh Dickins<hughd@google.com>
> Cc: stable@kernel.org

Reviewed-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: further fix swapin race condition
@ 2010-09-20  3:09                     ` Rik van Riel
  0 siblings, 0 replies; 23+ messages in thread
From: Rik van Riel @ 2010-09-20  3:09 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Linus Torvalds, Andrea Arcangeli, Greg KH, Andrew Morton,
	linux-mm, linux-kernel

On 09/19/2010 10:40 PM, Hugh Dickins wrote:
> Commit 4969c1192d15afa3389e7ae3302096ff684ba655 "mm: fix swapin race condition"
> is now agreed to be incomplete.  There's a race, not very much less likely
> than the original race envisaged, in which it is further necessary to check
> that the swapcache page's swap has not changed.

> B ought to have checked that page1's swap was still swap1.
>
> Signed-off-by: Hugh Dickins<hughd@google.com>
> Cc: stable@kernel.org

Reviewed-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-07-13  1:08   ` Andrea Arcangeli
@ 2010-07-13 21:30     ` Hugh Dickins
  0 siblings, 0 replies; 23+ messages in thread
From: Hugh Dickins @ 2010-07-13 21:30 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Marcelo Tosatti, Rik van Riel

On Tue, 13 Jul 2010, Andrea Arcangeli wrote:
> On Fri, Jul 09, 2010 at 01:32:45PM -0700, Hugh Dickins wrote:
> > It is well established that by the time lookup_swap_cache() returns,
> > the page it returns may already have been removed from swapcache:
> > yes, you have to get page lock to be sure.  Long ago I put a comment 
> > on that into lookup_swap_cache(), but it fell out of the 2.6 version
> > when we briefly changed how that one worked.
> 
> I found it by code review only, so it was unexpected to me. I assume
> it was unexpected to do_swap_cache as well if you agree this is a bug
> too and it could trigger in theory.

I _was_ disagreeing with you that there's such a bug in do_swap_page(),
beyond the recent KSM case you discovered and first drew attention to.
But you should have argued with me, when I questioned the need for the
extra PageSwapCache test you add to do_swap_page().

Thinking it over again, I now see what I think you moved on to saying
after highlighting the ksm_might_need_to_copy() case: never mind KSM,
that pte_same() test has been inadequate ever since it came in 2.4.8.

Because it's conceivable that before do_swap_page() gets lock_page(),
a racing thread could swap in the page for writing, COW it, the original
page be deleted from swap cache and its swap freed, the new page be
modified and then swapped out, and happen to get exactly the same
swap slot as the original had: pte_same!  In which case, without your
PageSwapCache test after lock_page(), the code goes on to insert the
now out-of-date version of the page in place of the modified version.

All very very unlikely; and some of those details impossible before
2.6.29, when I scrapped the exclusive page_count requirement of
remove_exclusive_swap_page - but the scenario still made possible
from earliest days by swapoff, which never cared about page_count.

> 
> In the old days I'm quite sure lookup_swap_cache wouldn't return a
> page evicted by swapcache I think,

If by old days you mean 2.2, yes, its lookup_swap_cache() returned a
locked page.  But 2.4 and 2.6 always returned an unlocked page, which
could already have been evicted from swapcache: as we realized in 2.4.10.

> but this looks good so we can run
> try_to_free_swap and release swap space, without care of the page pins
> and we decouple the actual pinning of the page from keeping the swap
> entry pinned too. Just unexpected to me when reading do_swap_page,
> while being used to the "secular" semantics of lookup_swap_cache ;).
> 
> I also thought maybe lookup_swap_cache could return the page locked or
> return NULL, but then there's the same problem in the regular swapin
> that is async and the lock will go away when I/O completes, so there's
> still a window for the same race, so I thought it's not worth locking
> anything in lookup_swap_cache.
> 
> >
> > It can even happen when swap is near empty: through swapoff,
> > or through reuse_swap_page(), at least.
> 
> Agreed, it can also happen with threads swapping in a not shared anon
> page.
> 
> > I'm not aware of any bug we have from that, but sure, it comes as a
> > surprise when you realize it.
> 
> :)
> 
> > > It's also possible to fix it by forcing do_wp_page to run but for a
> > > little while it won't be possible to rmap the the instantiated page so
> > > I didn't change that even if it probably would make life easier to
> > > memcg swapin handlers (maybe, dunno).
> > 
> > That's an interesting idea.  I'm not clear what you have in mind there,
> > but if we could get rid of ksm_does_need_to_copy(), letting do_wp_page()
> > do the copy instead, that would be very satisfying.  However, I suspect
> > it would rather involve tricking do_wp_page() into doing it, involve a
> > number of hard-to-maintain hacks, appealing to me but to nobody else!
> 
> It's actually next to trivial to make that change and it eliminates a
> dozen lines of code. I didn't to just make a strict fix that doesn't
> alter any logic of the code. But it's just enough to do:
> 
> if (ksm_might_need_to_copy())
>    flags |= FAULT_FLAG_WRITE;

That's very attractive, but I think you'll find it's not quite as
simple as that.

For a start, there's still the CONFIG_DEBUG_VM BUG_ON page->index
test in __page_check_anon_rmap(): perhaps we could just give up on
__page_check_anon_rmap(), though Nick likes it.  Then reuse_swap_page()
will be liable to let the same page be used, though its page->mapping
points to an irrelevant anon_vma - leaving the page rmap-unfindable
thereafter.  That could perhaps be fixed by forcing page->mapping and
page->index when reusing, but that will be something new: changing
page_anon_vma(page) while page is in use, perhaps that will turn out
to be safe while the page is locked, but would need an audit.  There
may be other gotchas, I've not thought further.

But I do like very much your idea of deleting ksm_does_need_to_copy(),
letting do_wp_page() do the work: it's easy to imagine people updating
do_wp_page() copying and forgetting to update ksm_does_need_to_copy().

Hugh

> 
> Then the whole ksm_does_need_to_copy and swapcache variable and the
> rest of the code I added goes away (only the pageswapcache check after
> page_lock remains for the bug discussed at the top). pte_same in
> do_wp_page and do_swap_page then become both reliable because
> do_swap_page locks the swapcache before changing the pte to point to
> the page, and do_wp_page pins the old page before doing pte_same. So
> there's no risk of swap entry reuse that way and it eliminates some
> code.
> 
> Only downside of ksm_does_need_to_copy removal: rmappability of the
> swapcache for a ksm-swapin that isn't linear becomes impossible for a
> little window of time, but only split_huge_page requires rmappability
> always (only transparent hugepages are required to be precisely
> rmapped at all times if they are ever established in any
> pmd_trans_huge pmd), and migrate only requires what is remappable to
> remain remappable or remove_migration_pte will then crash (but it's ok
> if stuff isn't remappable before it's established as the page count
> won't match and migrate will temporarily abort). So there is no real
> problem if some swapcache (not transparent hugepage) established in a
> _new_ pte temporarily isn't remappable. OTOH reducing the window for
> lack of rmap is nice too considering it's not going to make any
> difference to do_swap_page.
> 
> The only one that really may benefit from ksm_does_need_to_copy that I
> can imagine memcg, I've no clue how memcg will feel when
> mem_cgroup_commit_charge_swapin aren't getting a swapcache. There are
> some checks for pageswapcache and it probably doesn't choke but I
> can't tell if some stat will go off by one or similar, or if it
> already works fine as is.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-07-09 20:32 ` Hugh Dickins
  2010-07-09 21:19   ` Hugh Dickins
  2010-07-09 22:02   ` Rik van Riel
@ 2010-07-13  1:08   ` Andrea Arcangeli
  2010-07-13 21:30     ` Hugh Dickins
  2 siblings, 1 reply; 23+ messages in thread
From: Andrea Arcangeli @ 2010-07-13  1:08 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-mm, Marcelo Tosatti, Rik van Riel

Hi Hugh,

thanks a lot for the review!

On Fri, Jul 09, 2010 at 01:32:45PM -0700, Hugh Dickins wrote:
> Yes, nice find, you're absolutely right: not likely, but possible.
> Swap is slippery stuff, and that pte_same does depend on keeping the
> original page locked (or else an additional swap_duplicate+swap_free).

Agreed. Now this race of swap entry reused during the ksm-copy leading
to pte_same false positive, seems so unlikely that I doubt anybody
could have ever triggered it so far, but possible nevertheless. When
things gets into a cluster getting one in a thousand systems that hits
swap heavy leading to this isn't nice as it's next to impossible to
track the corruption because it'd be not easily reproducible, so it's
better to be safe than sorry ;).

> It is well established that by the time lookup_swap_cache() returns,
> the page it returns may already have been removed from swapcache:
> yes, you have to get page lock to be sure.  Long ago I put a comment 
> on that into lookup_swap_cache(), but it fell out of the 2.6 version
> when we briefly changed how that one worked.

I found it by code review only, so it was unexpected to me. I assume
it was unexpected to do_swap_cache as well if you agree this is a bug
too and it could trigger in theory.

In the old days I'm quite sure lookup_swap_cache wouldn't return a
page evicted by swapcache I think, but this looks good so we can run
try_to_free_swap and release swap space, without care of the page pins
and we decouple the actual pinning of the page from keeping the swap
entry pinned too. Just unexpected to me when reading do_swap_page,
while being used to the "secular" semantics of lookup_swap_cache ;).

I also thought maybe lookup_swap_cache could return the page locked or
return NULL, but then there's the same problem in the regular swapin
that is async and the lock will go away when I/O completes, so there's
still a window for the same race, so I thought it's not worth locking
anything in lookup_swap_cache.

>
> It can even happen when swap is near empty: through swapoff,
> or through reuse_swap_page(), at least.

Agreed, it can also happen with threads swapping in a not shared anon
page.

> I'm not aware of any bug we have from that, but sure, it comes as a
> surprise when you realize it.

:)

> > It's also possible to fix it by forcing do_wp_page to run but for a
> > little while it won't be possible to rmap the the instantiated page so
> > I didn't change that even if it probably would make life easier to
> > memcg swapin handlers (maybe, dunno).
> 
> That's an interesting idea.  I'm not clear what you have in mind there,
> but if we could get rid of ksm_does_need_to_copy(), letting do_wp_page()
> do the copy instead, that would be very satisfying.  However, I suspect
> it would rather involve tricking do_wp_page() into doing it, involve a
> number of hard-to-maintain hacks, appealing to me but to nobody else!

It's actually next to trivial to make that change and it eliminates a
dozen lines of code. I didn't to just make a strict fix that doesn't
alter any logic of the code. But it's just enough to do:

if (ksm_might_need_to_copy())
   flags |= FAULT_FLAG_WRITE;

Then the whole ksm_does_need_to_copy and swapcache variable and the
rest of the code I added goes away (only the pageswapcache check after
page_lock remains for the bug discussed at the top). pte_same in
do_wp_page and do_swap_page then become both reliable because
do_swap_page locks the swapcache before changing the pte to point to
the page, and do_wp_page pins the old page before doing pte_same. So
there's no risk of swap entry reuse that way and it eliminates some
code.

Only downside of ksm_does_need_to_copy removal: rmappability of the
swapcache for a ksm-swapin that isn't linear becomes impossible for a
little window of time, but only split_huge_page requires rmappability
always (only transparent hugepages are required to be precisely
rmapped at all times if they are ever established in any
pmd_trans_huge pmd), and migrate only requires what is remappable to
remain remappable or remove_migration_pte will then crash (but it's ok
if stuff isn't remappable before it's established as the page count
won't match and migrate will temporarily abort). So there is no real
problem if some swapcache (not transparent hugepage) established in a
_new_ pte temporarily isn't remappable. OTOH reducing the window for
lack of rmap is nice too considering it's not going to make any
difference to do_swap_page.

The only one that really may benefit from ksm_does_need_to_copy that I
can imagine memcg, I've no clue how memcg will feel when
mem_cgroup_commit_charge_swapin aren't getting a swapcache. There are
some checks for pageswapcache and it probably doesn't choke but I
can't tell if some stat will go off by one or similar, or if it
already works fine as is.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-07-09 20:32 ` Hugh Dickins
  2010-07-09 21:19   ` Hugh Dickins
@ 2010-07-09 22:02   ` Rik van Riel
  2010-07-13  1:08   ` Andrea Arcangeli
  2 siblings, 0 replies; 23+ messages in thread
From: Rik van Riel @ 2010-07-09 22:02 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrea Arcangeli, linux-mm, Marcelo Tosatti

On 07/09/2010 04:32 PM, Hugh Dickins wrote:

>>   	struct anon_vma *anon_vma = page_anon_vma(page);
>>
>> -	if (!anon_vma ||
>> -	    (anon_vma->root == vma->anon_vma->root&&
>> -	     page->index == linear_page_index(vma, address)))
>> -		return page;
>> -
>> -	return ksm_does_need_to_copy(page, vma, address);
>> +	return anon_vma&&
>> +		(anon_vma->root != vma->anon_vma->root ||
>> +		 page->index != linear_page_index(vma, address));
>>   }
>
> Hiding in here is a bigger question than your concern:
> are these tests right since Rik refactored the anon_vmas?
> I just don't know, but hope you and Rik can answer.

Yes, this bit is correct.  Andrea and I have gone over
this in detail a few weeks ago :)

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-07-09 20:32 ` Hugh Dickins
@ 2010-07-09 21:19   ` Hugh Dickins
  2010-07-09 22:02   ` Rik van Riel
  2010-07-13  1:08   ` Andrea Arcangeli
  2 siblings, 0 replies; 23+ messages in thread
From: Hugh Dickins @ 2010-07-09 21:19 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Marcelo Tosatti, Rik van Riel

On Fri, Jul 9, 2010 at 1:32 PM, Hugh Dickins <hughd@google.com> wrote:
>
> There's a related bug of mine lurking here, only realized in looking
> through this, which you might want to fix at the same time: I should
> have moved the PageUptodate check from after the pte_same check to
> before the ksm_might_need_to_copy, shouldn't I?  As it stands, we
> might copy junk from an invalid !Uptodate page into a clean new page.

Actually, not so, forget it: though it does look worrying, if the swap
page read had failed, leaving the page !Uptodate, then it would not
have been inserted into any address space in the first place,
page->mapping would remain unset, and ksm_might_need_to_copy() would
have no reason to copy it.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] fix swapin race condition
  2010-07-09  0:23 [PATCH] " Andrea Arcangeli
@ 2010-07-09 20:32 ` Hugh Dickins
  2010-07-09 21:19   ` Hugh Dickins
                     ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Hugh Dickins @ 2010-07-09 20:32 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Marcelo Tosatti, Rik van Riel

On Fri, 9 Jul 2010, Andrea Arcangeli wrote:

> Hi Hugh,
> 
> can you review this patch? This is only theoretical so far.
> 
> Basically a thread can get stuck in ksm_does_need_to_copy after the
> unlock_page (with orig_pte pointing to swap entry A). In the meantime
> another thread can do the ksm swapin copy too, the copy can be swapped
> out to swap entry B, then a new swapin can create a new copy that can
> return to swap entry A if reused by things like try_to_free_swap (to
> me it seems the page pin is useless to prevent the swapcache to go
> away, it only helps in page reclaim but swapcache is removed by other
> things too). So then the first thread can finish the ksm-copy find the
> pte_same pointing to swap entry A again (despite it passed through
> swap entry B) and break.

Yes, nice find, you're absolutely right: not likely, but possible.
Swap is slippery stuff, and that pte_same does depend on keeping the
original page locked (or else an additional swap_duplicate+swap_free).

> 
> I also don't seem to see a guarantee that lookup_swap_cache returns
> swapcache until we take the lock on the page.
> 
> I exclude this can cause regressions, but I'd like to know if it
> really can happen or if I'm missing something and it cannot happen. I
> surely looks weird that lookup_swap_cache might return a page that
> gets removed from swapcache before we take the page lock but I don't
> see anything preventing it. Surely it's not going to happen until >50%
> swap is full.

It is well established that by the time lookup_swap_cache() returns,
the page it returns may already have been removed from swapcache:
yes, you have to get page lock to be sure.  Long ago I put a comment 
on that into lookup_swap_cache(), but it fell out of the 2.6 version
when we briefly changed how that one worked.

It can even happen when swap is near empty: through swapoff,
or through reuse_swap_page(), at least.

I'm not aware of any bug we have from that, but sure, it comes as a
surprise when you realize it.

> 
> It's also possible to fix it by forcing do_wp_page to run but for a
> little while it won't be possible to rmap the the instantiated page so
> I didn't change that even if it probably would make life easier to
> memcg swapin handlers (maybe, dunno).

That's an interesting idea.  I'm not clear what you have in mind there,
but if we could get rid of ksm_does_need_to_copy(), letting do_wp_page()
do the copy instead, that would be very satisfying.  However, I suspect
it would rather involve tricking do_wp_page() into doing it, involve a
number of hard-to-maintain hacks, appealing to me but to nobody else!

> 
> Thanks,
> Andrea
> 
> ======
> Subject: fix swapin race condition
> 
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> The pte_same check is reliable only if the swap entry remains pinned
> (by the page lock on swapcache). We've also to ensure the swapcache
> isn't removed before we take the lock as try_to_free_swap won't care
> about the page pin.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> ---
> 
> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> --- a/include/linux/ksm.h
> +++ b/include/linux/ksm.h
> @@ -16,6 +16,9 @@
>  struct stable_node;
>  struct mem_cgroup;
>  
> +struct page *ksm_does_need_to_copy(struct page *page,
> +			struct vm_area_struct *vma, unsigned long address);
> +

Hmm, I guess that works, but depends on the optimizer to remove the
reference to it when !CONFIG_KSM.  I think it would be better back
in its original place, with a dummy inline added for !CONFIG_KSM.

>  #ifdef CONFIG_KSM
>  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
>  		unsigned long end, int advice, unsigned long *vm_flags);
> @@ -70,19 +73,14 @@ static inline void set_page_stable_node(
>   * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
>   * but what if the vma was unmerged while the page was swapped out?
>   */
> -struct page *ksm_does_need_to_copy(struct page *page,
> -			struct vm_area_struct *vma, unsigned long address);
> -static inline struct page *ksm_might_need_to_copy(struct page *page,
> +static inline int ksm_might_need_to_copy(struct page *page,
>  			struct vm_area_struct *vma, unsigned long address)
>  {
>  	struct anon_vma *anon_vma = page_anon_vma(page);
>  
> -	if (!anon_vma ||
> -	    (anon_vma->root == vma->anon_vma->root &&
> -	     page->index == linear_page_index(vma, address)))
> -		return page;
> -
> -	return ksm_does_need_to_copy(page, vma, address);
> +	return anon_vma &&
> +		(anon_vma->root != vma->anon_vma->root ||
> +		 page->index != linear_page_index(vma, address));
>  }

Hiding in here is a bigger question than your concern:
are these tests right since Rik refactored the anon_vmas?
I just don't know, but hope you and Rik can answer.

I put in this ksm_might_need_to_copy() stuff with the old anon_vma:
it was necessary to avoid the BUG_ON(page->mapping != vma->anon_vma)
in the old __page_check_anon_rmap().  Not just to avoid the BUG_ON -
it was correct to be checking that the page would be rmap-findable.
Nowadays that check is gone, and I wonder if ksm_does_need_to_copy()
is getting called in cases when we do not need to copy at all?

(We need to copy on bringing back from swap when KSM collapsed pages
unrelated by fork into one single page which was then swapped out.)

>  
>  int page_referenced_ksm(struct page *page,
> @@ -115,10 +113,10 @@ static inline int ksm_madvise(struct vm_
>  	return 0;
>  }
>  
> -static inline struct page *ksm_might_need_to_copy(struct page *page,
> +static inline int ksm_might_need_to_copy(struct page *page,
>  			struct vm_area_struct *vma, unsigned long address)
>  {
> -	return page;
> +	return 0;
>  }
>  
>  static inline int page_referenced_ksm(struct page *page,
> diff --git a/mm/ksm.c b/mm/ksm.c
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -1518,8 +1518,6 @@ struct page *ksm_does_need_to_copy(struc
>  {
>  	struct page *new_page;
>  
> -	unlock_page(page);	/* any racers will COW it, not modify it */
> -
>  	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
>  	if (new_page) {
>  		copy_user_highpage(new_page, page, address, vma);
> @@ -1535,7 +1533,6 @@ struct page *ksm_does_need_to_copy(struc
>  			add_page_to_unevictable_list(new_page);
>  	}
>  
> -	page_cache_release(page);
>  	return new_page;
>  }
>  
> diff --git a/mm/memory.c b/mm/memory.c
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2616,7 +2616,7 @@ static int do_swap_page(struct mm_struct
>  		unsigned int flags, pte_t orig_pte)
>  {
>  	spinlock_t *ptl;
> -	struct page *page;
> +	struct page *page, *swapcache = NULL;

If we're honest (and ksm_might_need_to_copy really is giving the
right answers it should), we'd name that ksm_swapcache, and notice
how KSM has intruded here rather more than we wanted.

>  	swp_entry_t entry;
>  	pte_t pte;
>  	struct mem_cgroup *ptr = NULL;
> @@ -2671,10 +2671,23 @@ static int do_swap_page(struct mm_struct
>  	lock_page(page);
>  	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
>  
> -	page = ksm_might_need_to_copy(page, vma, address);
> -	if (!page) {
> -		ret = VM_FAULT_OOM;
> -		goto out;
> +	/*
> +	 * Make sure try_to_free_swap didn't release the swapcache
> +	 * from under us. The page pin isn't enough to prevent that.
> +	 */
> +	if (unlikely(!PageSwapCache(page)))
> +		goto out_page;

Do you actually need to add that check there? (And do we actually
need the same check in mem_cgroup_try_charge_swapin?  Now I think not.)
You would if you were to proceed by holding the swap entry with an
additional swap_duplicate+swap_free (which conceptually I'd prefer,
but you're more efficient to use the page lock we already have).

If you really want to add it, just to make things easier to think
about, that's fair enough; but I don't see that it's necessary.

> +
> +	if (ksm_might_need_to_copy(page, vma, address)) {
> +		swapcache = page;
> +		page = ksm_does_need_to_copy(page, vma, address);
> +
> +		if (unlikely(!page)) {
> +			ret = VM_FAULT_OOM;
> +			page = swapcache;
> +			swapcache = NULL;
> +			goto out_page;
> +		}
>  	}
>  
>  	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {

There's a related bug of mine lurking here, only realized in looking
through this, which you might want to fix at the same time: I should
have moved the PageUptodate check from after the pte_same check to
before the ksm_might_need_to_copy, shouldn't I?  As it stands, we
might copy junk from an invalid !Uptodate page into a clean new page.

> @@ -2725,6 +2738,18 @@ static int do_swap_page(struct mm_struct
>  	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
>  		try_to_free_swap(page);
>  	unlock_page(page);
> +	if (swapcache) {
> +		/*
> +		 * Hold the lock to avoid the swap entry to be reused
> +		 * until we take the PT lock for the pte_same() check
> +		 * (to avoid false positives from pte_same). For
> +		 * further safety release the lock after the swap_free
> +		 * so that the swap count won't change under a
> +		 * parallel locked swapcache.
> +		 */
> +		unlock_page(swapcache);
> +		page_cache_release(swapcache);
> +	}
>  
>  	if (flags & FAULT_FLAG_WRITE) {
>  		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
> @@ -2746,6 +2771,10 @@ out_page:
>  	unlock_page(page);
>  out_release:
>  	page_cache_release(page);
> +	if (swapcache) {
> +		unlock_page(swapcache);
> +		page_cache_release(swapcache);
> +	}

Minor point, but couldn't that added block go just after the unlock_page
above, before the out_release label?  Doesn't matter, just pairs up more
naturally.

>  	return ret;
>  }

Yes, I don't like the way KSM is intruding further on do_swap_page(),
but I haven't thought of a nicer way of handling the case.

Thanks for finding this - probably years before any user hits it!
Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] fix swapin race condition
@ 2010-07-09  0:23 Andrea Arcangeli
  2010-07-09 20:32 ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Andrea Arcangeli @ 2010-07-09  0:23 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-mm, Marcelo Tosatti, Rik van Riel

Hi Hugh,

can you review this patch? This is only theoretical so far.

Basically a thread can get stuck in ksm_does_need_to_copy after the
unlock_page (with orig_pte pointing to swap entry A). In the meantime
another thread can do the ksm swapin copy too, the copy can be swapped
out to swap entry B, then a new swapin can create a new copy that can
return to swap entry A if reused by things like try_to_free_swap (to
me it seems the page pin is useless to prevent the swapcache to go
away, it only helps in page reclaim but swapcache is removed by other
things too). So then the first thread can finish the ksm-copy find the
pte_same pointing to swap entry A again (despite it passed through
swap entry B) and break.

I also don't seem to see a guarantee that lookup_swap_cache returns
swapcache until we take the lock on the page.

I exclude this can cause regressions, but I'd like to know if it
really can happen or if I'm missing something and it cannot happen. I
surely looks weird that lookup_swap_cache might return a page that
gets removed from swapcache before we take the page lock but I don't
see anything preventing it. Surely it's not going to happen until >50%
swap is full.

It's also possible to fix it by forcing do_wp_page to run but for a
little while it won't be possible to rmap the the instantiated page so
I didn't change that even if it probably would make life easier to
memcg swapin handlers (maybe, dunno).

Thanks,
Andrea

======
Subject: fix swapin race condition

From: Andrea Arcangeli <aarcange@redhat.com>

The pte_same check is reliable only if the swap entry remains pinned
(by the page lock on swapcache). We've also to ensure the swapcache
isn't removed before we take the lock as try_to_free_swap won't care
about the page pin.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,6 +16,9 @@
 struct stable_node;
 struct mem_cgroup;
 
+struct page *ksm_does_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address);
+
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
@@ -70,19 +73,14 @@ static inline void set_page_stable_node(
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-struct page *ksm_does_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address);
-static inline struct page *ksm_might_need_to_copy(struct page *page,
+static inline int ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address)
 {
 	struct anon_vma *anon_vma = page_anon_vma(page);
 
-	if (!anon_vma ||
-	    (anon_vma->root == vma->anon_vma->root &&
-	     page->index == linear_page_index(vma, address)))
-		return page;
-
-	return ksm_does_need_to_copy(page, vma, address);
+	return anon_vma &&
+		(anon_vma->root != vma->anon_vma->root ||
+		 page->index != linear_page_index(vma, address));
 }
 
 int page_referenced_ksm(struct page *page,
@@ -115,10 +113,10 @@ static inline int ksm_madvise(struct vm_
 	return 0;
 }
 
-static inline struct page *ksm_might_need_to_copy(struct page *page,
+static inline int ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address)
 {
-	return page;
+	return 0;
 }
 
 static inline int page_referenced_ksm(struct page *page,
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1518,8 +1518,6 @@ struct page *ksm_does_need_to_copy(struc
 {
 	struct page *new_page;
 
-	unlock_page(page);	/* any racers will COW it, not modify it */
-
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
@@ -1535,7 +1533,6 @@ struct page *ksm_does_need_to_copy(struc
 			add_page_to_unevictable_list(new_page);
 	}
 
-	page_cache_release(page);
 	return new_page;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2616,7 +2616,7 @@ static int do_swap_page(struct mm_struct
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page;
+	struct page *page, *swapcache = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	struct mem_cgroup *ptr = NULL;
@@ -2671,10 +2671,23 @@ static int do_swap_page(struct mm_struct
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-	page = ksm_might_need_to_copy(page, vma, address);
-	if (!page) {
-		ret = VM_FAULT_OOM;
-		goto out;
+	/*
+	 * Make sure try_to_free_swap didn't release the swapcache
+	 * from under us. The page pin isn't enough to prevent that.
+	 */
+	if (unlikely(!PageSwapCache(page)))
+		goto out_page;
+
+	if (ksm_might_need_to_copy(page, vma, address)) {
+		swapcache = page;
+		page = ksm_does_need_to_copy(page, vma, address);
+
+		if (unlikely(!page)) {
+			ret = VM_FAULT_OOM;
+			page = swapcache;
+			swapcache = NULL;
+			goto out_page;
+		}
 	}
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2725,6 +2738,18 @@ static int do_swap_page(struct mm_struct
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
+	if (swapcache) {
+		/*
+		 * Hold the lock to avoid the swap entry to be reused
+		 * until we take the PT lock for the pte_same() check
+		 * (to avoid false positives from pte_same). For
+		 * further safety release the lock after the swap_free
+		 * so that the swap count won't change under a
+		 * parallel locked swapcache.
+		 */
+		unlock_page(swapcache);
+		page_cache_release(swapcache);
+	}
 
 	if (flags & FAULT_FLAG_WRITE) {
 		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2746,6 +2771,10 @@ out_page:
 	unlock_page(page);
 out_release:
 	page_cache_release(page);
+	if (swapcache) {
+		unlock_page(swapcache);
+		page_cache_release(swapcache);
+	}
 	return ret;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2010-09-20  3:09 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-03 15:39 [PATCH] fix swapin race condition Andrea Arcangeli
2010-09-03 20:02 ` Andrew Morton
2010-09-04 12:29   ` Andrea Arcangeli
2010-09-03 23:57 ` Rik van Riel
2010-09-06  2:35 ` Hugh Dickins
2010-09-15 23:02   ` Hugh Dickins
2010-09-15 23:42     ` Andrea Arcangeli
2010-09-16  0:10       ` Hugh Dickins
2010-09-16 21:03         ` Andrea Arcangeli
2010-09-16 21:08           ` Andrea Arcangeli
2010-09-17  2:31           ` Hugh Dickins
2010-09-18 13:19             ` Andrea Arcangeli
2010-09-20  2:35               ` Hugh Dickins
2010-09-20  2:40                 ` [PATCH] mm: further " Hugh Dickins
2010-09-20  2:40                   ` Hugh Dickins
2010-09-20  3:09                   ` Rik van Riel
2010-09-20  3:09                     ` Rik van Riel
  -- strict thread matches above, loose matches on Subject: below --
2010-07-09  0:23 [PATCH] " Andrea Arcangeli
2010-07-09 20:32 ` Hugh Dickins
2010-07-09 21:19   ` Hugh Dickins
2010-07-09 22:02   ` Rik van Riel
2010-07-13  1:08   ` Andrea Arcangeli
2010-07-13 21:30     ` Hugh Dickins

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.