linux-kernel.vger.kernel.org archive mirror
* Race in new page migration code?
@ 2006-01-14 15:55 Nick Piggin
  2006-01-14 18:01 ` Christoph Lameter
  0 siblings, 1 reply; 28+ messages in thread
From: Nick Piggin @ 2006-01-14 15:55 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List

Hi Christoph,

I'm fairly sure there is a race in the page migration code
due to your not taking a reference on the page. Taking the
reference also can make things less convoluted.

Also, an unsuccessful isolation attempt does not mean something is
wrong. I removed the WARN_ON, but you should probably be retrying
on this level if you are really interested in migrating all pages.
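
Roughly something like this at the caller, if you wanted it (sketch
only, not part of the patch below; the retry count is arbitrary):

	int tries = 3;

	do {
		if (isolate_lru_page(page) > 0) {
			list_add(&page->lru, pagelist);
			break;
		}
		/* isolate_lru_page() drains internally, just try again */
		cond_resched();
	} while (--tries);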

Patch is not yet tested with page migration.

Nick

--
Migration code currently does not take a reference to target page
properly, so between unlocking the pte and trying to take a new
reference to the page with isolate_lru_page, anything could happen to
it.
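
The window, roughly (illustration only):

	CPU A (mbind)			CPU B (reclaim / last user)
	spin_unlock(ptl);
					pte unmapped, last reference
					dropped, page freed and reused
	isolate_lru_page(page);
	  operates on whatever that
	  struct page describes now

Taking the reference while the pte lock still pins the page closes the
window, which is what the patch does.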

Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -217,6 +217,7 @@ static int check_pte_range(struct vm_are
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+			get_page(page);
 			spin_unlock(ptl);
 			migrate_page_add(vma, page, private, flags);
 			spin_lock(ptl);
@@ -544,7 +545,8 @@ out:
 }
 
 /*
- * Add a page to be migrated to the pagelist
+ * Add a page to be migrated to the pagelist.
+ * Caller must hold a reference on the page.
  */
 static void migrate_page_add(struct vm_area_struct *vma,
 	struct page *page, struct list_head *pagelist, unsigned long flags)
@@ -555,15 +557,10 @@ static void migrate_page_add(struct vm_a
 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 	    mapping_writably_mapped(page->mapping) ||
 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-	 	 */
-		WARN_ON(rc == 0);
+		else
+			put_page(page);
 	}
 }
 
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -586,7 +586,8 @@ static inline void move_to_lru(struct pa
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU. The caller must hold a
+ * reference to the pages, which is dropped here.
  *
  * returns the number of pages put back.
  */
@@ -773,32 +774,33 @@ static void lru_add_drain_per_cpu(void *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
+ * -ve = error
  */
 int isolate_lru_page(struct page *page)
 {
 	int rc = 0;
 	struct zone *zone = page_zone(page);
 
-redo:
+	if (!PageLRU(page)) {
+		/*
+		 * Maybe this page is still waiting for a cpu to drain it
+		 * from one of the lru lists?
+		 */
+		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+		if (!PageLRU(page))
+			goto out;
+	}
+
 	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
+	if (TestClearPageLRU(page)) {
+		rc = 1;
 		if (PageActive(page))
 			del_page_from_active_list(zone, page);
 		else
 			del_page_from_inactive_list(zone, page);
 	}
 	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
-	}
+out:
 	return rc;
 }
 #endif
@@ -831,18 +833,20 @@ static int isolate_lru_pages(int nr_to_s
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
Index: linux-2.6/include/linux/mm_inline.h
===================================================================
--- linux-2.6.orig/include/linux/mm_inline.h
+++ linux-2.6/include/linux/mm_inline.h
@@ -39,24 +39,3 @@ del_page_from_lru(struct zone *zone, str
 	}
 }
 
-/*
- * Isolate one page from the LRU lists.
- *
- * - zone->lru_lock must be held
- */
-static inline int __isolate_lru_page(struct page *page)
-{
-	if (unlikely(!TestClearPageLRU(page)))
-		return 0;
-
-	if (get_page_testone(page)) {
-		/*
-		 * It is being freed elsewhere
-		 */
-		__put_page(page);
-		SetPageLRU(page);
-		return -ENOENT;
-	}
-
-	return 1;
-}


* Re: Race in new page migration code?
  2006-01-14 15:55 Race in new page migration code? Nick Piggin
@ 2006-01-14 18:01 ` Christoph Lameter
  2006-01-14 18:19   ` Nick Piggin
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2006-01-14 18:01 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List

On Sat, 14 Jan 2006, Nick Piggin wrote:

> I'm fairly sure there is a race in the page migration code
> due to your not taking a reference on the page. Taking the
> reference also can make things less convoluted.

We take that reference count on the page:

/*
 * Isolate one page from the LRU lists.
 *
 * - zone->lru_lock must be held
 */
static inline int __isolate_lru_page(struct page *page)
{
        if (unlikely(!TestClearPageLRU(page)))
                return 0;

>>>>        if (get_page_testone(page)) {
                /*
                 * It is being freed elsewhere
                 */
                __put_page(page);
                SetPageLRU(page);
                return -ENOENT;
        }

        return 1;
}

 
> Also, an unsuccessful isolation attempt does not mean something is
> wrong. I removed the WARN_ON, but you should probably be retrying
> on this level if you are really interested in migrating all pages.

It depends what you mean by unsuccessful isolate attempt. One reason for 
not being successful is that the page has been freed. That's okay.

The other is that the page is not on the LRU, and is not being moved
back to the LRU by draining the lru caches. In that case we need to
have a WARN_ON at least for now. There may be other reasons that a page
is not on the LRU but I would like to be careful about that at first.

It's not an error but something that is of concern, thus the WARN_ON.


* Re: Race in new page migration code?
  2006-01-14 18:01 ` Christoph Lameter
@ 2006-01-14 18:19   ` Nick Piggin
  2006-01-14 18:58     ` Christoph Lameter
  0 siblings, 1 reply; 28+ messages in thread
From: Nick Piggin @ 2006-01-14 18:19 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Sat, Jan 14, 2006 at 10:01:43AM -0800, Christoph Lameter wrote:
> On Sat, 14 Jan 2006, Nick Piggin wrote:
> 
> > I'm fairly sure there is a race in the page migration code
> > due to your not taking a reference on the page. Taking the
> > reference also can make things less convoluted.
> 
> We take that reference count on the page:
> 

Yes, after you have dropped all your claims to pin this page
(ie. pte lock). You really can't take a refcount on a page that
you haven't somehow pinned (only I can ;)). This get_page_testone
code used by reclaim is a tricky special case where the page is
pinned by lru_lock even if it is "free" (ie. zero refcount).

It is not something that you can use without being very careful.
And I don't understand why you would want to use it even if it did work,
after you take a look at the simplicity of my patch.

> /*
>  * Isolate one page from the LRU lists.
>  *
>  * - zone->lru_lock must be held
>  */
> static inline int __isolate_lru_page(struct page *page)
> {
>         if (unlikely(!TestClearPageLRU(page)))
>                 return 0;
> 
> >>>>        if (get_page_testone(page)) {
>                 /*
>                  * It is being freed elsewhere
>                  */
>                 __put_page(page);
>                 SetPageLRU(page);
>                 return -ENOENT;
>         }
> 
>         return 1;
> }
> 

By this stage the page may have been freed and reused as an
unrelated pagecache page on the LRU. I'm not sure if there are any
worse races than this (ie. random page being moved), but I
wouldn't like to risk it.

>  
> > Also, an unsuccessful isolation attempt does not mean something is
> > wrong. I removed the WARN_ON, but you should probably be retrying
> > on this level if you are really interested in migrating all pages.
> 
> It depends what you mean by unsuccessful isolate attempt. One reason for 
> not being successful is that the page has been freed. That's okay.
> 
> The other is that the page is not on the LRU, and is not being moved
> back to the LRU by draining the lru caches. In that case we need to
> have a WARN_ON at least for now. There may be other reasons that a page
> is not on the LRU but I would like to be careful about that at first.
> 
> It's not an error but something that is of concern, thus the WARN_ON.

kswapd picks them off the lru as a normal part of scanning. A
WARN_ON is simply spam.

Nick



* Re: Race in new page migration code?
  2006-01-14 18:19   ` Nick Piggin
@ 2006-01-14 18:58     ` Christoph Lameter
  2006-01-15  5:28       ` Nick Piggin
                         ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Christoph Lameter @ 2006-01-14 18:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Linux Kernel Mailing List, Linux Memory Management List

On Sat, 14 Jan 2006, Nick Piggin wrote:

> > We take that reference count on the page:
> Yes, after you have dropped all your claims to pin this page
> (ie. pte lock). You really can't take a refcount on a page that

Oh. Now I see. I screwed that up by a fix I added.... We cannot drop the 
ptl here. So back to the way it was before. Remove the draining from 
isolate_lru_page and do it before scanning for pages so that we do not
have to drop the ptl. 

Also remove the WARN_ON since it's now even possible that other actions of
the VM move the pages into the LRU lists while we scan for pages to
migrate.

This increases the chance that we find pages that cannot be migrated.

Needs to be applied to Linus' tree.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.15/mm/mempolicy.c
===================================================================
--- linux-2.6.15.orig/mm/mempolicy.c	2006-01-11 20:55:08.000000000 -0800
+++ linux-2.6.15/mm/mempolicy.c	2006-01-14 10:37:19.000000000 -0800
@@ -216,11 +216,8 @@ static int check_pte_range(struct vm_are
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -297,6 +294,11 @@ static inline int vma_migratable(struct 
 	return 1;
 }
 
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
@@ -309,6 +311,12 @@ check_range(struct mm_struct *mm, unsign
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/*
+	 * Clear the LRU lists so that we can isolate the pages
+	 */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -554,17 +562,9 @@ static void migrate_page_add(struct vm_a
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 	    mapping_writably_mapped(page->mapping) ||
-	    single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+	    single_mm_mapping(vma->vm_mm, page->mapping))
+		if (isolate_lru_page(page) == 1)
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-	 	 */
-		WARN_ON(rc == 0);
-	}
 }
 
 static int swap_pages(struct list_head *pagelist)
Index: linux-2.6.15/mm/vmscan.c
===================================================================
--- linux-2.6.15.orig/mm/vmscan.c	2006-01-11 12:49:03.000000000 -0800
+++ linux-2.6.15/mm/vmscan.c	2006-01-14 10:39:15.000000000 -0800
@@ -760,11 +760,6 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
  * indicated list. Do necessary cache draining if the
@@ -780,7 +775,6 @@ int isolate_lru_page(struct page *page)
 	int rc = 0;
 	struct zone *zone = page_zone(page);
 
-redo:
 	spin_lock_irq(&zone->lru_lock);
 	rc = __isolate_lru_page(page);
 	if (rc == 1) {
@@ -790,15 +784,6 @@ redo:
 			del_page_from_inactive_list(zone, page);
 	}
 	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
-	}
 	return rc;
 }
 #endif


* Re: Race in new page migration code?
  2006-01-14 18:58     ` Christoph Lameter
@ 2006-01-15  5:28       ` Nick Piggin
  2006-01-16  6:54         ` Christoph Lameter
  2006-01-15  6:58       ` Nick Piggin
  2006-01-15 10:58       ` Hugh Dickins
  2 siblings, 1 reply; 28+ messages in thread
From: Nick Piggin @ 2006-01-15  5:28 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

Christoph Lameter wrote:
> On Sat, 14 Jan 2006, Nick Piggin wrote:
> 
> 
>>>We take that reference count on the page:
>>
>>Yes, after you have dropped all your claims to pin this page
>>(ie. pte lock). You really can't take a refcount on a page that
> 
> 
> Oh. Now I see. I screwed that up by a fix I added.... We cannot drop the 
> ptl here. So back to the way it was before. Remove the draining from 
> isolate_lru_page and do it before scanning for pages so that we do not
> have to drop the ptl. 
> 

OK (either way is fine), but you should still drop the __isolate_lru_page
nonsense and revert it like my patch does.

> Also remove the WARN_ON since it's now even possible that other actions of
> the VM move the pages into the LRU lists while we scan for pages to
> migrate.
> 

Well, it has always been possible since vmscan started batching scans a
long time ago. Actually, seeing as you only take a read lock on the
semaphore, it is probably also possible for a concurrent migrate
operation to cause this as well.

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.



* Re: Race in new page migration code?
  2006-01-14 18:58     ` Christoph Lameter
  2006-01-15  5:28       ` Nick Piggin
@ 2006-01-15  6:58       ` Nick Piggin
  2006-01-15 10:58       ` Hugh Dickins
  2 siblings, 0 replies; 28+ messages in thread
From: Nick Piggin @ 2006-01-15  6:58 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Sat, Jan 14, 2006 at 10:58:25AM -0800, Christoph Lameter wrote:
> On Sat, 14 Jan 2006, Nick Piggin wrote:
> 
> > > We take that reference count on the page:
> > Yes, after you have dropped all your claims to pin this page
> > (ie. pte lock). You really can't take a refcount on a page that
> 
> Oh. Now I see. I screwed that up by a fix I added.... We cannot drop the 
> ptl here. So back to the way it was before. Remove the draining from 
> isolate_lru_page and do it before scanning for pages so that we do not
> have to drop the ptl. 
> 

OK, if you prefer doing it with the pte lock held, how's this patch?

--
Migration code currently does not take a reference to target page
properly, so between unlocking the pte and trying to take a new
reference to the page with isolate_lru_page, anything could happen to
it.

Fix this by holding the pte lock until we get a chance to elevate the
refcount.

Other small cleanups while we're here.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -216,11 +216,8 @@ static int check_pte_range(struct vm_are
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +306,10 @@ check_range(struct mm_struct *mm, unsign
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -555,15 +556,8 @@ static void migrate_page_add(struct vm_a
 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 	    mapping_writably_mapped(page->mapping) ||
 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-	 	 */
-		WARN_ON(rc == 0);
 	}
 }
 
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -586,7 +586,7 @@ static inline void move_to_lru(struct pa
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +760,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
@@ -831,18 +818,20 @@ static int isolate_lru_pages(int nr_to_s
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
Index: linux-2.6/include/linux/mm_inline.h
===================================================================
--- linux-2.6.orig/include/linux/mm_inline.h
+++ linux-2.6/include/linux/mm_inline.h
@@ -39,24 +39,3 @@ del_page_from_lru(struct zone *zone, str
 	}
 }
 
-/*
- * Isolate one page from the LRU lists.
- *
- * - zone->lru_lock must be held
- */
-static inline int __isolate_lru_page(struct page *page)
-{
-	if (unlikely(!TestClearPageLRU(page)))
-		return 0;
-
-	if (get_page_testone(page)) {
-		/*
-		 * It is being freed elsewhere
-		 */
-		__put_page(page);
-		SetPageLRU(page);
-		return -ENOENT;
-	}
-
-	return 1;
-}
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -167,6 +167,7 @@ extern void FASTCALL(lru_cache_add_activ
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
+extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -174,6 +174,24 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+#ifdef CONFIG_SMP
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+#else
+	lru_add_drain();
+	return 0;
+#endif
+}
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct ki
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -33,7 +33,7 @@
  *     mapping->i_mmap_lock
  *       anon_vma->lock
  *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed)
+ *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
  *             mapping->private_lock (in __set_page_dirty_buffers)


* Re: Race in new page migration code?
  2006-01-14 18:58     ` Christoph Lameter
  2006-01-15  5:28       ` Nick Piggin
  2006-01-15  6:58       ` Nick Piggin
@ 2006-01-15 10:58       ` Hugh Dickins
  2006-01-16  6:51         ` Christoph Lameter
  2 siblings, 1 reply; 28+ messages in thread
From: Hugh Dickins @ 2006-01-15 10:58 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Sat, 14 Jan 2006, Christoph Lameter wrote:
> 
> Also remove the WARN_ON since it's now even possible that other actions of
> the VM move the pages into the LRU lists while we scan for pages to
> migrate.

Good.  And whether it's your or Nick's patch that goes in, please also
remove that PageReserved test which you recently put in check_pte_range.

Hugh


* Re: Race in new page migration code?
  2006-01-15 10:58       ` Hugh Dickins
@ 2006-01-16  6:51         ` Christoph Lameter
  2006-01-16 12:32           ` Hugh Dickins
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2006-01-16  6:51 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Sun, 15 Jan 2006, Hugh Dickins wrote:

> On Sat, 14 Jan 2006, Christoph Lameter wrote:
> > 
> > Also remove the WARN_ON since it's now even possible that other actions of
> > the VM move the pages into the LRU lists while we scan for pages to
> > migrate.
> 
> Good.  And whether it's your or Nick's patch that goes in, please also
> remove that PageReserved test which you recently put in check_pte_range.

Zero pages are still marked reserved AFAIK. Why not check for it?



* Re: Race in new page migration code?
  2006-01-15  5:28       ` Nick Piggin
@ 2006-01-16  6:54         ` Christoph Lameter
  2006-01-16  7:44           ` Nick Piggin
  2006-01-17  8:29           ` Magnus Damm
  0 siblings, 2 replies; 28+ messages in thread
From: Christoph Lameter @ 2006-01-16  6:54 UTC (permalink / raw)
  To: Magnus Damm
  Cc: Nick Piggin, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, 15 Jan 2006, Nick Piggin wrote:

> OK (either way is fine), but you should still drop the __isolate_lru_page
> nonsense and revert it like my patch does.

Ok with me. Magnus: You needed the __isolate_lru_page for some other 
purpose. Is that still the case?


* Re: Race in new page migration code?
  2006-01-16  6:54         ` Christoph Lameter
@ 2006-01-16  7:44           ` Nick Piggin
  2006-01-17  8:29           ` Magnus Damm
  1 sibling, 0 replies; 28+ messages in thread
From: Nick Piggin @ 2006-01-16  7:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Magnus Damm, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

Christoph Lameter wrote:
> On Sun, 15 Jan 2006, Nick Piggin wrote:
> 
> 
>>OK (either way is fine), but you should still drop the __isolate_lru_page
>>nonsense and revert it like my patch does.
> 
> 
> Ok with me. Magnus: You needed the __isolate_lru_page for some other 
> purpose. Is that still the case?
> 

Either way, we can remove it from the tree for now.

But I'm almost sure such a user would be wrong too. The reason it is
required is very specific: taking lru_lock and then looking up a page
on the LRU is the one case where the page is not otherwise pinned. If
you find the page via any means other than simply looking on the LRU,
then get_page_testone is wrong and you should either pin the page or
take a normal reference to it instead.
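
Roughly, the two cases (sketch only, not from any patch in this thread):

	/*
	 * Page found through a pte, with the pte lock (or some other
	 * pin) held: the pin guarantees the refcount is not zero, so a
	 * plain reference is the right thing.
	 */
	get_page(page);

	/*
	 * Page found only by walking the LRU under zone->lru_lock,
	 * where a zero-refcount page may still be visible:
	 */
	if (get_page_testone(page)) {
		/* it was already being freed elsewhere, back out */
		__put_page(page);
	} else {
		/* we now hold a real reference */
	}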

-- 
SUSE Labs, Novell Inc.



* Re: Race in new page migration code?
  2006-01-16  6:51         ` Christoph Lameter
@ 2006-01-16 12:32           ` Hugh Dickins
  2006-01-16 15:47             ` Christoph Lameter
  2006-01-17 17:29             ` Christoph Lameter
  0 siblings, 2 replies; 28+ messages in thread
From: Hugh Dickins @ 2006-01-16 12:32 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Sun, 15 Jan 2006, Christoph Lameter wrote:
> On Sun, 15 Jan 2006, Hugh Dickins wrote:
> > 
> > Good.  And whether it's your or Nick's patch that goes in, please also
> > remove that PageReserved test which you recently put in check_pte_range.
> 
> Zero pages are still marked reserved AFAIK. Why not check for it?

Indeed they are, at present and quite likely into posterity.  But
they're not a common case here, and migrate_page_add now handles them
silently, so why bother to complicate it with an unnecessary check?

You added the test because the WARN_ON fired, you're now doing the
right thing and removing the WARN_ON because it was inevitably racy,
so it would make sense to remove the PageReserved test too.

If you look through the rest of mm/, you'll find 2.6.15 removed all
the PageReserved tests, except at the low level in page_alloc.c:
so it's retrograde for you to be adding one back here.  Testing
PageLRU would be more relevant, if you insist on such a test.

Or have you found the zero page mapcount distorting the gather_stats stats?
If that's an issue, then better add a commented test for it there.

Hmm, that battery of unusual tests at the start of migrate_page_add
is odd: the tests don't quite match the comment, and it isn't clear
what reasoning lies behind the comment anyway.

Hugh


* Re: Race in new page migration code?
  2006-01-16 12:32           ` Hugh Dickins
@ 2006-01-16 15:47             ` Christoph Lameter
  2006-01-16 16:06               ` Hugh Dickins
  2006-01-17 17:29             ` Christoph Lameter
  1 sibling, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2006-01-16 15:47 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Mon, 16 Jan 2006, Hugh Dickins wrote:

> Indeed they are, at present and quite likely into posterity.  But
> they're not a common case here, and migrate_page_add now handles them
> silently, so why bother to complicate it with an unnecessary check?

check_range is also used for statistics and for checking whether a range is
policy compliant. Without that check, zero pages may be counted or flagged
as not being on the right node with MPOL_MF_STRICT.

For migrate_page_add this has now simply become an optimization since
there is no WARN_ON occurring anymore.

> Or have you found the zero page mapcount distorting the gather_stats stats?
> If that's an issue, then better add a commented test for it there.

It also applies to the policy compliance check.

> Hmm, that battery of unusual tests at the start of migrate_page_add
> is odd: the tests don't quite match the comment, and it isn't clear
> what reasoning lies behind the comment anyway.

Hmm... Maybe better to clean the thing up a bit. Will do that when I get
back to work next week.



* Re: Race in new page migration code?
  2006-01-16 15:47             ` Christoph Lameter
@ 2006-01-16 16:06               ` Hugh Dickins
  2006-01-16 16:10                 ` Christoph Lameter
  0 siblings, 1 reply; 28+ messages in thread
From: Hugh Dickins @ 2006-01-16 16:06 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Mon, 16 Jan 2006, Christoph Lameter wrote:
> On Mon, 16 Jan 2006, Hugh Dickins wrote:
> 
> > Or have you found the zero page mapcount distorting the gather_stats stats?
> > If that's an issue, then better add a commented test for it there.
> 
> It also applies to the policy compliance check.

Good point, I missed that: you've inadvertently changed the behaviour
of sys_mbind when it encounters a zero page from a disallowed node.
Another reason to remove your PageReserved test.

Hugh


* Re: Race in new page migration code?
  2006-01-16 16:06               ` Hugh Dickins
@ 2006-01-16 16:10                 ` Christoph Lameter
  2006-01-16 16:28                   ` Hugh Dickins
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2006-01-16 16:10 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Mon, 16 Jan 2006, Hugh Dickins wrote:

> > It also applies to the policy compliance check.
> 
> Good point, I missed that: you've inadvertently changed the behaviour
> of sys_mbind when it encounters a zero page from a disallowed node.
> Another reason to remove your PageReserved test.

The zero page always comes from node zero on IA64. I think this is more the
inadvertent fixing of a bug. The policy compliance check currently fails 
if an address range contains a zero page but node zero is not contained in 
the nodelist.
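
To illustrate (userspace sketch, not from this thread; addr/len stand
for a range that has only been read, so it is backed by zero pages
from node 0):

	unsigned long nodes = 1UL << 1;		/* node 1 only */

	/* without the PageReserved check this fails with EIO
	   because of the zero pages */
	mbind(addr, len, MPOL_BIND, &nodes, 8 * sizeof(nodes),
			MPOL_MF_STRICT);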




* Re: Race in new page migration code?
  2006-01-16 16:10                 ` Christoph Lameter
@ 2006-01-16 16:28                   ` Hugh Dickins
  2006-01-16 16:51                     ` Andi Kleen
  0 siblings, 1 reply; 28+ messages in thread
From: Hugh Dickins @ 2006-01-16 16:28 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Mon, 16 Jan 2006, Christoph Lameter wrote:
> On Mon, 16 Jan 2006, Hugh Dickins wrote:
> 
> > > It also applies to the policy compliance check.
> > 
> > Good point, I missed that: you've inadvertently changed the behaviour
> > of sys_mbind when it encounters a zero page from a disallowed node.
> > Another reason to remove your PageReserved test.
> 
> The zero page always comes from node zero on IA64. I think this is more the
> inadvertent fixing of a bug. The policy compliance check currently fails 
> if an address range contains a zero page but node zero is not contained in 
> the nodelist.

To me it sounds more like you introduced a bug than fixed one.
If MPOL_MF_STRICT and the zero page is found but not in the nodelist
demanded, then it's right to refuse, I'd say.  If Andi shares your
view that the zero pages should be ignored, I won't argue; but we
shouldn't change behaviour by mistake, without review or comment.

Hugh


* Re: Race in new page migration code?
  2006-01-16 16:28                   ` Hugh Dickins
@ 2006-01-16 16:51                     ` Andi Kleen
  2006-01-16 16:56                       ` Nick Piggin
  0 siblings, 1 reply; 28+ messages in thread
From: Andi Kleen @ 2006-01-16 16:51 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Christoph Lameter, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Monday 16 January 2006 17:28, Hugh Dickins wrote:
> On Mon, 16 Jan 2006, Christoph Lameter wrote:
> > On Mon, 16 Jan 2006, Hugh Dickins wrote:
> > 
> > > > It also applies to the policy compliance check.
> > > 
> > > Good point, I missed that: you've inadvertently changed the behaviour
> > > of sys_mbind when it encounters a zero page from a disallowed node.
> > > Another reason to remove your PageReserved test.
> > 
> > The zero page always comes from node zero on IA64. I think this is more the
> > inadvertent fixing of a bug. The policy compliance check currently fails 
> > if an address range contains a zero page but node zero is not contained in 
> > the nodelist.
> 
> To me it sounds more like you introduced a bug than fixed one.
> If MPOL_MF_STRICT and the zero page is found but not in the nodelist
> demanded, then it's right to refuse, I'd say.  If Andi shares your
> view that the zero pages should be ignored, I won't argue; but we
> shouldn't change behaviour by mistake, without review or comment.

I agree with Christoph that the zero page should be ignored - old behaviour
was really a bug.

-Andi


* Re: Race in new page migration code?
  2006-01-16 16:51                     ` Andi Kleen
@ 2006-01-16 16:56                       ` Nick Piggin
  2006-01-17  5:06                         ` Christoph Lameter
  0 siblings, 1 reply; 28+ messages in thread
From: Nick Piggin @ 2006-01-16 16:56 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Hugh Dickins, Christoph Lameter, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Mon, Jan 16, 2006 at 05:51:26PM +0100, Andi Kleen wrote:
> 
> I agree with Christoph that the zero page should be ignored - old behaviour
> was really a bug.
> 

Fair enough. It would be nice to have a comment there as Hugh said;
it is not always clear what PageReserved is intended to test for.

Nick



* Re: Race in new page migration code?
  2006-01-16 16:56                       ` Nick Piggin
@ 2006-01-17  5:06                         ` Christoph Lameter
  2006-01-17 11:16                           ` Nick Piggin
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2006-01-17  5:06 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andi Kleen, Hugh Dickins, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Mon, 16 Jan 2006, Nick Piggin wrote:

> On Mon, Jan 16, 2006 at 05:51:26PM +0100, Andi Kleen wrote:
> > 
> > I agree with Christoph that the zero page should be ignored - old behaviour
> > was really a bug.
> > 
> 
> Fair enough. It would be nice to have a comment there as Hugh said;
> it is not always clear what PageReserved is intended to test for.

Something like this? Are there still other uses of PageReserved than the 
zero page?


Explain the use of PageReserved in check_pte_range.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.15/mm/mempolicy.c
===================================================================
--- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:31.000000000 -0800
+++ linux-2.6.15/mm/mempolicy.c	2006-01-16 21:03:03.000000000 -0800
@@ -211,6 +211,17 @@ static int check_pte_range(struct vm_are
 		page = vm_normal_page(vma, addr, *pte);
 		if (!page)
 			continue;
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If the PageReserved would not be checked here then f.e.
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);



* Re: Race in new page migration code?
  2006-01-16  6:54         ` Christoph Lameter
  2006-01-16  7:44           ` Nick Piggin
@ 2006-01-17  8:29           ` Magnus Damm
  2006-01-17  9:01             ` Nick Piggin
  1 sibling, 1 reply; 28+ messages in thread
From: Magnus Damm @ 2006-01-17  8:29 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On 1/16/06, Christoph Lameter <clameter@engr.sgi.com> wrote:
> On Sun, 15 Jan 2006, Nick Piggin wrote:
>
> > OK (either way is fine), but you should still drop the __isolate_lru_page
> > nonsense and revert it like my patch does.
>
> Ok with me. Magnus: You needed the __isolate_lru_page for some other
> purpose. Is that still the case?

It made sense to have it broken out when it was used twice within
vmscan.c, but now when the patch changed a lot and the function is
used only once I guess the best thing is to inline it as Nick
suggested. I will re-add it myself later on when I need it. Thanks.

/ magnus


* Re: Race in new page migration code?
  2006-01-17  8:29           ` Magnus Damm
@ 2006-01-17  9:01             ` Nick Piggin
  2006-01-17  9:22               ` Magnus Damm
  0 siblings, 1 reply; 28+ messages in thread
From: Nick Piggin @ 2006-01-17  9:01 UTC (permalink / raw)
  To: Magnus Damm
  Cc: Christoph Lameter, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

Magnus Damm wrote:
> On 1/16/06, Christoph Lameter <clameter@engr.sgi.com> wrote:
> 
>>On Sun, 15 Jan 2006, Nick Piggin wrote:
>>
>>
>>>OK (either way is fine), but you should still drop the __isolate_lru_page
>>>nonsense and revert it like my patch does.
>>
>>Ok with me. Magnus: You needed the __isolate_lru_page for some other
>>purpose. Is that still the case?
> 
> 
> It made sense to have it broken out when it was used twice within
> vmscan.c, but now when the patch changed a lot and the function is
> used only once I guess the best thing is to inline it as Nick
> suggested. I will re-add it myself later on when I need it. Thanks.
> 
> / magnus
> 

I'm curious, what do you need it for?

-- 
SUSE Labs, Novell Inc.


* Re: Race in new page migration code?
  2006-01-17  9:01             ` Nick Piggin
@ 2006-01-17  9:22               ` Magnus Damm
  0 siblings, 0 replies; 28+ messages in thread
From: Magnus Damm @ 2006-01-17  9:22 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On 1/17/06, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Magnus Damm wrote:
> > On 1/16/06, Christoph Lameter <clameter@engr.sgi.com> wrote:
> >
> >>On Sun, 15 Jan 2006, Nick Piggin wrote:
> >>
> >>
> >>>OK (either way is fine), but you should still drop the __isolate_lru_page
> >>>nonsense and revert it like my patch does.
> >>
> >>Ok with me. Magnus: You needed the __isolate_lru_page for some other
> >>purpose. Is that still the case?
> >
> >
> > It made sense to have it broken out when it was used twice within
> > vmscan.c, but now when the patch changed a lot and the function is
> > used only once I guess the best thing is to inline it as Nick
> > suggested. I will re-add it myself later on when I need it. Thanks.
> >
> > / magnus
> >
>
> I'm curious, what do you need it for?

I used that function when I worked on a memory resource control
prototype. This prototype has been superseded by the pzone memory
resource controller posted on ckrm-tech recently.

/ magnus


* Re: Race in new page migration code?
  2006-01-17  5:06                         ` Christoph Lameter
@ 2006-01-17 11:16                           ` Nick Piggin
  0 siblings, 0 replies; 28+ messages in thread
From: Nick Piggin @ 2006-01-17 11:16 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andi Kleen, Hugh Dickins, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Mon, Jan 16, 2006 at 09:06:29PM -0800, Christoph Lameter wrote:
> On Mon, 16 Jan 2006, Nick Piggin wrote:
> 
> > On Mon, Jan 16, 2006 at 05:51:26PM +0100, Andi Kleen wrote:
> > > 
> > > I agree with Christoph that the zero page should be ignored - old behaviour
> > > was really a bug.
> > > 
> > 
> > Fair enough. It would be nice to have a comment there as Hugh said;
> > it is not always clear what PageReserved is intended to test for.
> 
> Something like this? Are there still other uses of PageReserved than the 
> zero page?
> 

Yes, something like that would be good.
There are other users of PageReserved: drivers, memory holes,
kernel text, bootmem-allocated memory (I think), ZERO_PAGE.


> 
> Explain the use of PageReserved in check_pte_range.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> Index: linux-2.6.15/mm/mempolicy.c
> ===================================================================
> --- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:31.000000000 -0800
> +++ linux-2.6.15/mm/mempolicy.c	2006-01-16 21:03:03.000000000 -0800
> @@ -211,6 +211,17 @@ static int check_pte_range(struct vm_are
>  		page = vm_normal_page(vma, addr, *pte);
>  		if (!page)
>  			continue;
> +		/*
> +		 * The check for PageReserved here is important to avoid
> +		 * handling zero pages and other pages that may have been
> +		 * marked special by the system.
> +		 *
> +		 * If the PageReserved would not be checked here then f.e.
> +		 * the location of the zero page could have an influence
> +		 * on MPOL_MF_STRICT, zero pages would be counted for
> +		 * the per node stats, and there would be useless attempts
> +		 * to put zero pages on the migration list.
> +		 */
>  		if (PageReserved(page))
>  			continue;
>  		nid = page_to_nid(page);


* Re: Race in new page migration code?
  2006-01-16 12:32           ` Hugh Dickins
  2006-01-16 15:47             ` Christoph Lameter
@ 2006-01-17 17:29             ` Christoph Lameter
  2006-01-17 18:46               ` Lee Schermerhorn
  2006-01-17 19:01               ` Hugh Dickins
  1 sibling, 2 replies; 28+ messages in thread
From: Christoph Lameter @ 2006-01-17 17:29 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Mon, 16 Jan 2006, Hugh Dickins wrote:

> Hmm, that battery of unusual tests at the start of migrate_page_add
> is odd: the tests don't quite match the comment, and it isn't clear
> what reasoning lies behind the comment anyway.

Here is patch to clarify the test. I'd be glad if someone could make
the tests more accurate. This ultimately comes down to a concept of
ownership of page by a process / mm_struct that we have to approximate.

===

Explain the complicated check in migrate_page_add by putting the logic
into a separate function migration_check. This way any enhancements can
be easily added.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.15/mm/mempolicy.c
===================================================================
--- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:28.000000000 -0800
+++ linux-2.6.15/mm/mempolicy.c	2006-01-17 09:24:20.000000000 -0800
@@ -551,6 +551,37 @@ out:
 	return rc;
 }
 
+static inline int migration_check(struct mm_struct *mm, struct page *page)
+{
+	/*
+	 * If the page has no mapping then we do not track reverse mappings.
+	 * Thus the page is not mapped by other mms, so its safe to move.
+	 */
+	if (page->mapping)
+		return 1;
+
+	/*
+	 * We cannot determine "ownership" of anonymous pages.
+	 * However, this is the primary set of pages a user would like
+	 * to move. So move the page regardless of sharing.
+	 */
+	if (PageAnon(page))
+		return 1;
+
+	/*
+	 * If the mapping is writable then its reasonable to assume that
+	 * it is okay to move the page.
+	 */
+	if (mapping_writably_mapped(page->mapping))
+		return 1;
+
+	/*
+	 * Its a read only file backed mapping. Only migrate the page
+	 * if we are the only process mapping that file.
+	 */
+	return single_mm_mapping(mm, page->mapping);
+}
+
 /*
  * Add a page to be migrated to the pagelist
  */
@@ -558,11 +589,17 @@ static void migrate_page_add(struct vm_a
 	struct page *page, struct list_head *pagelist, unsigned long flags)
 {
 	/*
-	 * Avoid migrating a page that is shared by others and not writable.
+	 * MPOL_MF_MOVE_ALL migrates all pages. However, migrating all
+	 * pages may also move commonly shared pages (like for example glibc
+	 * pages referenced by all processes). If these are included in
+	 * migration then these pages may be uselessly moved back and
+	 * forth. Migration may also affect the performance of other
+	 * processes.
+	 *
+	 * If MPOL_MF_MOVE_ALL is not set then we try to avoid migrating
+	 * these shared pages.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-	    mapping_writably_mapped(page->mapping) ||
-	    single_mm_mapping(vma->vm_mm, page->mapping))
+	if ((flags & MPOL_MF_MOVE_ALL) || migration_check(vma->vm_mm, page))
 		if (isolate_lru_page(page) == 1)
 			list_add(&page->lru, pagelist);
 }


* Re: Race in new page migration code?
  2006-01-17 17:29             ` Christoph Lameter
@ 2006-01-17 18:46               ` Lee Schermerhorn
  2006-01-17 18:48                 ` Christoph Lameter
  2006-01-17 19:01               ` Hugh Dickins
  1 sibling, 1 reply; 28+ messages in thread
From: Lee Schermerhorn @ 2006-01-17 18:46 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Hugh Dickins, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Tue, 2006-01-17 at 09:29 -0800, Christoph Lameter wrote:
> On Mon, 16 Jan 2006, Hugh Dickins wrote:
> 
> > Hmm, that battery of unusual tests at the start of migrate_page_add
> > is odd: the tests don't quite match the comment, and it isn't clear
> > what reasoning lies behind the comment anyway.
> 
> Here is patch to clarify the test. I'd be glad if someone could make
> the tests more accurate. This ultimately comes down to a concept of
> ownership of page by a process / mm_struct that we have to approximate.
> 
> ===
> 
> Explain the complicated check in migrate_page_add by putting the logic
> into a separate function migration_check. This way any enhancements can
> be easily added.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> Index: linux-2.6.15/mm/mempolicy.c
> ===================================================================
> --- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:28.000000000 -0800
> +++ linux-2.6.15/mm/mempolicy.c	2006-01-17 09:24:20.000000000 -0800
> @@ -551,6 +551,37 @@ out:
>  	return rc;
>  }
>  
> +static inline int migration_check(struct mm_struct *mm, struct page *page)
> +{
> +	/*
> +	 * If the page has no mapping then we do not track reverse mappings.
> +	 * Thus the page is not mapped by other mms, so its safe to move.
> +	 */
> +	if (page->mapping)
should this be "if (!page->mapping)" ???
> +		return 1;
> +

<snip>
> -	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
like here ......................................^^^^^^^^^^^^^^
> -	    mapping_writably_mapped(page->mapping) ||
> -	    single_mm_mapping(vma->vm_mm, page->mapping))
> +	if ((flags & MPOL_MF_MOVE_ALL) || migration_check(vma->vm_mm, page))
>  		if (isolate_lru_page(page) == 1)
>  			list_add(&page->lru, pagelist);
>  }

Lee



* Re: Race in new page migration code?
  2006-01-17 18:46               ` Lee Schermerhorn
@ 2006-01-17 18:48                 ` Christoph Lameter
  0 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2006-01-17 18:48 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Hugh Dickins, Nick Piggin, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Tue, 17 Jan 2006, Lee Schermerhorn wrote:

> > +	if (page->mapping)
> should this be "if (!page->mapping)" ???

Correct. Thanks. Fixed up patch follows:



Explain the complicated check in migrate_page_add by putting the logic
into a separate function migration_check. This way any enhancements can
be easily added.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.15/mm/mempolicy.c
===================================================================
--- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:28.000000000 -0800
+++ linux-2.6.15/mm/mempolicy.c	2006-01-17 10:47:48.000000000 -0800
@@ -551,6 +551,37 @@ out:
 	return rc;
 }
 
+static inline int migration_check(struct mm_struct *mm, struct page *page)
+{
+	/*
+	 * If the page has no mapping then we do not track reverse mappings.
+	 * Thus the page is not mapped by other mms, so its safe to move.
+	 */
+	if (!page->mapping)
+		return 1;
+
+	/*
+	 * We cannot determine "ownership" of anonymous pages.
+	 * However, this is the primary set of pages a user would like
+	 * to move. So move the page regardless of sharing.
+	 */
+	if (PageAnon(page))
+		return 1;
+
+	/*
+	 * If the mapping is writable then its reasonable to assume that
+	 * it is okay to move the page.
+	 */
+	if (mapping_writably_mapped(page->mapping))
+		return 1;
+
+	/*
+	 * Its a read only file backed mapping. Only migrate the page
+	 * if we are the only process mapping that file.
+	 */
+	return single_mm_mapping(mm, page->mapping);
+}
+
 /*
  * Add a page to be migrated to the pagelist
  */
@@ -558,11 +589,17 @@ static void migrate_page_add(struct vm_a
 	struct page *page, struct list_head *pagelist, unsigned long flags)
 {
 	/*
-	 * Avoid migrating a page that is shared by others and not writable.
+	 * MPOL_MF_MOVE_ALL migrates all pages. However, migrating all
+	 * pages may also move commonly shared pages (like for example glibc
+	 * pages referenced by all processes). If these are included in
+	 * migration then these pages may be uselessly moved back and
+	 * forth. Migration may also affect the performance of other
+	 * processes.
+	 *
+	 * If MPOL_MF_MOVE_ALL is not set then we try to avoid migrating
+	 * these shared pages.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-	    mapping_writably_mapped(page->mapping) ||
-	    single_mm_mapping(vma->vm_mm, page->mapping))
+	if ((flags & MPOL_MF_MOVE_ALL) || migration_check(vma->vm_mm, page))
 		if (isolate_lru_page(page) == 1)
 			list_add(&page->lru, pagelist);
 }


* Re: Race in new page migration code?
  2006-01-17 17:29             ` Christoph Lameter
  2006-01-17 18:46               ` Lee Schermerhorn
@ 2006-01-17 19:01               ` Hugh Dickins
  2006-01-17 20:15                 ` Christoph Lameter
  1 sibling, 1 reply; 28+ messages in thread
From: Hugh Dickins @ 2006-01-17 19:01 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Tue, 17 Jan 2006, Christoph Lameter wrote:
> On Mon, 16 Jan 2006, Hugh Dickins wrote:
> 
> > Hmm, that battery of unusual tests at the start of migrate_page_add
> > is odd: the tests don't quite match the comment, and it isn't clear
> > what reasoning lies behind the comment anyway.
> 
> Here is patch to clarify the test. I'd be glad if someone could make
> the tests more accurate. This ultimately comes down to a concept of
> ownership of page by a process / mm_struct that we have to approximate.

Endless scope for argument here!  But I'm relieved to see there's
an MPOL_MF_MOVE_ALL subject to a capability check, so this is just
dealing with what an ordinary uncapable process might be allowed to
do to itself.

> Explain the complicated check in migrate_page_add by putting the logic
> into a separate function migration_check. This way any enhancements can
> be easily added.

Yes, that's helpful to separate it out.  I'd prefer a more specific name
than migration_check, but that name may depend on what it ends up doing.

> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> Index: linux-2.6.15/mm/mempolicy.c
> ===================================================================
> --- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:28.000000000 -0800
> +++ linux-2.6.15/mm/mempolicy.c	2006-01-17 09:24:20.000000000 -0800
> @@ -551,6 +551,37 @@ out:
>  	return rc;
>  }
>  
> +static inline int migration_check(struct mm_struct *mm, struct page *page)
> +{
> +	/*
> +	 * If the page has no mapping then we do not track reverse mappings.
> +	 * Thus the page is not mapped by other mms, so its safe to move.
> +	 */
> +	if (page->mapping)
> +		return 1;

Please cut out this test.  You probably meant to say "!page->mapping",
but those are weird cases best left alone (though rarely would they
have PageLRU set, so they'll probably be skipped later anyway).  Almost
every page you'll meet in an mm has page->mapping set, doesn't it?
Either a file page in the page cache, or an anonymous page pointing to
its anon_vma.  You've already skipped the ZERO_PAGEs and anything else
with PageReserved set, and any VM_RESERVED area (covering some driver
areas).  Just cut out this test completely, it's wrong as is,
and doesn't add anything useful when inverted.

> +
> +	/*
> +	 * We cannot determine "ownership" of anonymous pages.
> +	 * However, this is the primary set of pages a user would like
> +	 * to move. So move the page regardless of sharing.
> +	 */
> +	if (PageAnon(page))
> +		return 1;

I think that's reasonable.  The page may be "shared" with some other
processes in our fork-group (anon_vma), but we probably needn't get
worked up about that.  Though you could choose to make it stricter by

	if (PageAnon(page))
		return page_mapcount(page) == 1;

> +
> +	/*
> +	 * If the mapping is writable then it's reasonable to assume that
> +	 * it is okay to move the page.
> +	 */
> +	if (mapping_writably_mapped(page->mapping))
> +		return 1;

I can't see why the fact that some other process has mapped some part
of this file for writing should have any bearing on whether we can
migrate this page.  I can see an argument (I'm unsure whether I agree
with it) that if we can have write access to this file page, then we
should be allowed to migrate it.  A test for that (given a vma arg)
would be

	if (vma->vm_flags & VM_SHARED)
		return 1;

> +
> +	/*
> +	 * It's a read-only file-backed mapping. Only migrate the page
> +	 * if we are the only process mapping that file.
> +	 */
> +	return single_mm_mapping(mm, page->mapping);

So what if someone else is mapping some other part of the file?
I just don't think it merits the complexity of single_mm_mapping's
prio_tree check.   I say delete single_mm_mapping and here just

	return page_mapcount(page) == 1;

Of course, page_mapcount may go up an instant later; but equally,
another vma may get added to the prio_tree an instant later.

It may be that, after much argument to and fro, the whole function will
just reduce to checking "page_mapcount(page) == 1": if so, then I think
you can go back to inlining it literally.
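
i.e. the add path ending up as something like (untested, just to show
the shape it might take):

	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		if (isolate_lru_page(page) == 1)
			list_add(&page->lru, pagelist);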

Hugh

> +}
> +
>  /*
>   * Add a page to be migrated to the pagelist
>   */
> @@ -558,11 +589,17 @@ static void migrate_page_add(struct vm_a
>  	struct page *page, struct list_head *pagelist, unsigned long flags)
>  {
>  	/*
> -	 * Avoid migrating a page that is shared by others and not writable.
> +	 * MPOL_MF_MOVE_ALL migrates all pages. However, migrating all
> +	 * pages may also move commonly shared pages (for example, glibc
> +	 * pages referenced by all processes). If these are included in
> +	 * migration then these pages may be uselessly moved back and
> +	 * forth. Migration may also affect the performance of other
> +	 * processes.
> +	 *
> +	 * If MPOL_MF_MOVE_ALL is not set then we try to avoid migrating
> +	 * these shared pages.
>  	 */
> -	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
> -	    mapping_writably_mapped(page->mapping) ||
> -	    single_mm_mapping(vma->vm_mm, page->mapping))
> +	if ((flags & MPOL_MF_MOVE_ALL) || migration_check(vma->vm_mm, page))
>  		if (isolate_lru_page(page) == 1)
>  			list_add(&page->lru, pagelist);
>  }

* Re: Race in new page migration code?
  2006-01-17 19:01               ` Hugh Dickins
@ 2006-01-17 20:15                 ` Christoph Lameter
  2006-01-17 20:49                   ` Hugh Dickins
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2006-01-17 20:15 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Tue, 17 Jan 2006, Hugh Dickins wrote:

> Of course, page_mapcount may go up an instant later; but equally,
> another vma may get added to the prio_tree an instant later.

Right. The check does not have to be race free.

> It may be that, after much argument to and fro, the whole function will
> just reduce to checking "page_mapcount(page) == 1": if so, then I think
> you can go back to inlining it literally.

Oh. I love simplicity.... Sadly, one sometimes has these complicated 
notions in one's brain. Thanks.


Simplify migrate_page_add after feedback from Hugh.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.15/mm/mempolicy.c
===================================================================
--- linux-2.6.15.orig/mm/mempolicy.c	2006-01-14 10:56:28.000000000 -0800
+++ linux-2.6.15/mm/mempolicy.c	2006-01-17 12:12:42.000000000 -0800
@@ -527,42 +527,13 @@ long do_get_mempolicy(int *policy, nodem
  * page migration
  */
 
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
-			struct address_space *mapping)
-{
-	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
-	int rc = 1;
-
-	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-		if (mm != vma->vm_mm) {
-			rc = 0;
-			goto out;
-		}
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-		if (mm != vma->vm_mm) {
-			rc = 0;
-			goto out;
-		}
-out:
-	spin_unlock(&mapping->i_mmap_lock);
-	return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
 static void migrate_page_add(struct vm_area_struct *vma,
 	struct page *page, struct list_head *pagelist, unsigned long flags)
 {
 	/*
-	 * Avoid migrating a page that is shared by others and not writable.
+	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-	    mapping_writably_mapped(page->mapping) ||
-	    single_mm_mapping(vma->vm_mm, page->mapping))
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 		if (isolate_lru_page(page) == 1)
 			list_add(&page->lru, pagelist);
 }


* Re: Race in new page migration code?
  2006-01-17 20:15                 ` Christoph Lameter
@ 2006-01-17 20:49                   ` Hugh Dickins
  0 siblings, 0 replies; 28+ messages in thread
From: Hugh Dickins @ 2006-01-17 20:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Tue, 17 Jan 2006, Christoph Lameter wrote:
> 
> Simplify migrate_page_add after feedback from Hugh.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>

If you're happy with that one, yes, I certainly am too.

Hugh

end of thread

Thread overview: 28+ messages
2006-01-14 15:55 Race in new page migration code? Nick Piggin
2006-01-14 18:01 ` Christoph Lameter
2006-01-14 18:19   ` Nick Piggin
2006-01-14 18:58     ` Christoph Lameter
2006-01-15  5:28       ` Nick Piggin
2006-01-16  6:54         ` Christoph Lameter
2006-01-16  7:44           ` Nick Piggin
2006-01-17  8:29           ` Magnus Damm
2006-01-17  9:01             ` Nick Piggin
2006-01-17  9:22               ` Magnus Damm
2006-01-15  6:58       ` Nick Piggin
2006-01-15 10:58       ` Hugh Dickins
2006-01-16  6:51         ` Christoph Lameter
2006-01-16 12:32           ` Hugh Dickins
2006-01-16 15:47             ` Christoph Lameter
2006-01-16 16:06               ` Hugh Dickins
2006-01-16 16:10                 ` Christoph Lameter
2006-01-16 16:28                   ` Hugh Dickins
2006-01-16 16:51                     ` Andi Kleen
2006-01-16 16:56                       ` Nick Piggin
2006-01-17  5:06                         ` Christoph Lameter
2006-01-17 11:16                           ` Nick Piggin
2006-01-17 17:29             ` Christoph Lameter
2006-01-17 18:46               ` Lee Schermerhorn
2006-01-17 18:48                 ` Christoph Lameter
2006-01-17 19:01               ` Hugh Dickins
2006-01-17 20:15                 ` Christoph Lameter
2006-01-17 20:49                   ` Hugh Dickins
