Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
From: Mel Gorman <mgorman@suse.de>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>, Rik van Riel <riel@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@kernel.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Linux-MM <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>, Mel Gorman <mgorman@suse.de>
Subject: [PATCH 07/63] mm: numa: Sanitize task_numa_fault() callsites
Date: Mon,  7 Oct 2013 11:28:45 +0100
Message-ID: <1381141781-10992-8-git-send-email-mgorman@suse.de> (raw)
In-Reply-To: <1381141781-10992-1-git-send-email-mgorman@suse.de>

There are three callers of task_numa_fault():

 - do_huge_pmd_numa_page():
     Accounts against the current node, not the node where the
     page resides, unless we migrated, in which case it accounts
     against the node we migrated to.

 - do_numa_page():
     Accounts against the current node, not the node where the
     page resides, unless we migrated, in which case it accounts
     against the node we migrated to.

 - do_pmd_numa_page():
     Accounts not at all when the page isn't migrated, otherwise
     accounts against the node we migrated towards.

This seems wrong to me; all three sites should have the same
semantics. Furthermore, we should account against where the page
really is; we already know where the task is.

So modify all three sites to always account; we did after all receive
the fault; and always account to where the page is after migration,
regardless of success.

They all still differ on when they clear the PTE/PMD; ideally that
would get sorted too.

Cc: stable <stable@vger.kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/huge_memory.c | 25 +++++++++++++------------
 mm/memory.c      | 53 +++++++++++++++++++++--------------------------------
 2 files changed, 34 insertions(+), 44 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1d6334f..c3bb65f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1281,18 +1281,19 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct anon_vma *anon_vma = NULL;
 	struct page *page;
 	unsigned long haddr = addr & HPAGE_PMD_MASK;
+	int page_nid = -1, this_nid = numa_node_id();
 	int target_nid;
-	int current_nid = -1;
-	bool migrated, page_locked;
+	bool page_locked;
+	bool migrated = false;
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(pmd, *pmdp)))
 		goto out_unlock;
 
 	page = pmd_page(pmd);
-	current_nid = page_to_nid(page);
+	page_nid = page_to_nid(page);
 	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (current_nid == numa_node_id())
+	if (page_nid == this_nid)
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
 	/*
@@ -1335,19 +1336,18 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_unlock(&mm->page_table_lock);
 	migrated = migrate_misplaced_transhuge_page(mm, vma,
 				pmdp, pmd, addr, page, target_nid);
-	if (!migrated)
+	if (migrated)
+		page_nid = target_nid;
+	else
 		goto check_same;
 
-	task_numa_fault(target_nid, HPAGE_PMD_NR, true);
-	if (anon_vma)
-		page_unlock_anon_vma_read(anon_vma);
-	return 0;
+	goto out;
 
 check_same:
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(pmd, *pmdp))) {
 		/* Someone else took our fault */
-		current_nid = -1;
+		page_nid = -1;
 		goto out_unlock;
 	}
 clear_pmdnuma:
@@ -1362,8 +1362,9 @@ out:
 	if (anon_vma)
 		page_unlock_anon_vma_read(anon_vma);
 
-	if (current_nid != -1)
-		task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+	if (page_nid != -1)
+		task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+
 	return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index ca00039..42ae82e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3519,12 +3519,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-				unsigned long addr, int current_nid)
+				unsigned long addr, int page_nid)
 {
 	get_page(page);
 
 	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (current_nid == numa_node_id())
+	if (page_nid == numa_node_id())
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
 	return mpol_misplaced(page, vma, addr);
@@ -3535,7 +3535,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
-	int current_nid = -1;
+	int page_nid = -1;
 	int target_nid;
 	bool migrated = false;
 
@@ -3565,15 +3565,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	}
 
-	current_nid = page_to_nid(page);
-	target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+	page_nid = page_to_nid(page);
+	target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 	pte_unmap_unlock(ptep, ptl);
 	if (target_nid == -1) {
-		/*
-		 * Account for the fault against the current node if it not
-		 * being replaced regardless of where the page is located.
-		 */
-		current_nid = numa_node_id();
 		put_page(page);
 		goto out;
 	}
@@ -3581,11 +3576,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* Migrate to the requested node */
 	migrated = migrate_misplaced_page(page, target_nid);
 	if (migrated)
-		current_nid = target_nid;
+		page_nid = target_nid;
 
 out:
-	if (current_nid != -1)
-		task_numa_fault(current_nid, 1, migrated);
+	if (page_nid != -1)
+		task_numa_fault(page_nid, 1, migrated);
 	return 0;
 }
 
@@ -3600,7 +3595,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long offset;
 	spinlock_t *ptl;
 	bool numa = false;
-	int local_nid = numa_node_id();
 
 	spin_lock(&mm->page_table_lock);
 	pmd = *pmdp;
@@ -3623,9 +3617,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		pte_t pteval = *pte;
 		struct page *page;
-		int curr_nid = local_nid;
+		int page_nid = -1;
 		int target_nid;
-		bool migrated;
+		bool migrated = false;
+
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3647,25 +3642,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (unlikely(page_mapcount(page) != 1))
 			continue;
 
-		/*
-		 * Note that the NUMA fault is later accounted to either
-		 * the node that is currently running or where the page is
-		 * migrated to.
-		 */
-		curr_nid = local_nid;
-		target_nid = numa_migrate_prep(page, vma, addr,
-					       page_to_nid(page));
-		if (target_nid == -1) {
+		page_nid = page_to_nid(page);
+		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+		pte_unmap_unlock(pte, ptl);
+		if (target_nid != -1) {
+			migrated = migrate_misplaced_page(page, target_nid);
+			if (migrated)
+				page_nid = target_nid;
+		} else {
 			put_page(page);
-			continue;
 		}
 
-		/* Migrate to the requested node */
-		pte_unmap_unlock(pte, ptl);
-		migrated = migrate_misplaced_page(page, target_nid);
-		if (migrated)
-			curr_nid = target_nid;
-		task_numa_fault(curr_nid, 1, migrated);
+		if (page_nid != -1)
+			task_numa_fault(page_nid, 1, migrated);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
-- 
1.8.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply index

Thread overview: 135+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-10-07 10:28 [PATCH 0/63] Basic scheduler support for automatic NUMA balancing V9 Mel Gorman
2013-10-07 10:28 ` [PATCH 01/63] hotplug: Optimize {get,put}_online_cpus() Mel Gorman
2013-10-07 10:28 ` [PATCH 02/63] mm: numa: Document automatic NUMA balancing sysctls Mel Gorman
2013-10-07 12:46   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 03/63] sched, numa: Comment fixlets Mel Gorman
2013-10-07 12:46   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 04/63] mm: numa: Do not account for a hinting fault if we raced Mel Gorman
2013-10-07 12:47   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 05/63] mm: Wait for THP migrations to complete during NUMA hinting faults Mel Gorman
2013-10-07 13:55   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 06/63] mm: Prevent parallel splits during THP migration Mel Gorman
2013-10-07 14:01   ` Rik van Riel
2013-10-07 10:28 ` Mel Gorman [this message]
2013-10-07 14:02   ` [PATCH 07/63] mm: numa: Sanitize task_numa_fault() callsites Rik van Riel
2013-10-07 10:28 ` [PATCH 08/63] mm: Close races between THP migration and PMD numa clearing Mel Gorman
2013-10-07 14:02   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 09/63] mm: Account for a THP NUMA hinting update as one PTE update Mel Gorman
2013-10-07 14:02   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 10/63] mm: Do not flush TLB during protection change if !pte_present && !migration_entry Mel Gorman
2013-10-07 15:12   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 11/63] mm: Only flush TLBs if a transhuge PMD is modified for NUMA pte scanning Mel Gorman
2013-10-07 10:28 ` [PATCH 12/63] mm: numa: Do not migrate or account for hinting faults on the zero page Mel Gorman
2013-10-07 17:10   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 13/63] sched: numa: Mitigate chance that same task always updates PTEs Mel Gorman
2013-10-07 17:24   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 14/63] sched: numa: Continue PTE scanning even if migrate rate limited Mel Gorman
2013-10-07 17:24   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 15/63] Revert "mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node" Mel Gorman
2013-10-07 17:42   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 16/63] sched: numa: Initialise numa_next_scan properly Mel Gorman
2013-10-07 17:44   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 17/63] sched: Set the scan rate proportional to the memory usage of the task being scanned Mel Gorman
2013-10-07 17:44   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 18/63] sched: numa: Slow scan rate if no NUMA hinting faults are being recorded Mel Gorman
2013-10-07 18:02   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 19/63] sched: Track NUMA hinting faults on per-node basis Mel Gorman
2013-10-07 18:02   ` Rik van Riel
2013-12-04  5:32   ` Wanpeng Li
2013-12-04  5:37     ` Wanpeng Li
2013-10-07 10:28 ` [PATCH 20/63] sched: Select a preferred node with the most numa hinting faults Mel Gorman
2013-10-07 18:04   ` Rik van Riel
2013-10-07 10:28 ` [PATCH 21/63] sched: Update NUMA hinting faults once per scan Mel Gorman
2013-10-07 18:39   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 22/63] sched: Favour moving tasks towards the preferred node Mel Gorman
2013-10-07 18:39   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 23/63] sched: Resist moving tasks towards nodes with fewer hinting faults Mel Gorman
2013-10-07 18:40   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 24/63] sched: Reschedule task on preferred NUMA node once selected Mel Gorman
2013-10-07 18:40   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 25/63] sched: Add infrastructure for split shared/private accounting of NUMA hinting faults Mel Gorman
2013-10-07 18:41   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 26/63] sched: Check current->mm before allocating NUMA faults Mel Gorman
2013-10-07 18:41   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 27/63] mm: numa: Scan pages with elevated page_mapcount Mel Gorman
2013-10-07 18:43   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 28/63] sched: Remove check that skips small VMAs Mel Gorman
2013-10-07 18:44   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 29/63] sched: Set preferred NUMA node based on number of private faults Mel Gorman
2013-10-07 18:45   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 30/63] sched: Do not migrate memory immediately after switching node Mel Gorman
2013-10-07 10:29 ` [PATCH 31/63] mm: numa: only unmap migrate-on-fault VMAs Mel Gorman
2013-10-07 10:29 ` [PATCH 32/63] sched: Avoid overloading CPUs on a preferred NUMA node Mel Gorman
2013-10-07 18:58   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 33/63] sched: Retry migration of tasks to CPU on a preferred node Mel Gorman
2013-10-07 18:58   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 34/63] sched: numa: increment numa_migrate_seq when task runs in correct location Mel Gorman
2013-10-07 10:29 ` [PATCH 35/63] sched: numa: Do not trap hinting faults for shared libraries Mel Gorman
2013-10-07 19:04   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 36/63] mm: numa: Only trap pmd hinting faults if we would otherwise trap PTE faults Mel Gorman
2013-10-07 19:06   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 37/63] stop_machine: Introduce stop_two_cpus() Mel Gorman
2013-10-07 10:29 ` [PATCH 38/63] sched: Introduce migrate_swap() Mel Gorman
2013-10-07 19:06   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 39/63] sched: numa: Use a system-wide search to find swap/migration candidates Mel Gorman
2013-10-07 19:07   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 40/63] sched: numa: Favor placing a task on the preferred node Mel Gorman
2013-10-07 19:07   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 41/63] sched: numa: fix placement of workloads spread across multiple nodes Mel Gorman
2013-10-07 10:29 ` [PATCH 42/63] mm: numa: Change page last {nid,pid} into {cpu,pid} Mel Gorman
2013-10-07 19:08   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 43/63] sched: numa: Use {cpu, pid} to create task groups for shared faults Mel Gorman
2013-10-07 19:09   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 44/63] sched: numa: Report a NUMA task group ID Mel Gorman
2013-10-07 19:09   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 45/63] mm: numa: copy cpupid on page migration Mel Gorman
2013-10-07 10:29 ` [PATCH 46/63] mm: numa: Do not group on RO pages Mel Gorman
2013-10-07 19:10   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 47/63] mm: numa: Do not batch handle PMD pages Mel Gorman
2013-10-07 19:11   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 48/63] sched: numa: stay on the same node if CLONE_VM Mel Gorman
2013-10-07 10:29 ` [PATCH 49/63] sched: numa: use group fault statistics in numa placement Mel Gorman
2013-10-07 10:29 ` [PATCH 50/63] sched: numa: call task_numa_free from do_execve Mel Gorman
2013-10-07 10:29 ` [PATCH 51/63] sched: numa: Prevent parallel updates to group stats during placement Mel Gorman
2013-10-07 19:13   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 52/63] sched: numa: add debugging Mel Gorman
2013-10-07 19:13   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 53/63] sched: numa: Decide whether to favour task or group weights based on swap candidate relationships Mel Gorman
2013-10-07 10:29 ` [PATCH 54/63] sched: numa: fix task or group comparison Mel Gorman
2013-10-07 10:29 ` [PATCH 55/63] sched: numa: Avoid migrating tasks that are placed on their preferred node Mel Gorman
2013-10-07 19:14   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 56/63] sched: numa: be more careful about joining numa groups Mel Gorman
2013-10-07 10:29 ` [PATCH 57/63] sched: numa: Take false sharing into account when adapting scan rate Mel Gorman
2013-10-07 19:14   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 58/63] sched: numa: adjust scan rate in task_numa_placement Mel Gorman
2013-10-07 10:29 ` [PATCH 59/63] sched: numa: Remove the numa_balancing_scan_period_reset sysctl Mel Gorman
2013-10-07 19:14   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 60/63] mm: numa: revert temporarily disabling of NUMA migration Mel Gorman
2013-10-07 10:29 ` [PATCH 61/63] sched: numa: skip some page migrations after a shared fault Mel Gorman
2013-10-07 10:29 ` [PATCH 62/63] sched: numa: use unsigned longs for numa group fault stats Mel Gorman
2013-10-07 19:15   ` Rik van Riel
2013-10-07 10:29 ` [PATCH 63/63] sched: numa: periodically retry task_numa_migrate Mel Gorman
2013-10-09 11:03 ` [PATCH 0/63] Basic scheduler support for automatic NUMA balancing V9 Ingo Molnar
2013-10-09 11:11   ` Ingo Molnar
2013-10-09 11:13     ` Ingo Molnar
2013-10-09 12:05   ` Peter Zijlstra
2013-10-09 12:48     ` Ingo Molnar
2013-10-10  7:05   ` Mel Gorman
2013-10-09 16:28 ` Ingo Molnar
2013-10-09 16:29   ` Ingo Molnar
2013-10-09 16:57     ` Ingo Molnar
2013-10-09 17:09       ` Ingo Molnar
2013-10-09 17:11         ` Peter Zijlstra
2013-10-09 17:08   ` Peter Zijlstra
2013-10-09 17:15     ` Ingo Molnar
2013-10-09 17:18       ` Peter Zijlstra
2013-10-24 12:26 ` Automatic NUMA balancing patches for tip-urgent/stable Mel Gorman
2013-10-26 12:11   ` Ingo Molnar
2013-10-29  9:42     ` Mel Gorman
2013-10-29  9:48       ` Ingo Molnar
2013-10-29 10:24         ` Mel Gorman
2013-10-29 10:41           ` Ingo Molnar
2013-10-29 12:48             ` Mel Gorman
2013-10-31  9:51   ` [RFC GIT PULL] NUMA-balancing memory corruption fixes Ingo Molnar
2013-10-31 22:25     ` Linus Torvalds
2013-11-01  7:36       ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1381141781-10992-8-git-send-email-mgorman@suse.de \
    --to=mgorman@suse.de \
    --cc=a.p.zijlstra@chello.nl \
    --cc=aarcange@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mingo@kernel.org \
    --cc=riel@redhat.com \
    --cc=srikar@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git