All of lore.kernel.org
 help / color / mirror / Atom feed
From: Vlastimil Babka <vbabka@suse.cz>
To: Christoph Lameter <cl@linux.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org, Li Zefan <lizefan@huawei.com>,
	Michal Hocko <mhocko@kernel.org>,
	Mel Gorman <mgorman@techsingularity.net>,
	David Rientjes <rientjes@google.com>,
	Hugh Dickins <hughd@google.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Anshuman Khandual <khandual@linux.vnet.ibm.com>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: Re: [RFC 2/6] mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask()
Date: Wed, 12 Apr 2017 10:49:37 +0200	[thread overview]
Message-ID: <97045760-77eb-c892-9bcb-daad10a1d91d@suse.cz> (raw)
In-Reply-To: <9665a022-197a-4b02-8813-66aca252f0f9@suse.cz>

On 04/11/2017 09:03 PM, Vlastimil Babka wrote:
> On 11.4.2017 19:32, Christoph Lameter wrote:
>> On Tue, 11 Apr 2017, Vlastimil Babka wrote:
>>
>>> The task->il_next variable remembers the last allocation node for task's
>>> MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and
>>> bind mempolicies due to changing cpuset mems. Currently it also tries to
>>> make sure that current->il_next is valid within the updated nodemask. This is
>>> bogus, because 1) we are updating potentially any task's mempolicy, not just
>>> current, and 2) we might be updating per-vma mempolicy, not task one.
>>>
>>> The interleave_nodes() function that uses il_next can cope fine with the value
>>> not being within the currently allowed nodes, so this hasn't manifested as an
>>> actual issue. Thus it also won't be an issue if we just remove this adjustment
>>> completely.
>>
>> Well, interleave_nodes() will then potentially return a node outside of
>> the allowed memory policy when its called for the first time after
>> mpol_rebind_.. . But thenn it will find the next node within the
>> nodemask and work correctly for the next invocations.
> 
> Hmm, you're right. But that could be easily fixed if il_next became il_prev, so
> we would return the result of next_node_in(il_prev) and also store it as the new
> il_prev, right? I somehow assumed it already worked that way.

Like this?
----8<----
commit 0ec64a0b8e614ea655328d0fb539447c407ba7c1
Author: Vlastimil Babka <vbabka@suse.cz>
Date:   Mon Apr 3 13:11:32 2017 +0200

    mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask()
    
    The task->il_next variable stores the next allocation node id for task's
    MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and
    bind mempolicies due to changing cpuset mems. Currently it also tries to
    make sure that current->il_next is valid within the updated nodemask. This is
    bogus, because 1) we are updating potentially any task's mempolicy, not just
    current, and 2) we might be updating a per-vma mempolicy, not task one.
    
    The interleave_nodes() function that uses il_next can cope fine with the value
    not being within the currently allowed nodes, so this hasn't manifested as an
    actual issue.
    
    We can remove the need for updating il_next completely by changing it to
    il_prev and store the node id of the previous interleave allocation instead of
    the next id. Then interleave_nodes() can calculate the next id using the
    current nodemask and also store it as il_prev, except when querying the next
    node via do_get_mempolicy().
    
    Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 050d7113924a..9aca0db1e588 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -886,7 +886,7 @@ struct task_struct {
 #ifdef CONFIG_NUMA
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
-	short				il_next;
+	short				il_prev;
 	short				pref_node_fork;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 37d0b334bfe9..25f9bde58521 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -349,12 +349,6 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 		pol->v.nodes = tmp;
 	else
 		BUG();
-
-	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node_in(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = numa_node_id();
-	}
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -812,9 +806,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	}
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE &&
-	    nodes_weight(new->v.nodes))
-		current->il_next = first_node(new->v.nodes);
+	if (new && new->mode == MPOL_INTERLEAVE)
+		current->il_prev = MAX_NUMNODES-1;
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -863,6 +856,18 @@ static int lookup_node(unsigned long addr)
 	return err;
 }
 
+/* Do dynamic interleaving for a process */
+static unsigned interleave_nodes(struct mempolicy *policy, bool update_prev)
+{
+	unsigned next;
+	struct task_struct *me = current;
+
+	next = next_node_in(me->il_prev, policy->v.nodes);
+	if (next < MAX_NUMNODES && update_prev)
+		me->il_prev = next;
+	return next;
+}
+
 /* Retrieve NUMA policy */
 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			     unsigned long addr, unsigned long flags)
@@ -916,7 +921,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->mode == MPOL_INTERLEAVE) {
-			*policy = current->il_next;
+			*policy = interleave_nodes(current->mempolicy, false);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -1694,19 +1699,6 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
 	return node_zonelist(nd, gfp);
 }
 
-/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
-{
-	unsigned nid, next;
-	struct task_struct *me = current;
-
-	nid = me->il_next;
-	next = next_node_in(nid, policy->v.nodes);
-	if (next < MAX_NUMNODES)
-		me->il_next = next;
-	return nid;
-}
-
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
@@ -1731,7 +1723,7 @@ unsigned int mempolicy_slab_node(void)
 		return policy->v.preferred_node;
 
 	case MPOL_INTERLEAVE:
-		return interleave_nodes(policy);
+		return interleave_nodes(policy, true);
 
 	case MPOL_BIND: {
 		struct zoneref *z;
@@ -1794,7 +1786,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
-		return interleave_nodes(pol);
+		return interleave_nodes(pol, true);
 }
 
 #ifdef CONFIG_HUGETLBFS
@@ -2060,7 +2052,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	 * nor system default_policy
 	 */
 	if (pol->mode == MPOL_INTERLEAVE)
-		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+		page = alloc_page_interleave(gfp, order,
+				interleave_nodes(pol, true));
 	else
 		page = __alloc_pages_nodemask(gfp, order,
 				policy_zonelist(gfp, pol, numa_node_id()),

WARNING: multiple messages have this Message-ID (diff)
From: Vlastimil Babka <vbabka@suse.cz>
To: Christoph Lameter <cl@linux.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org, Li Zefan <lizefan@huawei.com>,
	Michal Hocko <mhocko@kernel.org>,
	Mel Gorman <mgorman@techsingularity.net>,
	David Rientjes <rientjes@google.com>,
	Hugh Dickins <hughd@google.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Anshuman Khandual <khandual@linux.vnet.ibm.com>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: Re: [RFC 2/6] mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask()
Date: Wed, 12 Apr 2017 10:49:37 +0200	[thread overview]
Message-ID: <97045760-77eb-c892-9bcb-daad10a1d91d@suse.cz> (raw)
In-Reply-To: <9665a022-197a-4b02-8813-66aca252f0f9@suse.cz>

On 04/11/2017 09:03 PM, Vlastimil Babka wrote:
> On 11.4.2017 19:32, Christoph Lameter wrote:
>> On Tue, 11 Apr 2017, Vlastimil Babka wrote:
>>
>>> The task->il_next variable remembers the last allocation node for task's
>>> MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and
>>> bind mempolicies due to changing cpuset mems. Currently it also tries to
>>> make sure that current->il_next is valid within the updated nodemask. This is
>>> bogus, because 1) we are updating potentially any task's mempolicy, not just
>>> current, and 2) we might be updating per-vma mempolicy, not task one.
>>>
>>> The interleave_nodes() function that uses il_next can cope fine with the value
>>> not being within the currently allowed nodes, so this hasn't manifested as an
>>> actual issue. Thus it also won't be an issue if we just remove this adjustment
>>> completely.
>>
>> Well, interleave_nodes() will then potentially return a node outside of
>> the allowed memory policy when its called for the first time after
>> mpol_rebind_.. . But thenn it will find the next node within the
>> nodemask and work correctly for the next invocations.
> 
> Hmm, you're right. But that could be easily fixed if il_next became il_prev, so
> we would return the result of next_node_in(il_prev) and also store it as the new
> il_prev, right? I somehow assumed it already worked that way.

Like this?
----8<----
commit 0ec64a0b8e614ea655328d0fb539447c407ba7c1
Author: Vlastimil Babka <vbabka@suse.cz>
Date:   Mon Apr 3 13:11:32 2017 +0200

    mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask()
    
    The task->il_next variable stores the next allocation node id for task's
    MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and
    bind mempolicies due to changing cpuset mems. Currently it also tries to
    make sure that current->il_next is valid within the updated nodemask. This is
    bogus, because 1) we are updating potentially any task's mempolicy, not just
    current, and 2) we might be updating a per-vma mempolicy, not task one.
    
    The interleave_nodes() function that uses il_next can cope fine with the value
    not being within the currently allowed nodes, so this hasn't manifested as an
    actual issue.
    
    We can remove the need for updating il_next completely by changing it to
    il_prev and store the node id of the previous interleave allocation instead of
    the next id. Then interleave_nodes() can calculate the next id using the
    current nodemask and also store it as il_prev, except when querying the next
    node via do_get_mempolicy().
    
    Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 050d7113924a..9aca0db1e588 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -886,7 +886,7 @@ struct task_struct {
 #ifdef CONFIG_NUMA
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
-	short				il_next;
+	short				il_prev;
 	short				pref_node_fork;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 37d0b334bfe9..25f9bde58521 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -349,12 +349,6 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 		pol->v.nodes = tmp;
 	else
 		BUG();
-
-	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node_in(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = numa_node_id();
-	}
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -812,9 +806,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	}
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE &&
-	    nodes_weight(new->v.nodes))
-		current->il_next = first_node(new->v.nodes);
+	if (new && new->mode == MPOL_INTERLEAVE)
+		current->il_prev = MAX_NUMNODES-1;
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -863,6 +856,18 @@ static int lookup_node(unsigned long addr)
 	return err;
 }
 
+/* Do dynamic interleaving for a process */
+static unsigned interleave_nodes(struct mempolicy *policy, bool update_prev)
+{
+	unsigned next;
+	struct task_struct *me = current;
+
+	next = next_node_in(me->il_prev, policy->v.nodes);
+	if (next < MAX_NUMNODES && update_prev)
+		me->il_prev = next;
+	return next;
+}
+
 /* Retrieve NUMA policy */
 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			     unsigned long addr, unsigned long flags)
@@ -916,7 +921,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->mode == MPOL_INTERLEAVE) {
-			*policy = current->il_next;
+			*policy = interleave_nodes(current->mempolicy, false);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -1694,19 +1699,6 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
 	return node_zonelist(nd, gfp);
 }
 
-/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
-{
-	unsigned nid, next;
-	struct task_struct *me = current;
-
-	nid = me->il_next;
-	next = next_node_in(nid, policy->v.nodes);
-	if (next < MAX_NUMNODES)
-		me->il_next = next;
-	return nid;
-}
-
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
@@ -1731,7 +1723,7 @@ unsigned int mempolicy_slab_node(void)
 		return policy->v.preferred_node;
 
 	case MPOL_INTERLEAVE:
-		return interleave_nodes(policy);
+		return interleave_nodes(policy, true);
 
 	case MPOL_BIND: {
 		struct zoneref *z;
@@ -1794,7 +1786,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
-		return interleave_nodes(pol);
+		return interleave_nodes(pol, true);
 }
 
 #ifdef CONFIG_HUGETLBFS
@@ -2060,7 +2052,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	 * nor system default_policy
 	 */
 	if (pol->mode == MPOL_INTERLEAVE)
-		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+		page = alloc_page_interleave(gfp, order,
+				interleave_nodes(pol, true));
 	else
 		page = __alloc_pages_nodemask(gfp, order,
 				policy_zonelist(gfp, pol, numa_node_id()),


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2017-04-12  8:49 UTC|newest]

Thread overview: 91+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-11 14:06 [RFC 0/6] cpuset/mempolicies related fixes and cleanups Vlastimil Babka
2017-04-11 14:06 ` Vlastimil Babka
2017-04-11 14:06 ` [RFC 1/6] mm, page_alloc: fix more premature OOM due to race with cpuset update Vlastimil Babka
2017-04-11 14:06   ` Vlastimil Babka
2017-04-11 17:24   ` Christoph Lameter
2017-04-11 17:24     ` Christoph Lameter
2017-04-11 19:00     ` Vlastimil Babka
2017-04-11 19:00       ` Vlastimil Babka
2017-04-11 19:00       ` Vlastimil Babka
2017-04-12 21:25       ` Christoph Lameter
2017-04-12 21:25         ` Christoph Lameter
2017-04-13  6:24         ` Vlastimil Babka
2017-04-13  6:24           ` Vlastimil Babka
2017-04-13  6:24           ` Vlastimil Babka
2017-04-14 20:37           ` Christoph Lameter
2017-04-14 20:37             ` Christoph Lameter
2017-04-26  8:07             ` Vlastimil Babka
2017-04-26  8:07               ` Vlastimil Babka
2017-04-30 21:33               ` Christoph Lameter
2017-04-30 21:33                 ` Christoph Lameter
2017-05-17  9:20                 ` Michal Hocko
2017-05-17  9:20                   ` Michal Hocko
2017-05-17  9:20                   ` Michal Hocko
2017-05-17 13:56                   ` Christoph Lameter
2017-05-17 13:56                     ` Christoph Lameter
2017-05-17 13:56                     ` Christoph Lameter
2017-05-17 14:05                     ` Michal Hocko
2017-05-17 14:05                       ` Michal Hocko
2017-05-17 14:05                       ` Michal Hocko
2017-05-17 14:48                       ` Christoph Lameter
2017-05-17 14:48                         ` Christoph Lameter
2017-05-17 14:56                         ` Michal Hocko
2017-05-17 14:56                           ` Michal Hocko
2017-05-17 14:56                           ` Michal Hocko
2017-05-17 15:25                           ` Christoph Lameter
2017-05-17 15:25                             ` Christoph Lameter
2017-05-18  9:08                             ` Michal Hocko
2017-05-18  9:08                               ` Michal Hocko
2017-05-18  9:08                               ` Michal Hocko
2017-05-18 16:57                               ` Christoph Lameter
2017-05-18 16:57                                 ` Christoph Lameter
2017-05-18 16:57                                 ` Christoph Lameter
2017-05-18 17:24                                 ` Michal Hocko
2017-05-18 17:24                                   ` Michal Hocko
2017-05-18 17:24                                   ` Michal Hocko
2017-05-18 19:07                                   ` Christoph Lameter
2017-05-18 19:07                                     ` Christoph Lameter
2017-05-18 19:07                                     ` Christoph Lameter
2017-05-19  7:37                                     ` Michal Hocko
2017-05-19  7:37                                       ` Michal Hocko
2017-05-17 15:27                           ` Christoph Lameter
2017-05-17 15:27                             ` Christoph Lameter
2017-05-18 10:03                         ` Vlastimil Babka
2017-05-18 10:03                           ` Vlastimil Babka
2017-05-18 17:07                           ` Christoph Lameter
2017-05-18 17:07                             ` Christoph Lameter
2017-05-19 11:27                             ` Vlastimil Babka
2017-05-19 11:27                               ` Vlastimil Babka
2017-04-13  5:42   ` Anshuman Khandual
2017-04-13  5:42     ` Anshuman Khandual
2017-04-13  6:06     ` Vlastimil Babka
2017-04-13  6:06       ` Vlastimil Babka
2017-04-13  6:07       ` Vlastimil Babka
2017-04-13  6:07         ` Vlastimil Babka
2017-04-11 14:06 ` [RFC 2/6] mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask() Vlastimil Babka
2017-04-11 14:06   ` Vlastimil Babka
2017-04-11 17:32   ` Christoph Lameter
2017-04-11 17:32     ` Christoph Lameter
2017-04-11 19:03     ` Vlastimil Babka
2017-04-11 19:03       ` Vlastimil Babka
2017-04-11 19:03       ` Vlastimil Babka
2017-04-12  8:49       ` Vlastimil Babka [this message]
2017-04-12  8:49         ` Vlastimil Babka
2017-04-12 21:16         ` Christoph Lameter
2017-04-12 21:16           ` Christoph Lameter
2017-04-12 21:16           ` Christoph Lameter
2017-04-12 21:18           ` Vlastimil Babka
2017-04-12 21:18             ` Vlastimil Babka
2017-04-11 14:06 ` [RFC 3/6] mm, page_alloc: pass preferred nid instead of zonelist to allocator Vlastimil Babka
2017-04-11 14:06   ` Vlastimil Babka
2017-04-11 14:06 ` [RFC 4/6] mm, mempolicy: simplify rebinding mempolicies when updating cpusets Vlastimil Babka
2017-04-11 14:06   ` Vlastimil Babka
2017-04-11 14:06 ` [RFC 5/6] mm, cpuset: always use seqlock when changing task's nodemask Vlastimil Babka
2017-04-11 14:06   ` Vlastimil Babka
2017-04-12  8:10   ` Hillf Danton
2017-04-12  8:10     ` Hillf Danton
2017-04-12  8:18     ` Vlastimil Babka
2017-04-12  8:18       ` Vlastimil Babka
2017-04-12  8:18       ` Vlastimil Babka
2017-04-11 14:06 ` [RFC 6/6] mm, mempolicy: don't check cpuset seqlock where it doesn't matter Vlastimil Babka
2017-04-11 14:06   ` Vlastimil Babka

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=97045760-77eb-c892-9bcb-daad10a1d91d@suse.cz \
    --to=vbabka@suse.cz \
    --cc=aarcange@redhat.com \
    --cc=cgroups@vger.kernel.org \
    --cc=cl@linux.com \
    --cc=hughd@google.com \
    --cc=khandual@linux.vnet.ibm.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizefan@huawei.com \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@kernel.org \
    --cc=rientjes@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.