From: Ben Widawsky <ben.widawsky@intel.com>
To: linux-mm <linux-mm@kvack.org>
Subject: [PATCH 04/18] mm/page_alloc: add preferred pass to page allocation
Date: Fri, 19 Jun 2020 09:24:00 -0700
Message-ID: <20200619162414.1052234-5-ben.widawsky@intel.com>
In-Reply-To: <20200619162414.1052234-1-ben.widawsky@intel.com>

This patch updates the core part of page allocation (pulling from the
free list) to take preferred nodes into account first. If an allocation
from a preferred node cannot be found, the remaining nodes in the
nodemask are checked.
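
In outline, the new control flow in get_page_from_freelist() is the
following (a condensed sketch of the hunks below, not the literal code):

	nodemask_t nodes;
	bool preferred_nodes_exhausted = false;

	/* First pass: preferred nodes (intersected with any bound nodes). */
	__nodemask_for_freelist_scan(ac, true, &nodes);
retry:
	/* ... the existing zonelist walk, now restricted to 'nodes' ... */

	if (!preferred_nodes_exhausted) {
		/* Nothing found: widen to the bound nodes not yet tried. */
		__nodemask_for_freelist_scan(ac, false, &nodes);
		preferred_nodes_exhausted = true;
		goto retry;
	}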

Intentionally not handled in this patch are OOM node scanning and
reclaim scanning. I am very open to comments on whether it is worth
handling those cases with a preferred node ordering.

In this patch the code first scans the preferred nodes to make the
allocation, and then falls back to the subset of bound nodes that was
not already tried (the bound nodemask is often NULL, i.e. all nodes) -
potentially two passes. The code already made up to two passes, because
it tries not to fragment on the first pass, so an allocation can now
take up to four passes.

Consider a 3-node system (nodes 0-2) passed the following masks:
Preferred: 	100
Bound:		110

pass 1: node 2 no fragmentation
pass 2: node 1 no fragmentation
pass 3: node 2 w/fragmentation
pass 4: node 1 w/fragmentation
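
For illustration, the per-pass masks fall out of the standard nodemask
helpers the patch uses; a minimal sketch for the example above (values
shown as bitmasks):

	nodemask_t pref = nodemask_of_node(2);	/* Preferred: 100 */
	nodemask_t bound = NODE_MASK_NONE;
	nodemask_t first, fallback;

	node_set(1, bound);
	node_set(2, bound);			/* Bound: 110 */

	/* Passes 1 and 3: preferred nodes restricted to the bound set. */
	nodes_and(first, pref, bound);		/* 100 -> node 2 */

	/* Passes 2 and 4: bound nodes that were not already tried. */
	nodes_andnot(fallback, bound, pref);	/* 010 -> node 1 */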

Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
---
 mm/internal.h   |   1 +
 mm/page_alloc.c | 108 +++++++++++++++++++++++++++++++++++-------------
 2 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 9886db20d94f..8d16229c6cbb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -138,6 +138,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 struct alloc_context {
 	struct zonelist *zonelist;
 	nodemask_t *nodemask;
+	nodemask_t *prefmask;
 	struct zoneref *preferred_zoneref;
 	int migratetype;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 280ca85dc4d8..3cf44b6c31ae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3675,6 +3675,69 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
 	return alloc_flags;
 }
 
+#ifdef CONFIG_NUMA
+static void set_pref_bind_mask(nodemask_t *out, const nodemask_t *prefmask,
+			       const nodemask_t *bindmask)
+{
+	bool has_pref, has_bind;
+
+	has_pref = prefmask && !nodes_empty(*prefmask);
+	has_bind = bindmask && !nodes_empty(*bindmask);
+
+	if (has_pref && has_bind)
+		nodes_and(*out, *prefmask, *bindmask);
+	else if (has_pref && !has_bind)
+		*out = *prefmask;
+	else if (!has_pref && has_bind)
+		*out = *bindmask;
+	else if (!has_pref && !has_bind)
+		*out = NODE_MASK_ALL;
+	else
+		unreachable();
+}
+#else
+#define set_pref_bind_mask(out, pref, bind)                                    \
+	{                                                                      \
+		(out)->bits[0] = 1UL;                                          \
+	}
+#endif
+
+/* Helper to generate the preferred and fallback nodemasks */
+static void __nodemask_for_freelist_scan(const struct alloc_context *ac,
+					 bool preferred, nodemask_t *outnodes)
+{
+	bool has_pref;
+	bool has_bind;
+
+	if (preferred) {
+		set_pref_bind_mask(outnodes, ac->prefmask, ac->nodemask);
+		return;
+	}
+
+	has_pref = ac->prefmask && !nodes_empty(*ac->prefmask);
+	has_bind = ac->nodemask && !nodes_empty(*ac->nodemask);
+
+	if (!has_bind && !has_pref) {
+		/*
+		 * If no preference, we already tried the full nodemask,
+		 * so we have to bail.
+		 */
+		nodes_clear(*outnodes);
+	} else if (!has_bind && has_pref) {
+		/* We tried preferred nodes only before. Invert that. */
+		nodes_complement(*outnodes, *ac->prefmask);
+	} else if (has_bind && !has_pref) {
+		/*
+		 * If preferred was empty, we've tried all bound nodes,
+		 * and there is nothing further we can do.
+		 */
+		nodes_clear(*outnodes);
+	} else if (has_bind && has_pref) {
+		/* Try the bound nodes that weren't tried before. */
+		nodes_andnot(*outnodes, *ac->nodemask, *ac->prefmask);
+	}
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -3686,7 +3749,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 	struct zoneref *z;
 	struct zone *zone;
 	struct pglist_data *last_pgdat_dirty_limit = NULL;
-	bool no_fallback;
+	nodemask_t nodes;
+	bool no_fallback, preferred_nodes_exhausted = false;
+
+	__nodemask_for_freelist_scan(ac, true, &nodes);
 
 retry:
 	/*
@@ -3696,7 +3762,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
 	z = ac->preferred_zoneref;
 	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
-					ac->highest_zoneidx, ac->nodemask) {
+					ac->highest_zoneidx, &nodes)
+	{
 		struct page *page;
 		unsigned long mark;
 
@@ -3816,12 +3883,20 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		}
 	}
 
+	if (!preferred_nodes_exhausted) {
+		__nodemask_for_freelist_scan(ac, false, &nodes);
+		preferred_nodes_exhausted = true;
+		goto retry;
+	}
+
 	/*
 	 * It's possible on a UMA machine to get through all zones that are
 	 * fragmented. If avoiding fragmentation, reset and try again.
 	 */
 	if (no_fallback) {
 		alloc_flags &= ~ALLOC_NOFRAGMENT;
+		__nodemask_for_freelist_scan(ac, true, &nodes);
+		preferred_nodes_exhausted = false;
 		goto retry;
 	}
 
@@ -4763,33 +4838,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-#ifndef CONFIG_NUMA
-#define set_pref_bind_mask(out, pref, bind)                                    \
-	{                                                                      \
-		(out)->bits[0] = 1UL                                           \
-	}
-#else
-static void set_pref_bind_mask(nodemask_t *out, const nodemask_t *prefmask,
-			       const nodemask_t *bindmask)
-{
-	bool has_pref, has_bind;
-
-	has_pref = prefmask && !nodes_empty(*prefmask);
-	has_bind = bindmask && !nodes_empty(*bindmask);
-
-	if (has_pref && has_bind)
-		nodes_and(*out, *prefmask, *bindmask);
-	else if (has_pref && !has_bind)
-		*out = *prefmask;
-	else if (!has_pref && has_bind)
-		*out = *bindmask;
-	else if (!has_pref && !has_bind)
-		unreachable(); /* Handled above */
-	else
-		unreachable();
-}
-#endif
-
 /*
  * Find a zonelist from a preferred node. Here is a truth table example using 2
  * different masks. The choices are, NULL mask, empty mask, two masks with an
@@ -4945,6 +4993,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 				 &alloc_mask, &alloc_flags))
 		return NULL;
 
+	ac.prefmask = &prefmask;
+
 	finalise_ac(gfp_mask, &ac);
 
 	/*
-- 
2.27.0



Thread overview: 16+ messages
2020-06-19 16:23 [PATCH 00/18] multiple preferred nodes Ben Widawsky
2020-06-19 16:23 ` [PATCH 01/18] mm/mempolicy: Add comment for missing LOCAL Ben Widawsky
2020-06-19 16:23 ` [PATCH 02/18] mm/mempolicy: Use node_mem_id() instead of node_id() Ben Widawsky
2020-06-19 16:23 ` [PATCH 03/18] mm/page_alloc: start plumbing multi preferred node Ben Widawsky
2020-06-19 16:24 ` Ben Widawsky [this message]
2020-06-19 16:24 ` [PATCH 05/18] mm/mempolicy: convert single preferred_node to full nodemask Ben Widawsky
2020-06-19 16:24 ` [PATCH 06/18] mm/mempolicy: Add MPOL_PREFERRED_MANY for multiple preferred nodes Ben Widawsky
2020-06-19 16:24 ` [PATCH 07/18] mm/mempolicy: allow preferred code to take a nodemask Ben Widawsky
2020-06-19 16:24 ` [PATCH 08/18] mm/mempolicy: refactor rebind code for PREFERRED_MANY Ben Widawsky
2020-06-19 16:24 ` [PATCH 09/18] mm: Finish handling MPOL_PREFERRED_MANY Ben Widawsky
2020-06-19 16:24 ` [PATCH 10/18] mm: clean up alloc_pages_vma (thp) Ben Widawsky
2020-06-19 16:24 ` [PATCH 11/18] mm: Extract THP hugepage allocation Ben Widawsky
2020-06-19 16:24 ` [PATCH 12/18] mm/mempolicy: Use __alloc_page_node for interleaved Ben Widawsky
2020-06-19 16:24 ` [PATCH 13/18] mm: kill __alloc_pages Ben Widawsky
2020-06-19 16:25 ` [PATCH 00/18] multiple preferred nodes Ben Widawsky
