All of lore.kernel.org
 help / color / mirror / Atom feed
From: Michal Hocko <mhocko@kernel.org>
To: Pingfan Liu <kernelfans@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Rapoport <rppt@linux.vnet.ibm.com>,
	Bjorn Helgaas <bhelgaas@google.com>,
	Jonathan Cameron <Jonathan.Cameron@huawei.com>
Subject: Re: [PATCH] mm/alloc: fallback to first node if the wanted node offline
Date: Wed, 12 Dec 2018 12:53:40 +0100	[thread overview]
Message-ID: <20181212115340.GQ1286@dhcp22.suse.cz> (raw)
In-Reply-To: <CAFgQCTupPc1rKv2SrmWD+eJ0H6PRaizPBw3+AG67_PuLA2SKFw@mail.gmail.com>

On Wed 12-12-18 16:31:35, Pingfan Liu wrote:
> On Mon, Dec 10, 2018 at 8:37 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> [...]
> >
> > In other words. Does the following work? I am sorry to wildguess this
> > way but I am not able to recreate your setups to play with this myself.
> >
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f5408bf7..d51643e10d00 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
> >
> >         node_data[nid] = nd;
> >         memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > -
> > -       node_set_online(nid);
> >  }
> >
> >  /**
> > @@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
> >         }
> >  }
> >
> > +static void __init init_memory_less_node(int nid)
> > +{
> > +       unsigned long zones_size[MAX_NR_ZONES] = {0};
> > +       unsigned long zholes_size[MAX_NR_ZONES] = {0};
> > +
> > +       free_area_init_node(nid, zones_size, 0, zholes_size);
> > +
> > +       /*
> > +        * All zonelists will be built later in start_kernel() after per cpu
> > +        * areas are initialized.
> > +        */
> > +}
> > +
> >  static int __init numa_register_memblks(struct numa_meminfo *mi)
> >  {
> >         unsigned long uninitialized_var(pfn_align);
> > @@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> >                 return -EINVAL;
> >
> >         /* Finally register nodes. */
> > -       for_each_node_mask(nid, node_possible_map) {
> > +       for_each_node(nid) {
> >                 u64 start = PFN_PHYS(max_pfn);
> >                 u64 end = 0;
> >
> > @@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> >                         continue;
> >
> >                 alloc_node_data(nid);
> > +               if (!end)
> 
> Here comes the bug, since !end can not reach here.

You are right. I am dumb. I've just completely missed that. Sigh.
Anyway, I think the code is more complicated than necessary and we can
simply drop the check. I do not think we really have to worry about
the start overflowing end. So the end patch should look as follows.
Btw. I believe it is better to pull alloc_node_data out of init_memory_less_node
because a) there is no need to duplicate the call and moreover we want
to pull node_set_online as well. The code also seems cleaner this way.

Thanks for your testing and your patience with me here.

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..a5548fe668fb 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
 
 	node_data[nid] = nd;
 	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
-	node_set_online(nid);
 }
 
 /**
@@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
 	}
 }
 
+static void __init init_memory_less_node(int nid)
+{
+	unsigned long zones_size[MAX_NR_ZONES] = {0};
+	unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+	free_area_init_node(nid, zones_size, 0, zholes_size);
+
+	/*
+	 * All zonelists will be built later in start_kernel() after per cpu
+	 * areas are initialized.
+	 */
+}
+
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
 	unsigned long uninitialized_var(pfn_align);
@@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 		return -EINVAL;
 
 	/* Finally register nodes. */
-	for_each_node_mask(nid, node_possible_map) {
+	for_each_node(nid) {
 		u64 start = PFN_PHYS(max_pfn);
 		u64 end = 0;
 
@@ -581,9 +592,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 			end = max(mi->blk[i].end, end);
 		}
 
-		if (start >= end)
-			continue;
-
 		/*
 		 * Don't confuse VM with a node that doesn't have the
 		 * minimum amount of memory:
@@ -592,6 +600,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 			continue;
 
 		alloc_node_data(nid);
+		if (!end)
+			init_memory_less_node(nid);
+		else
+			node_set_online(nid);
 	}
 
 	/* Dump memblock with node info and return. */
@@ -721,21 +733,6 @@ void __init x86_numa_init(void)
 	numa_init(dummy_numa_init);
 }
 
-static void __init init_memory_less_node(int nid)
-{
-	unsigned long zones_size[MAX_NR_ZONES] = {0};
-	unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
-	/* Allocate and initialize node data. Memory-less node is now online.*/
-	alloc_node_data(nid);
-	free_area_init_node(nid, zones_size, 0, zholes_size);
-
-	/*
-	 * All zonelists will be built later in start_kernel() after per cpu
-	 * areas are initialized.
-	 */
-}
-
 /*
  * Setup early cpu_to_node.
  *
@@ -763,9 +760,6 @@ void __init init_cpu_to_node(void)
 		if (node == NUMA_NO_NODE)
 			continue;
 
-		if (!node_online(node))
-			init_memory_less_node(node);
-
 		numa_set_node(cpu, node);
 	}
 }
-- 
Michal Hocko
SUSE Labs

  reply	other threads:[~2018-12-12 11:53 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-04  3:05 [PATCH] mm/alloc: fallback to first node if the wanted node offline Pingfan Liu
2018-12-04  3:53 ` David Rientjes
2018-12-04  7:16   ` Pingfan Liu
2018-12-05  5:49     ` Pingfan Liu
2018-12-05 19:00       ` David Rientjes
2018-12-04  6:54 ` Wei Yang
2018-12-04  7:20   ` Pingfan Liu
2018-12-04  8:34     ` Wei Yang
2018-12-04  8:52       ` Pingfan Liu
2018-12-04  9:09         ` Wei Yang
2018-12-05  5:50           ` Pingfan Liu
2018-12-04  7:22 ` Michal Hocko
2018-12-04  8:20   ` Pingfan Liu
2018-12-04  8:40     ` Wei Yang
2018-12-04  8:56       ` Pingfan Liu
2018-12-04  8:56     ` Michal Hocko
2018-12-04 14:42       ` Vlastimil Babka
2018-12-05  5:38       ` Pingfan Liu
2018-12-05  9:21         ` Michal Hocko
2018-12-05  9:29           ` Pingfan Liu
2018-12-05  9:40             ` Vlastimil Babka
2018-12-06  3:07               ` Pingfan Liu
2018-12-06  8:28                 ` Michal Hocko
2018-12-06 10:03                   ` Pingfan Liu
2018-12-06 10:44                     ` Pingfan Liu
2018-12-06 12:11                       ` Michal Hocko
2018-12-07  2:56                         ` Pingfan Liu
2018-12-07  7:53                           ` Michal Hocko
2018-12-07  9:40                             ` Pingfan Liu
2018-12-07 11:30                               ` Michal Hocko
2018-12-07 11:30                                 ` Michal Hocko
2018-12-07 13:20                                 ` Pingfan Liu
2018-12-07 14:22                                   ` Michal Hocko
2018-12-07 14:27                                     ` Pingfan Liu
2018-12-07 14:50                                       ` Michal Hocko
2018-12-07 15:56                                       ` Michal Hocko
2018-12-10  4:00                                         ` Pingfan Liu
2018-12-10  7:57                                           ` Pingfan Liu
2018-12-10 12:37                                         ` Michal Hocko
2018-12-11  8:05                                           ` Pingfan Liu
2018-12-11  9:44                                             ` Michal Hocko
2018-12-12  8:33                                               ` Pingfan Liu
2018-12-12  8:31                                           ` Pingfan Liu
2018-12-12 11:53                                             ` Michal Hocko [this message]
2018-12-13  8:37                                               ` Pingfan Liu
2018-12-13  9:04                                                 ` Pingfan Liu
2018-12-17 13:29                                                   ` Michal Hocko
2018-12-20  7:19                                                     ` Pingfan Liu
2018-12-20  9:19                                                       ` Michal Hocko
2019-01-08 14:34                                                         ` Michal Hocko
2019-01-09  3:13                                                           ` Pingfan Liu
2019-01-09  3:13                                                             ` Pingfan Liu
2019-01-11  3:12                                                           ` Pingfan Liu
2019-01-11  3:12                                                             ` Pingfan Liu
2019-01-11  9:23                                                             ` Michal Hocko
2018-12-17 12:57                                                 ` Michal Hocko
2018-12-05  9:43             ` Michal Hocko
2018-12-06  3:34               ` Pingfan Liu
2018-12-06  7:23                 ` Michal Hocko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181212115340.GQ1286@dhcp22.suse.cz \
    --to=mhocko@kernel.org \
    --cc=Jonathan.Cameron@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=bhelgaas@google.com \
    --cc=kernelfans@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=rppt@linux.vnet.ibm.com \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.