All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mike Rapoport <rppt@linux.ibm.com>
To: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>,
	Vlastimil Babka <vbabka@suse.cz>, Mel Gorman <mgorman@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, Qian Cai <cai@lca.pw>,
	Michal Hocko <mhocko@kernel.org>,
	linux-kernel@vger.kernel.org, Baoquan He <bhe@redhat.com>
Subject: Re: [PATCH 1/1] mm: compaction: avoid fast_isolate_around() to set pageblock_skip on reserved pages
Date: Thu, 26 Nov 2020 19:46:01 +0200	[thread overview]
Message-ID: <20201126174601.GT123287@linux.ibm.com> (raw)
In-Reply-To: <3bb709a7-6100-aa5c-4125-7ed80c6d9643@redhat.com>

On Thu, Nov 26, 2020 at 11:05:14AM +0100, David Hildenbrand wrote:
> On 26.11.20 10:36, Mike Rapoport wrote:
> > On Wed, Nov 25, 2020 at 04:38:16PM -0500, Andrea Arcangeli wrote:
> > 
> >> At best memory.reserved should be calculated automatically by the
> >> page_alloc.c based on the zone_start_pfn/zone_end_pfn and not passed
> >> by the e820 caller, instead of adding the memory_reserve call for type
> >> 20 we should delete the memory_reserve function.
> >  
> > memory.reserved cannot be calculated automatically. It represents all
> > the memory allocations made before page allocator is up. And as
> > memblock_reserve() is the most basic to allocate memory early at boot we
> > cannot really delete it ;-)
> > 
> > As for e820 and type 20, unless it is in memblock, page_alloc.c has no
> > way to properly initialize memmap for it. It can continue to guess, like
> > it does with init_unavailable_memory().
> 
> AFAIS, init_unavailable_mem() has to logic to setup *any* such struct
> page properly up - except the node/zid, because that's unknown at that
> time. It is documented that (big comment above init_unavailable_mem())

// The comment is a bit inaccurate, but that's another story :)

> which pages won't get initialized again. They *are* initialized, just
> the node/zone is "fake" - that's the real problem we seem to have.

Let's try to merge init_unavailable_memory() into memmap_init().
Than it'll be able to set zone/nid for those nasty pfns that BIOS
decided to keep to itself, like in Andrea's case and will also take care
of struct pages that do not really have a frame in DRAM, but are there
because of arbitrary section size.

Something like this:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eaa227a479e4..072e94042a11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6185,24 +6185,84 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	}
 }
 
-void __meminit __weak memmap_init(unsigned long size, int nid,
-				  unsigned long zone,
-				  unsigned long range_start_pfn)
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Only struct pages that are backed by physical memory available to the
+ * kernel are zeroed and initialized by memmap_init_zone().
+ * But, there are some struct pages that are either reserved by firmware or
+ * do not correspond to physical page frames becuase actual memory bank is
+ * not a multiple of SECTION_SIZE. Fields of those struct pages may be
+ * accessed (for example page_to_pfn() on some configuration accesses
+ * flags) so we must explicitly initialize those struct pages.
+ */
+static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn,
+					 int zone, int node)
 {
-	unsigned long start_pfn, end_pfn;
+	unsigned long pfn;
+	u64 pgcnt = 0;
+
+	for (pfn = spfn; pfn < epfn; pfn++) {
+		if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+			pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+				+ pageblock_nr_pages - 1;
+			continue;
+		}
+		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
+		__SetPageReserved(pfn_to_page(pfn));
+		pgcnt++;
+	}
+
+	return pgcnt;
+}
+#else
+static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
+					 int zone, int node)
+{
+	return 0;
+}
+#endif
+
+void __init __weak memmap_init(unsigned long size, int nid,
+			       unsigned long zone,
+			       unsigned long range_start_pfn)
+{
+	unsigned long start_pfn, end_pfn, next_pfn = 0;
 	unsigned long range_end_pfn = range_start_pfn + size;
+	u64 pgcnt = 0;
 	int i;
 
 	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
 		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
 		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+		next_pfn = clamp(next_pfn, range_start_pfn, range_end_pfn);
 
 		if (end_pfn > start_pfn) {
 			size = end_pfn - start_pfn;
 			memmap_init_zone(size, nid, zone, start_pfn,
 					 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 		}
+
+		if (next_pfn < start_pfn)
+			pgcnt += init_unavailable_range(next_pfn, start_pfn,
+							zone, nid);
+		next_pfn = end_pfn;
 	}
+
+	/*
+	 * Early sections always have a fully populated memmap for the whole
+	 * section - see pfn_valid(). If the last section has holes at the
+	 * end and that section is marked "online", the memmap will be
+	 * considered initialized. Make sure that memmap has a well defined
+	 * state.
+	 */
+	if (next_pfn < range_end_pfn)
+		pgcnt += init_unavailable_range(next_pfn, range_end_pfn,
+						zone, nid);
+
+	if (pgcnt)
+		pr_info("%s: Zeroed struct page in unavailable ranges: %lld\n",
+			zone_names[zone], pgcnt);
+
 }
 
 static int zone_batchsize(struct zone *zone)
@@ -6995,88 +7055,6 @@ void __init free_area_init_memoryless_node(int nid)
 	free_area_init_node(nid);
 }
 
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
-/*
- * Initialize all valid struct pages in the range [spfn, epfn) and mark them
- * PageReserved(). Return the number of struct pages that were initialized.
- */
-static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
-{
-	unsigned long pfn;
-	u64 pgcnt = 0;
-
-	for (pfn = spfn; pfn < epfn; pfn++) {
-		if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
-			pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
-				+ pageblock_nr_pages - 1;
-			continue;
-		}
-		/*
-		 * Use a fake node/zone (0) for now. Some of these pages
-		 * (in memblock.reserved but not in memblock.memory) will
-		 * get re-initialized via reserve_bootmem_region() later.
-		 */
-		__init_single_page(pfn_to_page(pfn), pfn, 0, 0);
-		__SetPageReserved(pfn_to_page(pfn));
-		pgcnt++;
-	}
-
-	return pgcnt;
-}
-
-/*
- * Only struct pages that are backed by physical memory are zeroed and
- * initialized by going through __init_single_page(). But, there are some
- * struct pages which are reserved in memblock allocator and their fields
- * may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly initialize those struct pages.
- *
- * This function also addresses a similar issue where struct pages are left
- * uninitialized because the physical address range is not covered by
- * memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=, or when the highest physical
- * address (max_pfn) does not end on a section boundary.
- */
-static void __init init_unavailable_mem(void)
-{
-	phys_addr_t start, end;
-	u64 i, pgcnt;
-	phys_addr_t next = 0;
-
-	/*
-	 * Loop through unavailable ranges not covered by memblock.memory.
-	 */
-	pgcnt = 0;
-	for_each_mem_range(i, &start, &end) {
-		if (next < start)
-			pgcnt += init_unavailable_range(PFN_DOWN(next),
-							PFN_UP(start));
-		next = end;
-	}
-
-	/*
-	 * Early sections always have a fully populated memmap for the whole
-	 * section - see pfn_valid(). If the last section has holes at the
-	 * end and that section is marked "online", the memmap will be
-	 * considered initialized. Make sure that memmap has a well defined
-	 * state.
-	 */
-	pgcnt += init_unavailable_range(PFN_DOWN(next),
-					round_up(max_pfn, PAGES_PER_SECTION));
-
-	/*
-	 * Struct pages that do not have backing memory. This could be because
-	 * firmware is using some of this memory, or for some other reasons.
-	 */
-	if (pgcnt)
-		pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
-}
-#else
-static inline void __init init_unavailable_mem(void)
-{
-}
-#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-
 #if MAX_NUMNODES > 1
 /*
  * Figure out the number of possible node ids.
@@ -7500,7 +7478,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
 	setup_nr_node_ids();
-	init_unavailable_mem();
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 		free_area_init_node(nid);


> I agree that this is sub-optimal, as such pages are impossible to detect
> (PageReserved is just not clear as discussed with Andrea). The basic
> question is how we want to proceed:
> 
> a) Make sure any online struct page has a valid nid/zid, and is spanned
> by the nid/zid.
> b) Use a fake nid that will bail out when used for page_zone() and
> page_pgdat(), and make pfn walkers detect that.
> 
> AFAIU, Andrea seems to prefer a). I thing b) might be easier in corner
> cases. Thoughts?

I'd also prefer (a). 

The hardware defines what physical addresses correspond to which node,
so for any populated DIMM (or soldered DRAM for that matter) we can
detect page <-> node relationship.

As for the struct pages that just "hang" in the end of a section (your
example with 4000M), the addresses of these pages still obey the same
rules, so again, we have page <-> node correspondence.

The zones are software construct but they also correspond to some
hardware defined restrictions - each zone has a maximal PFN that HW
allows. Here again we can always know which zone spans that or another
page.

We'd have to fix a couple of things to get there, though :)

> ---
> 
> The tricky thing for b) is to fix all call places that do a
> page_zone()/page_pgdat() without checking if they are allowed to do so.
> We would have to audit all callers of page_zone() / page_zone_id() /
> page_pgdat() ...
> 
> 
> E.g., using a pattern like
> mm/memory_hotplug.c:find_biggest_section_pfn() is fine
> 
> 
> if (unlikely(!pfn_to_online_page(pfn)))
> 	continue;
> if (unlikely(pfn_to_nid(pfn) != nid))
> 	continue;
> 
> 
> e.g., mm/page_alloc.c:__pageblock_pfn_to_page() would have to be adjusted
> 
> start_page = pfn_to_online_page(start_pfn);
> if (!start_page)
> 	return NULL;
> 
> // necessary to avoid ...
> if (page_to_nid(page) == NODE_INVALID)
> 	return NULL;
> 
> // page_zone() from bailing out
> if (page_zone(start_page) != zone)
> 	return NULL;
> 
> -- 
> Thanks,
> 
> David / dhildenb
> 

-- 
Sincerely yours,
Mike.

  reply	other threads:[~2020-11-26 17:46 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-04-23 21:25 compaction: VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn)) Qian Cai
2020-04-24  3:43 ` Baoquan He
2020-04-24 13:45   ` Qian Cai
2020-05-05 12:43     ` Baoquan He
2020-05-05 13:20       ` Qian Cai
2020-05-11  1:21         ` Baoquan He
2020-04-26 14:41 ` Mike Rapoport
2020-04-27 13:45   ` Qian Cai
2020-11-21 19:45 ` [PATCH 0/1] VM_BUG_ON_PAGE(!zone_spans_pfn) in set_pfnblock_flags_mask Andrea Arcangeli
2020-11-21 19:45   ` [PATCH 1/1] mm: compaction: avoid fast_isolate_around() to set pageblock_skip on reserved pages Andrea Arcangeli
2020-11-21 19:53     ` Andrea Arcangeli
2020-11-23 11:26       ` David Hildenbrand
2020-11-23 13:01     ` Vlastimil Babka
2020-11-24 13:32       ` Mel Gorman
2020-11-24 20:56         ` Andrea Arcangeli
2020-11-25 10:30           ` Mel Gorman
2020-11-25 17:59             ` Andrea Arcangeli
2020-11-26 10:47               ` Mel Gorman
2020-12-06  2:26                 ` Andrea Arcangeli
2020-12-06 23:47                   ` Mel Gorman
2020-11-25  5:34       ` Andrea Arcangeli
2020-11-25  6:45         ` David Hildenbrand
2020-11-25  8:51           ` Mike Rapoport
2020-11-25 10:39           ` Mel Gorman
2020-11-25 11:04             ` David Hildenbrand
2020-11-25 11:41               ` David Hildenbrand
2020-11-25 18:47                 ` Andrea Arcangeli
2020-11-25 13:33               ` Mel Gorman
2020-11-25 13:41                 ` David Hildenbrand
2020-11-25 18:28           ` Andrea Arcangeli
2020-11-25 19:27             ` David Hildenbrand
2020-11-25 20:41               ` Andrea Arcangeli
2020-11-25 21:13                 ` David Hildenbrand
2020-11-25 21:04               ` Mike Rapoport
2020-11-25 21:38                 ` Andrea Arcangeli
2020-11-26  9:36                   ` Mike Rapoport
2020-11-26 10:05                     ` David Hildenbrand
2020-11-26 17:46                       ` Mike Rapoport [this message]
2020-11-29 12:32                         ` Mike Rapoport
2020-12-02  0:44                           ` Andrea Arcangeli
2020-12-02 17:39                             ` Mike Rapoport
2020-12-03  6:23                               ` Andrea Arcangeli
2020-12-03 10:51                                 ` Mike Rapoport
2020-12-03 17:31                                   ` Andrea Arcangeli
2020-12-06  8:09                                     ` Mike Rapoport
2020-11-26 18:15                       ` Andrea Arcangeli
2020-11-26 18:29                     ` Andrea Arcangeli
2020-11-26 19:44                       ` Mike Rapoport
2020-11-26 20:30                         ` Andrea Arcangeli
2020-11-26 21:03                           ` Mike Rapoport
2020-11-26 19:21                     ` Andrea Arcangeli
2020-11-25 12:08         ` Vlastimil Babka
2020-11-25 13:32           ` David Hildenbrand
2020-11-25 14:13             ` Mike Rapoport
2020-11-25 14:42               ` David Hildenbrand
2020-11-26 10:51                 ` Mel Gorman
2020-11-25 19:14               ` Andrea Arcangeli
2020-11-25 19:01           ` Andrea Arcangeli
2020-11-25 19:33             ` David Hildenbrand
2020-11-26  3:40         ` Andrea Arcangeli
2020-11-26 10:43           ` Mike Rapoport

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201126174601.GT123287@linux.ibm.com \
    --to=rppt@linux.ibm.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=bhe@redhat.com \
    --cc=cai@lca.pw \
    --cc=david@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mhocko@kernel.org \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.