* [PATCH v1] mm: optimize memory hotplug
@ 2018-01-31  5:42 Pavel Tatashin
  2018-01-31  8:43 ` Michal Hocko
  2018-02-02  6:05 ` kbuild test robot
  0 siblings, 2 replies; 4+ messages in thread
From: Pavel Tatashin @ 2018-01-31  5:42 UTC (permalink / raw)
  To: steven.sistare, daniel.m.jordan, akpm, mgorman, mhocko, linux-mm,
	linux-kernel, gregkh, vbabka, bharata

This patch was inspired by the discussion of this problem:
http://lkml.kernel.org/r/20180130083006.GB1245@in.ibm.com

Currently, during memory hotplugging we traverse struct pages several
times:

1. memset(0) in sparse_add_one_section()
2. loop in __add_section() to do: set_page_node(page, nid); and
   SetPageReserved(page);
3. loop in pages_correctly_reserved() to check that PageReserved is set.
4. loop in memmap_init_zone() to call __init_single_pfn()

This patch removes loops 1, 2, and 3 and leaves only loop 4, where all
struct page fields are initialized in one go, the same as is already done
during boot.
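
For illustration, after this patch the whole hot-added range is touched
in a single pass of the following shape (a simplified sketch of the
memmap_init_zone() hunk below, not literal patch code; zone, nid, and
context are memmap_init_zone() arguments):

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_page(pfn);

		/* zero and initialize every struct page field in one go */
		__init_single_page(page, pfn, zone, nid);

		/* hot-added pages stay reserved until they are onlined */
		if (context == MEMMAP_HOTPLUG)
			SetPageReserved(page);
	}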

The benefits:
- We improve memory hotplug performance because we no longer evict the
  cache several times, and we reduce loop branching overhead.

- Remove the condition from the hot path in __init_single_pfn() that was
  added to fix the problem reported by Bharata in the email thread above,
  thus also improving performance during normal boot.

- Make memory hotplug more similar to the boot-time memory initialization
  path, because struct pages are now zeroed and initialized in a single
  function.

- Simplify the memory hotplug struct page initialization code, thus
  enabling future improvements, such as multi-threading the initialization
  of struct pages to improve hotplug performance even further on larger
  machines.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
---
 drivers/base/memory.c          | 36 ++++++++++++++++++++----------------
 include/linux/memory_hotplug.h |  2 ++
 mm/memory_hotplug.c            | 21 ++-------------------
 mm/page_alloc.c                | 28 ++++++++++------------------
 mm/sparse.c                    | 28 +++++++++++++++++++++++++---
 5 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index fe4b24f05f6a..deb3f029b451 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -187,13 +187,14 @@ int memory_isolate_notify(unsigned long val, void *v)
 }
 
 /*
- * The probe routines leave the pages reserved, just as the bootmem code does.
- * Make sure they're still that way.
+ * The probe routines leave the pages uninitialized, just as the bootmem code
+ * does. Make sure we do not access them, but instead use only information from
+ * within sections.
  */
-static bool pages_correctly_reserved(unsigned long start_pfn)
+static bool pages_correctly_probed(unsigned long start_pfn)
 {
-	int i, j;
-	struct page *page;
+	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	unsigned long section_nr_end = section_nr + sections_per_block;
 	unsigned long pfn = start_pfn;
 
 	/*
@@ -201,21 +202,24 @@ static bool pages_correctly_reserved(unsigned long start_pfn)
 	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
 	 * and assume memmap is contiguous within each section
 	 */
-	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
+	for (; section_nr < section_nr_end; section_nr++) {
 		if (WARN_ON_ONCE(!pfn_valid(pfn)))
 			return false;
-		page = pfn_to_page(pfn);
-
-		for (j = 0; j < PAGES_PER_SECTION; j++) {
-			if (PageReserved(page + j))
-				continue;
-
-			printk(KERN_WARNING "section number %ld page number %d "
-				"not reserved, was it already online?\n",
-				pfn_to_section_nr(pfn), j);
 
+		if (!present_section_nr(section_nr)) {
+			pr_warn("section %ld pfn[%lx, %lx) not present\n",
+				section_nr, pfn, pfn + PAGES_PER_SECTION);
+			return false;
+		} else if (!valid_section_nr(section_nr)) {
+			pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
+				section_nr, pfn, pfn + PAGES_PER_SECTION);
+			return false;
+		} else if (online_section_nr(section_nr)) {
+			pr_warn("section %ld pfn[%lx, %lx) is already online\n",
+				section_nr, pfn, pfn + PAGES_PER_SECTION);
 			return false;
 		}
+		pfn += PAGES_PER_SECTION;
 	}
 
 	return true;
@@ -237,7 +241,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
 
 	switch (action) {
 	case MEM_ONLINE:
-		if (!pages_correctly_reserved(start_pfn))
+		if (!pages_correctly_probed(start_pfn))
 			return -EBUSY;
 
 		ret = online_pages(start_pfn, nr_pages, online_type);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 787411bdeadb..13acf181fb4b 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -234,6 +234,8 @@ void put_online_mems(void);
 void mem_hotplug_begin(void);
 void mem_hotplug_done(void);
 
+int get_section_nid(unsigned long section_nr);
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 #define pfn_to_online_page(pfn)			\
 ({						\
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a9bee33ffa7..e7d11a14b1d1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -250,7 +250,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		struct vmem_altmap *altmap, bool want_memblock)
 {
 	int ret;
-	int i;
 
 	if (pfn_valid(phys_start_pfn))
 		return -EEXIST;
@@ -259,23 +258,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 	if (ret < 0)
 		return ret;
 
-	/*
-	 * Make all the pages reserved so that nobody will stumble over half
-	 * initialized state.
-	 * FIXME: We also have to associate it with a node because page_to_nid
-	 * relies on having page with the proper node.
-	 */
-	for (i = 0; i < PAGES_PER_SECTION; i++) {
-		unsigned long pfn = phys_start_pfn + i;
-		struct page *page;
-		if (!pfn_valid(pfn))
-			continue;
-
-		page = pfn_to_page(pfn);
-		set_page_node(page, nid);
-		SetPageReserved(page);
-	}
-
 	if (!want_memblock)
 		return 0;
 
@@ -913,7 +895,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	int ret;
 	struct memory_notify arg;
 
-	nid = pfn_to_nid(pfn);
+	nid = get_section_nid(pfn_to_section_nr(pfn));
+
 	/* associate pfn range with the zone */
 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2b42f603b1a..b26991867393 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1178,10 +1178,9 @@ static void free_one_page(struct zone *zone,
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-				unsigned long zone, int nid, bool zero)
+				unsigned long zone, int nid)
 {
-	if (zero)
-		mm_zero_struct_page(page);
+	mm_zero_struct_page(page);
 	set_page_links(page, zone, nid, pfn);
 	init_page_count(page);
 	page_mapcount_reset(page);
@@ -1195,12 +1194,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 #endif
 }
 
-static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
-					int nid, bool zero)
-{
-	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
-}
-
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __meminit init_reserved_page(unsigned long pfn)
 {
@@ -1219,7 +1212,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
 		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
 			break;
 	}
-	__init_single_pfn(pfn, zid, nid, true);
+	__init_single_page(pfn_to_page(pfn), pfn, zone, nid);
 }
 #else
 static inline void init_reserved_page(unsigned long pfn)
@@ -1536,7 +1529,7 @@ static unsigned long  __init deferred_init_pages(int nid, int zid,
 		} else {
 			page++;
 		}
-		__init_single_page(page, pfn, zid, nid, true);
+		__init_single_page(page, pfn, zid, nid);
 		nr_pages++;
 	}
 	return (nr_pages);
@@ -5329,6 +5322,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long pfn;
 	unsigned long nr_initialised = 0;
+	struct page *page;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	struct memblock_region *r = NULL, *tmp;
 #endif
@@ -5390,6 +5384,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 #endif
 
 not_early:
+		page = pfn_to_page(pfn);
+		__init_single_page(page, pfn, zone, nid);
+		if (context == MEMMAP_HOTPLUG)
+			SetPageReserved(page);
+
 		/*
 		 * Mark the block movable so that blocks are reserved for
 		 * movable at startup. This will force kernel allocations
@@ -5406,15 +5405,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		 * because this is done early in sparse_add_one_section
 		 */
 		if (!(pfn & (pageblock_nr_pages - 1))) {
-			struct page *page = pfn_to_page(pfn);
-
-			__init_single_page(page, pfn, zone, nid,
-					context != MEMMAP_HOTPLUG);
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 			cond_resched();
-		} else {
-			__init_single_pfn(pfn, zone, nid,
-					context != MEMMAP_HOTPLUG);
 		}
 	}
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 7af5e7a92528..11ed21fb52b0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -30,11 +30,14 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
 #endif
 EXPORT_SYMBOL(mem_section);
 
-#ifdef NODE_NOT_IN_PAGE_FLAGS
+#if defined(NODE_NOT_IN_PAGE_FLAGS) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
  * If we did not store the node number in the page then we have to
  * do a lookup in the section_to_node_table in order to find which
  * node the page belongs to.
+ *
+ * We also use this data in case memory hotplugging is enabled to be
+ * able to determine nid while struct pages are not yet initialized.
  */
 #if MAX_NUMNODES <= 256
 static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
@@ -42,17 +45,28 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
 static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
 #endif
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
 int page_to_nid(const struct page *page)
 {
 	return section_to_node_table[page_to_section(page)];
 }
 EXPORT_SYMBOL(page_to_nid);
+#endif /* NODE_NOT_IN_PAGE_FLAGS */
 
 static void set_section_nid(unsigned long section_nr, int nid)
 {
 	section_to_node_table[section_nr] = nid;
 }
-#else /* !NODE_NOT_IN_PAGE_FLAGS */
+
+/* Return NID for given section number */
+int get_section_nid(unsigned long section_nr)
+{
+	if (WARN_ON(section_nr >= NR_MEM_SECTIONS))
+		return 0;
+	return section_to_node_table[section_nr];
+}
+EXPORT_SYMBOL(get_section_nid);
+#else /* ! (NODE_NOT_IN_PAGE_FLAGS || CONFIG_MEMORY_HOTPLUG) */
 static inline void set_section_nid(unsigned long section_nr, int nid)
 {
 }
@@ -816,7 +830,13 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
 		goto out;
 	}
 
-	memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
+#ifdef CONFIG_DEBUG_VM_PGFLAGS
+	/*
+	 * Poison uninitialized struct pages in order to catch invalid flag
+	 * combinations.
+	 */
+	memset(memmap, -1, sizeof(struct page) * PAGES_PER_SECTION);
+#endif
 
 	section_mark_present(ms);
 
@@ -827,6 +847,8 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
 	if (ret <= 0) {
 		kfree(usemap);
 		__kfree_section_memmap(memmap, altmap);
+	} else {
+		set_section_nid(section_nr, pgdat->node_id);
 	}
 	return ret;
 }
-- 
2.16.1


* Re: [PATCH v1] mm: optimize memory hotplug
  2018-01-31  5:42 [PATCH v1] mm: optimize memory hotplug Pavel Tatashin
@ 2018-01-31  8:43 ` Michal Hocko
  2018-01-31 18:38   ` Pavel Tatashin
  2018-02-02  6:05 ` kbuild test robot
  1 sibling, 1 reply; 4+ messages in thread
From: Michal Hocko @ 2018-01-31  8:43 UTC (permalink / raw)
  To: Pavel Tatashin
  Cc: steven.sistare, daniel.m.jordan, akpm, mgorman, linux-mm,
	linux-kernel, gregkh, vbabka, bharata

On Wed 31-01-18 00:42:43, Pavel Tatashin wrote:
> This patch was inspired by the discussion of this problem:
> http://lkml.kernel.org/r/20180130083006.GB1245@in.ibm.com
> 
> Currently, during memory hotplugging we traverse struct pages several
> times:
> 
> 1. memset(0) in sparse_add_one_section()
> 2. loop in __add_section() to do: set_page_node(page, nid); and
>    SetPageReserved(page);
> 3. loop in pages_correctly_reserved() to check that PageReserved is set.
> 4. loop in memmap_init_zone() to call __init_single_pfn()
> 
> This patch removes loops 1, 2, and 3 and leaves only loop 4, where all
> struct page fields are initialized in one go, the same as is already done
> during boot.

So how do we check that there is no page_to_nid() user before we online
the page? I remember I was fighting strange bugs when reworking this
code. I have forgotten all the details, of course; I just remember some
nasty and subtle code paths. Maybe we have gotten rid of those in the
past year, but this should be done really carefully. We might have
similar dependencies on PageReserved.

That being said, it would be great if we could simplify this. I think
that 3) can be removed right away. It is pure paranoia.
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH v1] mm: optimize memory hotplug
  2018-01-31  8:43 ` Michal Hocko
@ 2018-01-31 18:38   ` Pavel Tatashin
  0 siblings, 0 replies; 4+ messages in thread
From: Pavel Tatashin @ 2018-01-31 18:38 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Steve Sistare, Daniel Jordan, Andrew Morton, Mel Gorman,
	Linux Memory Management List, Linux Kernel Mailing List,
	Greg Kroah-Hartman, Vlastimil Babka, Bharata B Rao

Hi Michal,

> So how do we check that there is no page_to_nid() user before we online
> the page?

The poisoning helps to catch these now, and will in the future. Because
we set every "struct page" to all ones, we get a nid that is larger than
the supported maximum, and thus panic due to a NULL pointer dereference
or some other reason.

For example, if in online_pages() I change get_section_nid() back to
pfn_to_nid(), I get a panic like this:

[   45.473228] BUG: KASAN: null-ptr-deref in zone_for_pfn_range+0xce/0x240
[   45.475273] Read of size 8 at addr 0000000000000068 by task bash/144
[   45.477240]
[   45.477744] CPU: 0 PID: 144 Comm: bash Not tainted 4.15.0-next-20180130_pt_memset #11
[   45.479947] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
[   45.482053] Call Trace:
[   45.482589]  dump_stack+0xa6/0x109
[   45.483304]  ? _atomic_dec_and_lock+0x137/0x137
[   45.484248]  ? zone_for_pfn_range+0xce/0x240
[   45.485140]  kasan_report+0x208/0x350
[   45.485916]  zone_for_pfn_range+0xce/0x240
[   45.486787]  online_pages+0xf0/0x4a0
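
To sketch the failure mode (illustrative fragment, not code from the
patch): with the CONFIG_DEBUG_VM_PGFLAGS poisoning in place, every field
of a not-yet-initialized struct page reads as all ones, so a premature
user fails loudly instead of silently reading stale data:

	/* sparse_add_one_section(): poison the new section's memmap */
	memset(memmap, -1, sizeof(struct page) * PAGES_PER_SECTION);

	/* premature user, before memmap_init_zone() has run: */
	nid = pfn_to_nid(pfn);		/* all-ones flags -> bogus nid */
	pgdat = NODE_DATA(nid);		/* out-of-range index -> NULL/garbage */
	zone = &pgdat->node_zones[0];	/* dereference -> the KASAN splat above */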

> I remember I was fighting strange bugs when reworking this
> code. I have forgotten all the details, of course; I just remember some
> nasty and subtle code paths. Maybe we have gotten rid of those in the
> past year, but this should be done really carefully. We might have
> similar dependencies on PageReserved.

I am adding a new PG_POISON_CHECK() to help with both the Page* macros
and page_to_nid(). A new patch is coming.
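
Roughly, the idea is the following (a sketch of the intent only; the
helper lands in the upcoming patch, so the names and details below are
hypothetical):

	/* hypothetical: detect the all-ones poison pattern */
	static inline int PagePoisoned(const struct page *page)
	{
		return page->flags == (unsigned long)-1;
	}

	/* hypothetical: assert before trusting any struct page field */
	#define PG_POISON_CHECK(page) VM_BUG_ON_PAGE(PagePoisoned(page), page)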

Thank you,
Pavel


* Re: [PATCH v1] mm: optimize memory hotplug
  2018-01-31  5:42 [PATCH v1] mm: optimize memory hotplug Pavel Tatashin
  2018-01-31  8:43 ` Michal Hocko
@ 2018-02-02  6:05 ` kbuild test robot
  1 sibling, 0 replies; 4+ messages in thread
From: kbuild test robot @ 2018-02-02  6:05 UTC (permalink / raw)
  To: Pavel Tatashin
  Cc: kbuild-all, steven.sistare, daniel.m.jordan, akpm, mgorman,
	mhocko, linux-mm, linux-kernel, gregkh, vbabka, bharata

[-- Attachment #1: Type: text/plain, Size: 1998 bytes --]

Hi Pavel,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on mmotm/master]
[also build test ERROR on next-20180201]
[cannot apply to v4.15]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Pavel-Tatashin/mm-optimize-memory-hotplug/20180202-125437
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-x017-201804 (attached as .config)
compiler: gcc-7 (Debian 7.2.0-12) 7.2.1 20171025
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   mm/page_alloc.c: In function 'init_reserved_page':
>> mm/page_alloc.c:1215:44: error: 'zone' undeclared (first use in this function)
     __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
                                               ^~~~
   mm/page_alloc.c:1215:44: note: each undeclared identifier is reported only once for each function it appears in

vim +/zone +1215 mm/page_alloc.c

  1196	
  1197	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
  1198	static void __meminit init_reserved_page(unsigned long pfn)
  1199	{
  1200		pg_data_t *pgdat;
  1201		int nid, zid;
  1202	
  1203		if (!early_page_uninitialised(pfn))
  1204			return;
  1205	
  1206		nid = early_pfn_to_nid(pfn);
  1207		pgdat = NODE_DATA(nid);
  1208	
  1209		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  1210			struct zone *zone = &pgdat->node_zones[zid];
  1211	
  1212			if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
  1213				break;
  1214		}
> 1215		__init_single_page(pfn_to_page(pfn), pfn, zone, nid);
  1216	}
  1217	#else
  1218	static inline void init_reserved_page(unsigned long pfn)
  1219	{
  1220	}
  1221	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
  1222	
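
Note that the loop above leaves only the zone index zid in scope once it
terminates; a minimal fix (a sketch, not a fix actually posted in this
thread) would be to pass zid rather than the undeclared zone:

	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);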

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 34861 bytes --]

