linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 4.14 21/95] mm: fix inactive list balancing between NUMA nodes and cgroups
       [not found] <20190507053826.31622-1-sashal@kernel.org>
@ 2019-05-07  5:37 ` Sasha Levin
  2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section Sasha Levin
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 21+ messages in thread
From: Sasha Levin @ 2019-05-07  5:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Johannes Weiner, Shakeel Butt, Roman Gushchin, Michal Hocko,
	Andrew Morton, Linus Torvalds, Sasha Levin, linux-mm

From: Johannes Weiner <hannes@cmpxchg.org>

[ Upstream commit 3b991208b897f52507168374033771a984b947b1 ]

During !CONFIG_CGROUP reclaim, we expand the inactive list size if it's
thrashing on the node that is about to be reclaimed.  But when cgroups
are enabled, we suddenly ignore the node scope and use the cgroup scope
only.  The result is that pressure bleeds between NUMA nodes depending
on whether cgroups are merely compiled into Linux.  This behavioral
difference is unexpected and undesirable.

When the refault adaptivity of the inactive list was first introduced,
there were no statistics at the lruvec level - the intersection of node
and memcg - so it was better than nothing.

But now that we have that infrastructure, use lruvec_page_state() to
make the list balancing decision always NUMA aware.

[hannes@cmpxchg.org: fix bisection hole]
  Link: http://lkml.kernel.org/r/20190417155241.GB23013@cmpxchg.org
Link: http://lkml.kernel.org/r/20190412144438.2645-1-hannes@cmpxchg.org
Fixes: 2a2e48854d70 ("mm: vmscan: fix IO/refault regression in cache workingset transition")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 mm/vmscan.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9734e62654fa..144961f6f89c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2111,7 +2111,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-				 struct mem_cgroup *memcg,
 				 struct scan_control *sc, bool actual_reclaim)
 {
 	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
@@ -2132,16 +2131,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
 	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-	if (memcg)
-		refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
-	else
-		refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
 	/*
 	 * When refaults are being observed, it means a new workingset
 	 * is being established. Disable active list protection to get
 	 * rid of the stale workingset quickly.
 	 */
+	refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
 	if (file && actual_reclaim && lruvec->refaults != refaults) {
 		inactive_ratio = 0;
 	} else {
@@ -2162,12 +2157,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-				 struct lruvec *lruvec, struct mem_cgroup *memcg,
-				 struct scan_control *sc)
+				 struct lruvec *lruvec, struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, is_file_lru(lru),
-					 memcg, sc, true))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
@@ -2267,7 +2260,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			 * anonymous pages on the LRU in eligible zones.
 			 * Otherwise, the small LRU gets thrashed.
 			 */
-			if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+			if (!inactive_list_is_low(lruvec, false, sc, false) &&
 			    lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
 					>> sc->priority) {
 				scan_balance = SCAN_ANON;
@@ -2285,7 +2278,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * lruvec even if it has plenty of old anonymous pages unless the
 	 * system is under heavy pressure.
 	 */
-	if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+	if (!inactive_list_is_low(lruvec, true, sc, false) &&
 	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
@@ -2438,7 +2431,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 				nr[lru] -= nr_to_scan;
 
 				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, memcg, sc);
+							    lruvec, sc);
 			}
 		}
 
@@ -2505,7 +2498,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+	if (inactive_list_is_low(lruvec, false, sc, true))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 }
@@ -2830,12 +2823,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
 		unsigned long refaults;
 		struct lruvec *lruvec;
 
-		if (memcg)
-			refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
-		else
-			refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
 		lruvec = mem_cgroup_lruvec(pgdat, memcg);
+		refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
 		lruvec->refaults = refaults;
 	} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
 }
@@ -3183,7 +3172,7 @@ static void age_active_anon(struct pglist_data *pgdat,
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-		if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+		if (inactive_list_is_low(lruvec, false, sc, true))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
       [not found] <20190507053826.31622-1-sashal@kernel.org>
  2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 21/95] mm: fix inactive list balancing between NUMA nodes and cgroups Sasha Levin
@ 2019-05-07  5:37 ` Sasha Levin
  2019-05-07 16:31   ` Alexander Duyck
  2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 71/95] Revert "mm, memory_hotplug: initialize struct pages for the full memory section" Sasha Levin
  2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 86/95] mm/memory.c: fix modifying of page protection by insert_pfn() Sasha Levin
  3 siblings, 1 reply; 21+ messages in thread
From: Sasha Levin @ 2019-05-07  5:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Mikhail Zaslonko, Gerald Schaefer, Michal Hocko, Michal Hocko,
	Mikhail Gavrilov, Dave Hansen, Alexander Duyck, Pasha Tatashin,
	Martin Schwidefsky, Heiko Carstens, Andrew Morton,
	Linus Torvalds, Sasha Levin, linux-mm

From: Mikhail Zaslonko <zaslonko@linux.ibm.com>

[ Upstream commit 2830bf6f05fb3e05bc4743274b806c821807a684 ]

If memory end is not aligned with the sparse memory section boundary,
the mapping of such a section is only partly initialized.  This may lead
to VM_BUG_ON due to uninitialized struct page access from
is_mem_section_removable() or test_pages_in_a_zone() function triggered
by memory_hotplug sysfs handlers:

Here are the panic examples:
 CONFIG_DEBUG_VM=y
 CONFIG_DEBUG_VM_PGFLAGS=y

 kernel parameter mem=2050M
 --------------------------
 page:000003d082008000 is uninitialized and poisoned
 page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
 Call Trace:
 ( test_pages_in_a_zone+0xde/0x160)
   show_valid_zones+0x5c/0x190
   dev_attr_show+0x34/0x70
   sysfs_kf_seq_show+0xc8/0x148
   seq_read+0x204/0x480
   __vfs_read+0x32/0x178
   vfs_read+0x82/0x138
   ksys_read+0x5a/0xb0
   system_call+0xdc/0x2d8
 Last Breaking-Event-Address:
   test_pages_in_a_zone+0xde/0x160
 Kernel panic - not syncing: Fatal exception: panic_on_oops

 kernel parameter mem=3075M
 --------------------------
 page:000003d08300c000 is uninitialized and poisoned
 page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
 Call Trace:
 ( is_mem_section_removable+0xb4/0x190)
   show_mem_removable+0x9a/0xd8
   dev_attr_show+0x34/0x70
   sysfs_kf_seq_show+0xc8/0x148
   seq_read+0x204/0x480
   __vfs_read+0x32/0x178
   vfs_read+0x82/0x138
   ksys_read+0x5a/0xb0
   system_call+0xdc/0x2d8
 Last Breaking-Event-Address:
   is_mem_section_removable+0xb4/0x190
 Kernel panic - not syncing: Fatal exception: panic_on_oops

Fix the problem by initializing the last memory section of each zone in
memmap_init_zone() till the very end, even if it goes beyond the zone end.

Michal said:

: This has always been a problem AFAIU.  It just went unnoticed because we
: have zeroed memmaps during allocation before f7f99100d8d9 ("mm: stop
: zeroing memory during allocation in vmemmap") and so the above test
: would simply skip these ranges as belonging to zone 0 or provide
: garbage.
:
: So I guess we do care for post f7f99100d8d9 kernels mostly and
: therefore Fixes: f7f99100d8d9 ("mm: stop zeroing memory during
: allocation in vmemmap")

Link: http://lkml.kernel.org/r/20181212172712.34019-2-zaslonko@linux.ibm.com
Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Suggested-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
---
 mm/page_alloc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 923deb33bf34..16c20d9e771f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5348,6 +5348,18 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			__init_single_pfn(pfn, zone, nid);
 		}
 	}
+#ifdef CONFIG_SPARSEMEM
+	/*
+	 * If the zone does not span the rest of the section then
+	 * we should at least initialize those pages. Otherwise we
+	 * could blow up on a poisoned page in some paths which depend
+	 * on full sections being initialized (e.g. memory hotplug).
+	 */
+	while (end_pfn % PAGES_PER_SECTION) {
+		__init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
+		end_pfn++;
+	}
+#endif
 }
 
 static void __meminit zone_init_free_lists(struct zone *zone)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH AUTOSEL 4.14 71/95] Revert "mm, memory_hotplug: initialize struct pages for the full memory section"
       [not found] <20190507053826.31622-1-sashal@kernel.org>
  2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 21/95] mm: fix inactive list balancing between NUMA nodes and cgroups Sasha Levin
  2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section Sasha Levin
@ 2019-05-07  5:38 ` Sasha Levin
  2019-05-07 17:25   ` Alexander Duyck
  2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 86/95] mm/memory.c: fix modifying of page protection by insert_pfn() Sasha Levin
  3 siblings, 1 reply; 21+ messages in thread
From: Sasha Levin @ 2019-05-07  5:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Michal Hocko, Robert Shteynfeld, stable, Linus Torvalds,
	Sasha Levin, linux-mm

From: Michal Hocko <mhocko@suse.com>

[ Upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a ]

This reverts commit 2830bf6f05fb3e05bc4743274b806c821807a684.

The underlying assumption that one sparse section belongs to a single
NUMA node doesn't really hold. Robert Shteynfeld has reported a boot
failure. The boot log was not captured but his memory layout is as
follows:

  Early memory node ranges
    node   1: [mem 0x0000000000001000-0x0000000000090fff]
    node   1: [mem 0x0000000000100000-0x00000000dbdf8fff]
    node   1: [mem 0x0000000100000000-0x0000001423ffffff]
    node   0: [mem 0x0000001424000000-0x0000002023ffffff]

This means that node0 starts in the middle of a memory section which is
also in node1.  memmap_init_zone tries to initialize padding of a
section even when it is outside of the given pfn range because there are
code paths (e.g.  memory hotplug) which assume that the full worth of
memory section is always initialized.

In this particular case, though, such a range is already initialized and
most likely already managed by the page allocator.  Scribbling over
those pages corrupts the internal state and likely blows up when any of
those pages gets used.

Reported-by: Robert Shteynfeld <robert.shteynfeld@gmail.com>
Fixes: 2830bf6f05fb ("mm, memory_hotplug: initialize struct pages for the full memory section")
Cc: stable@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
---
 mm/page_alloc.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 16c20d9e771f..923deb33bf34 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5348,18 +5348,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			__init_single_pfn(pfn, zone, nid);
 		}
 	}
-#ifdef CONFIG_SPARSEMEM
-	/*
-	 * If the zone does not span the rest of the section then
-	 * we should at least initialize those pages. Otherwise we
-	 * could blow up on a poisoned page in some paths which depend
-	 * on full sections being initialized (e.g. memory hotplug).
-	 */
-	while (end_pfn % PAGES_PER_SECTION) {
-		__init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
-		end_pfn++;
-	}
-#endif
 }
 
 static void __meminit zone_init_free_lists(struct zone *zone)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH AUTOSEL 4.14 86/95] mm/memory.c: fix modifying of page protection by insert_pfn()
       [not found] <20190507053826.31622-1-sashal@kernel.org>
                   ` (2 preceding siblings ...)
  2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 71/95] Revert "mm, memory_hotplug: initialize struct pages for the full memory section" Sasha Levin
@ 2019-05-07  5:38 ` Sasha Levin
  3 siblings, 0 replies; 21+ messages in thread
From: Sasha Levin @ 2019-05-07  5:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Jan Kara, Aneesh Kumar K.V, Dan Williams, Chandan Rajendra,
	Andrew Morton, Linus Torvalds, Sasha Levin, linux-mm

From: Jan Kara <jack@suse.cz>

[ Upstream commit cae85cb8add35f678cf487139d05e083ce2f570a ]

Aneesh has reported that PPC triggers the following warning when
exercising DAX code:

  IP set_pte_at+0x3c/0x190
  LR insert_pfn+0x208/0x280
  Call Trace:
     insert_pfn+0x68/0x280
     dax_iomap_pte_fault.isra.7+0x734/0xa40
     __xfs_filemap_fault+0x280/0x2d0
     do_wp_page+0x48c/0xa40
     __handle_mm_fault+0x8d0/0x1fd0
     handle_mm_fault+0x140/0x250
     __do_page_fault+0x300/0xd60
     handle_page_fault+0x18

Now that is WARN_ON in set_pte_at which is

        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

The problem is that on some architectures set_pte_at() cannot cope with
a situation where there is already some (different) valid entry present.

Use ptep_set_access_flags() instead to modify the PTE, since it is built to
deal with modifying an existing PTE.

Link: http://lkml.kernel.org/r/20190311084537.16029-1-jack@suse.cz
Fixes: b2770da64254 "mm: add vm_insert_mixed_mkwrite()"
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Chandan Rajendra <chandan@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
---
 mm/memory.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index f99b64ca1303..e9bce27bc18c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1813,10 +1813,12 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
 				goto out_unlock;
 			}
-			entry = *pte;
-			goto out_mkwrite;
-		} else
-			goto out_unlock;
+			entry = pte_mkyoung(*pte);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+				update_mmu_cache(vma, addr, pte);
+		}
+		goto out_unlock;
 	}
 
 	/* Ok, finally just insert the thing.. */
@@ -1825,7 +1827,6 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	else
 		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
 
-out_mkwrite:
 	if (mkwrite) {
 		entry = pte_mkyoung(entry);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section Sasha Levin
@ 2019-05-07 16:31   ` Alexander Duyck
  2019-05-07 16:50     ` Linus Torvalds
  2019-05-07 16:58     ` Sasha Levin
  0 siblings, 2 replies; 21+ messages in thread
From: Alexander Duyck @ 2019-05-07 16:31 UTC (permalink / raw)
  To: Sasha Levin
  Cc: LKML, stable, Mikhail Zaslonko, Gerald Schaefer, Michal Hocko,
	Michal Hocko, Mikhail Gavrilov, Dave Hansen, Alexander Duyck,
	Pasha Tatashin, Martin Schwidefsky, Heiko Carstens,
	Andrew Morton, Linus Torvalds, Sasha Levin, linux-mm

On Mon, May 6, 2019 at 10:40 PM Sasha Levin <sashal@kernel.org> wrote:
>
> From: Mikhail Zaslonko <zaslonko@linux.ibm.com>
>
> [ Upstream commit 2830bf6f05fb3e05bc4743274b806c821807a684 ]
>
> If memory end is not aligned with the sparse memory section boundary,
> the mapping of such a section is only partly initialized.  This may lead
> to VM_BUG_ON due to uninitialized struct page access from
> is_mem_section_removable() or test_pages_in_a_zone() function triggered
> by memory_hotplug sysfs handlers:
>
> Here are the panic examples:
>  CONFIG_DEBUG_VM=y
>  CONFIG_DEBUG_VM_PGFLAGS=y
>
>  kernel parameter mem=2050M
>  --------------------------
>  page:000003d082008000 is uninitialized and poisoned
>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
>  Call Trace:
>  ( test_pages_in_a_zone+0xde/0x160)
>    show_valid_zones+0x5c/0x190
>    dev_attr_show+0x34/0x70
>    sysfs_kf_seq_show+0xc8/0x148
>    seq_read+0x204/0x480
>    __vfs_read+0x32/0x178
>    vfs_read+0x82/0x138
>    ksys_read+0x5a/0xb0
>    system_call+0xdc/0x2d8
>  Last Breaking-Event-Address:
>    test_pages_in_a_zone+0xde/0x160
>  Kernel panic - not syncing: Fatal exception: panic_on_oops
>
>  kernel parameter mem=3075M
>  --------------------------
>  page:000003d08300c000 is uninitialized and poisoned
>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
>  Call Trace:
>  ( is_mem_section_removable+0xb4/0x190)
>    show_mem_removable+0x9a/0xd8
>    dev_attr_show+0x34/0x70
>    sysfs_kf_seq_show+0xc8/0x148
>    seq_read+0x204/0x480
>    __vfs_read+0x32/0x178
>    vfs_read+0x82/0x138
>    ksys_read+0x5a/0xb0
>    system_call+0xdc/0x2d8
>  Last Breaking-Event-Address:
>    is_mem_section_removable+0xb4/0x190
>  Kernel panic - not syncing: Fatal exception: panic_on_oops
>
> Fix the problem by initializing the last memory section of each zone in
> memmap_init_zone() till the very end, even if it goes beyond the zone end.
>
> Michal said:
>
> : This has always been a problem AFAIU.  It just went unnoticed because we
> : have zeroed memmaps during allocation before f7f99100d8d9 ("mm: stop
> : zeroing memory during allocation in vmemmap") and so the above test
> : would simply skip these ranges as belonging to zone 0 or provided a
> : garbage.
> :
> : So I guess we do care for post f7f99100d8d9 kernels mostly and
> : therefore Fixes: f7f99100d8d9 ("mm: stop zeroing memory during
> : allocation in vmemmap")
>
> Link: http://lkml.kernel.org/r/20181212172712.34019-2-zaslonko@linux.ibm.com
> Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
> Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
> Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
> Suggested-by: Michal Hocko <mhocko@kernel.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
> Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
> ---
>  mm/page_alloc.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)

Wasn't this patch reverted in Linus's tree for causing a regression on
some platforms? If so I'm not sure we should pull this in as a
candidate for stable should we, or am I missing something?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 16:31   ` Alexander Duyck
@ 2019-05-07 16:50     ` Linus Torvalds
  2019-05-07 17:02       ` Sasha Levin
  2019-05-07 16:58     ` Sasha Levin
  1 sibling, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2019-05-07 16:50 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Sasha Levin, LKML, stable, Mikhail Zaslonko, Gerald Schaefer,
	Michal Hocko, Michal Hocko, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 7, 2019 at 9:31 AM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> Wasn't this patch reverted in Linus's tree for causing a regression on
> some platforms? If so I'm not sure we should pull this in as a
> candidate for stable should we, or am I missing something?

Good catch. It was reverted in commit 4aa9fc2a435a ("Revert "mm,
memory_hotplug: initialize struct pages for the full memory
section"").

We ended up with efad4e475c31 ("mm, memory_hotplug:
is_mem_section_removable do not pass the end of a zone") instead (and
possibly others - this was just from looking for commit messages that
mentioned that reverted commit).

              Linus


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 16:31   ` Alexander Duyck
  2019-05-07 16:50     ` Linus Torvalds
@ 2019-05-07 16:58     ` Sasha Levin
  1 sibling, 0 replies; 21+ messages in thread
From: Sasha Levin @ 2019-05-07 16:58 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: LKML, stable, Mikhail Zaslonko, Gerald Schaefer, Michal Hocko,
	Michal Hocko, Mikhail Gavrilov, Dave Hansen, Alexander Duyck,
	Pasha Tatashin, Martin Schwidefsky, Heiko Carstens,
	Andrew Morton, Linus Torvalds, Sasha Levin, linux-mm

On Tue, May 07, 2019 at 09:31:10AM -0700, Alexander Duyck wrote:
>On Mon, May 6, 2019 at 10:40 PM Sasha Levin <sashal@kernel.org> wrote:
>>
>> From: Mikhail Zaslonko <zaslonko@linux.ibm.com>
>>
>> [ Upstream commit 2830bf6f05fb3e05bc4743274b806c821807a684 ]
>>
>> If memory end is not aligned with the sparse memory section boundary,
>> the mapping of such a section is only partly initialized.  This may lead
>> to VM_BUG_ON due to uninitialized struct page access from
>> is_mem_section_removable() or test_pages_in_a_zone() function triggered
>> by memory_hotplug sysfs handlers:
>>
>> Here are the panic examples:
>>  CONFIG_DEBUG_VM=y
>>  CONFIG_DEBUG_VM_PGFLAGS=y
>>
>>  kernel parameter mem=2050M
>>  --------------------------
>>  page:000003d082008000 is uninitialized and poisoned
>>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
>>  Call Trace:
>>  ( test_pages_in_a_zone+0xde/0x160)
>>    show_valid_zones+0x5c/0x190
>>    dev_attr_show+0x34/0x70
>>    sysfs_kf_seq_show+0xc8/0x148
>>    seq_read+0x204/0x480
>>    __vfs_read+0x32/0x178
>>    vfs_read+0x82/0x138
>>    ksys_read+0x5a/0xb0
>>    system_call+0xdc/0x2d8
>>  Last Breaking-Event-Address:
>>    test_pages_in_a_zone+0xde/0x160
>>  Kernel panic - not syncing: Fatal exception: panic_on_oops
>>
>>  kernel parameter mem=3075M
>>  --------------------------
>>  page:000003d08300c000 is uninitialized and poisoned
>>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
>>  Call Trace:
>>  ( is_mem_section_removable+0xb4/0x190)
>>    show_mem_removable+0x9a/0xd8
>>    dev_attr_show+0x34/0x70
>>    sysfs_kf_seq_show+0xc8/0x148
>>    seq_read+0x204/0x480
>>    __vfs_read+0x32/0x178
>>    vfs_read+0x82/0x138
>>    ksys_read+0x5a/0xb0
>>    system_call+0xdc/0x2d8
>>  Last Breaking-Event-Address:
>>    is_mem_section_removable+0xb4/0x190
>>  Kernel panic - not syncing: Fatal exception: panic_on_oops
>>
>> Fix the problem by initializing the last memory section of each zone in
>> memmap_init_zone() till the very end, even if it goes beyond the zone end.
>>
>> Michal said:
>>
>> : This has always been a problem AFAIU.  It just went unnoticed because we
>> : have zeroed memmaps during allocation before f7f99100d8d9 ("mm: stop
>> : zeroing memory during allocation in vmemmap") and so the above test
>> : would simply skip these ranges as belonging to zone 0 or provided a
>> : garbage.
>> :
>> : So I guess we do care for post f7f99100d8d9 kernels mostly and
>> : therefore Fixes: f7f99100d8d9 ("mm: stop zeroing memory during
>> : allocation in vmemmap")
>>
>> Link: http://lkml.kernel.org/r/20181212172712.34019-2-zaslonko@linux.ibm.com
>> Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
>> Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
>> Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
>> Suggested-by: Michal Hocko <mhocko@kernel.org>
>> Acked-by: Michal Hocko <mhocko@suse.com>
>> Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
>> Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
>> Cc: Dave Hansen <dave.hansen@intel.com>
>> Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>> Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
>> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
>> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
>> Cc: <stable@vger.kernel.org>
>> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
>> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
>> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
>> ---
>>  mm/page_alloc.c | 12 ++++++++++++
>>  1 file changed, 12 insertions(+)
>
>Wasn't this patch reverted in Linus's tree for causing a regression on
>some platforms? If so I'm not sure we should pull this in as a
>candidate for stable should we, or am I missing something?

I saw a follow-up patch that should be queued too, but I didn't see that
this one got reverted.

--
Thanks,
Sasha


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 16:50     ` Linus Torvalds
@ 2019-05-07 17:02       ` Sasha Levin
  2019-05-07 17:13         ` Gerald Schaefer
  2019-05-07 17:15         ` Linus Torvalds
  0 siblings, 2 replies; 21+ messages in thread
From: Sasha Levin @ 2019-05-07 17:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexander Duyck, LKML, stable, Mikhail Zaslonko, Gerald Schaefer,
	Michal Hocko, Michal Hocko, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 07, 2019 at 09:50:50AM -0700, Linus Torvalds wrote:
>On Tue, May 7, 2019 at 9:31 AM Alexander Duyck
><alexander.duyck@gmail.com> wrote:
>>
>> Wasn't this patch reverted in Linus's tree for causing a regression on
>> some platforms? If so I'm not sure we should pull this in as a
>> candidate for stable should we, or am I missing something?
>
>Good catch. It was reverted in commit 4aa9fc2a435a ("Revert "mm,
>memory_hotplug: initialize struct pages for the full memory
>section"").
>
>We ended up with efad4e475c31 ("mm, memory_hotplug:
>is_mem_section_removable do not pass the end of a zone") instead (and
>possibly others - this was just from looking for commit messages that
>mentioned that reverted commit).

I got it wrong then. I'll fix it up and get efad4e475c31 in instead.
Thanks!

--
Thanks,
Sasha


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:02       ` Sasha Levin
@ 2019-05-07 17:13         ` Gerald Schaefer
  2019-05-07 17:15         ` Linus Torvalds
  1 sibling, 0 replies; 21+ messages in thread
From: Gerald Schaefer @ 2019-05-07 17:13 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Linus Torvalds, Alexander Duyck, LKML, stable, Mikhail Zaslonko,
	Michal Hocko, Michal Hocko, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, 7 May 2019 13:02:08 -0400
Sasha Levin <sashal@kernel.org> wrote:

> On Tue, May 07, 2019 at 09:50:50AM -0700, Linus Torvalds wrote:
> >On Tue, May 7, 2019 at 9:31 AM Alexander Duyck
> ><alexander.duyck@gmail.com> wrote:
> >>
> >> Wasn't this patch reverted in Linus's tree for causing a regression on
> >> some platforms? If so I'm not sure we should pull this in as a
> >> candidate for stable should we, or am I missing something?
> >
> >Good catch. It was reverted in commit 4aa9fc2a435a ("Revert "mm,
> >memory_hotplug: initialize struct pages for the full memory
> >section"").
> >
> >We ended up with efad4e475c31 ("mm, memory_hotplug:
> >is_mem_section_removable do not pass the end of a zone") instead (and
> >possibly others - this was just from looking for commit messages that
> >mentioned that reverted commit).
> 
> I got it wrong then. I'll fix it up and get efad4e475c31 in instead.

There were two commits replacing the reverted commit, fixing
is_mem_section_removable() and test_pages_in_a_zone() respectively:

commit 24feb47c5fa5 ("mm, memory_hotplug: test_pages_in_a_zone do not
pass the end of zone")
commit efad4e475c31 ("mm, memory_hotplug: is_mem_section_removable do
not pass the end of a zone")


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:02       ` Sasha Levin
  2019-05-07 17:13         ` Gerald Schaefer
@ 2019-05-07 17:15         ` Linus Torvalds
  2019-05-07 17:18           ` Sasha Levin
  2019-05-07 17:31           ` Michal Hocko
  1 sibling, 2 replies; 21+ messages in thread
From: Linus Torvalds @ 2019-05-07 17:15 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Alexander Duyck, LKML, stable, Mikhail Zaslonko, Gerald Schaefer,
	Michal Hocko, Michal Hocko, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 7, 2019 at 10:02 AM Sasha Levin <sashal@kernel.org> wrote:
>
> I got it wrong then. I'll fix it up and get efad4e475c31 in instead.

Careful. That one had a bug too, and we have 891cb2a72d82 ("mm,
memory_hotplug: fix off-by-one in is_pageblock_removable").

All of these were *horribly* and subtly buggy, and might be
intertwined with other issues. And only trigger on a few specific
machines where the memory map layout is just right to trigger some
special case or other, and you have just the right config.

It might be best to verify with Michal Hocko. Michal?

              Linus


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:15         ` Linus Torvalds
@ 2019-05-07 17:18           ` Sasha Levin
  2019-05-07 17:32             ` Michal Hocko
  2019-05-08 11:04             ` Gerald Schaefer
  2019-05-07 17:31           ` Michal Hocko
  1 sibling, 2 replies; 21+ messages in thread
From: Sasha Levin @ 2019-05-07 17:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexander Duyck, LKML, stable, Mikhail Zaslonko, Gerald Schaefer,
	Michal Hocko, Michal Hocko, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 07, 2019 at 10:15:19AM -0700, Linus Torvalds wrote:
>On Tue, May 7, 2019 at 10:02 AM Sasha Levin <sashal@kernel.org> wrote:
>>
>> I got it wrong then. I'll fix it up and get efad4e475c31 in instead.
>
>Careful. That one had a bug too, and we have 891cb2a72d82 ("mm,
>memory_hotplug: fix off-by-one in is_pageblock_removable").
>
>All of these were *horribly* and subtly buggy, and might be
>intertwined with other issues. And only trigger on a few specific
>machines where the memory map layout is just right to trigger some
>special case or other, and you have just the right config.
>
>It might be best to verify with Michal Hocko. Michal?

Michal, is there a testcase I can plug into kselftests to make sure we
got this right (and don't regress)? We care a lot about memory hotplug
working right.

--
Thanks,
Sasha


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 71/95] Revert "mm, memory_hotplug: initialize struct pages for the full memory section"
  2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 71/95] Revert "mm, memory_hotplug: initialize struct pages for the full memory section" Sasha Levin
@ 2019-05-07 17:25   ` Alexander Duyck
  0 siblings, 0 replies; 21+ messages in thread
From: Alexander Duyck @ 2019-05-07 17:25 UTC (permalink / raw)
  To: Sasha Levin
  Cc: LKML, stable, Michal Hocko, Robert Shteynfeld, stable,
	Linus Torvalds, Sasha Levin, linux-mm

On Mon, May 6, 2019 at 10:40 PM Sasha Levin <sashal@kernel.org> wrote:
>
> From: Michal Hocko <mhocko@suse.com>
>
> [ Upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a ]
>
> This reverts commit 2830bf6f05fb3e05bc4743274b806c821807a684.
>
> The underlying assumption that one sparse section belongs into a single
> numa node doesn't hold really. Robert Shteynfeld has reported a boot
> failure. The boot log was not captured but his memory layout is as
> follows:
>
>   Early memory node ranges
>     node   1: [mem 0x0000000000001000-0x0000000000090fff]
>     node   1: [mem 0x0000000000100000-0x00000000dbdf8fff]
>     node   1: [mem 0x0000000100000000-0x0000001423ffffff]
>     node   0: [mem 0x0000001424000000-0x0000002023ffffff]
>
> This means that node0 starts in the middle of a memory section which is
> also in node1.  memmap_init_zone tries to initialize padding of a
> section even when it is outside of the given pfn range because there are
> code paths (e.g.  memory hotplug) which assume that the full worth of
> memory section is always initialized.
>
> In this particular case, though, such a range is already initialized and
> most likely already managed by the page allocator.  Scribbling over
> those pages corrupts the internal state and likely blows up when any of
> those pages gets used.
>
> Reported-by: Robert Shteynfeld <robert.shteynfeld@gmail.com>
> Fixes: 2830bf6f05fb ("mm, memory_hotplug: initialize struct pages for the full memory section")
> Cc: stable@kernel.org
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
> ---
>  mm/page_alloc.c | 12 ------------
>  1 file changed, 12 deletions(-)

So it looks like you already had the revert of the earlier patch I
pointed out enqueued as well. So you can probably at a minimum just
drop this patch and the earlier patch that this reverts.

Thanks.

- Alex


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:15         ` Linus Torvalds
  2019-05-07 17:18           ` Sasha Levin
@ 2019-05-07 17:31           ` Michal Hocko
  1 sibling, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2019-05-07 17:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Sasha Levin, Alexander Duyck, LKML, stable, Mikhail Zaslonko,
	Gerald Schaefer, Mikhail Gavrilov, Dave Hansen, Alexander Duyck,
	Pasha Tatashin, Martin Schwidefsky, Heiko Carstens,
	Andrew Morton, Sasha Levin, linux-mm

On Tue 07-05-19 10:15:19, Linus Torvalds wrote:
> On Tue, May 7, 2019 at 10:02 AM Sasha Levin <sashal@kernel.org> wrote:
> >
> > I got it wrong then. I'll fix it up and get efad4e475c31 in instead.

This patch is not marked for stable backports for good reasons.

> 
> Careful. That one had a bug too, and we have 891cb2a72d82 ("mm,
> memory_hotplug: fix off-by-one in is_pageblock_removable").
> 
> All of these were *horribly* and subtly buggy, and might be
> intertwined with other issues. And only trigger on a few specific
> machines where the memory map layout is just right to trigger some
> special case or other, and you have just the right config.

Yes, the code turned out to be much more tricky than we thought. There
were several assumptions about alignment etc. Something that is really
hard to test for because HW breaking those assumptions is rare. So I
would discourage picking up some random patches in the memory hotplug
for stable. Each patch needs a very careful consideration. In any case
we really try hard to keep Fixes: tag accurate so at least those should
be scanned.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:18           ` Sasha Levin
@ 2019-05-07 17:32             ` Michal Hocko
  2019-05-07 17:36               ` Matthew Wilcox
  2019-05-08 11:04             ` Gerald Schaefer
  1 sibling, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2019-05-07 17:32 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Linus Torvalds, Alexander Duyck, LKML, stable, Mikhail Zaslonko,
	Gerald Schaefer, Mikhail Gavrilov, Dave Hansen, Alexander Duyck,
	Pasha Tatashin, Martin Schwidefsky, Heiko Carstens,
	Andrew Morton, Sasha Levin, linux-mm

On Tue 07-05-19 13:18:06, Sasha Levin wrote:
> On Tue, May 07, 2019 at 10:15:19AM -0700, Linus Torvalds wrote:
> > On Tue, May 7, 2019 at 10:02 AM Sasha Levin <sashal@kernel.org> wrote:
> > > 
> > > I got it wrong then. I'll fix it up and get efad4e475c31 in instead.
> > 
> > Careful. That one had a bug too, and we have 891cb2a72d82 ("mm,
> > memory_hotplug: fix off-by-one in is_pageblock_removable").
> > 
> > All of these were *horribly* and subtly buggy, and might be
> > intertwined with other issues. And only trigger on a few specific
> > machines where the memory map layout is just right to trigger some
> > special case or other, and you have just the right config.
> > 
> > It might be best to verify with Michal Hocko. Michal?
> 
> Michal, is there a testcase I can plug into kselftests to make sure we
> got this right (and don't regress)? We care a lot about memory hotplug
> working right.

As said in other email. The memory hotplug tends to work usually. It
takes unexpected memory layouts which trigger corner cases. This makes
testing really hard.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:32             ` Michal Hocko
@ 2019-05-07 17:36               ` Matthew Wilcox
  2019-05-07 17:43                 ` Linus Torvalds
                                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Matthew Wilcox @ 2019-05-07 17:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Sasha Levin, Linus Torvalds, Alexander Duyck, LKML, stable,
	Mikhail Zaslonko, Gerald Schaefer, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 07, 2019 at 07:32:24PM +0200, Michal Hocko wrote:
> On Tue 07-05-19 13:18:06, Sasha Levin wrote:
> > Michal, is there a testcase I can plug into kselftests to make sure we
> > got this right (and don't regress)? We care a lot about memory hotplug
> > working right.
> 
> As said in other email. The memory hotplug tends to work usually. It
> takes unexpected memory layouts which trigger corner cases. This makes
> testing really hard.

Can we do something with qemu?  Is it flexible enough to hotplug memory
at the right boundaries?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:36               ` Matthew Wilcox
@ 2019-05-07 17:43                 ` Linus Torvalds
  2019-05-07 17:51                   ` Michal Hocko
  2019-05-07 17:43                 ` Michal Hocko
  2019-05-07 17:45                 ` Sasha Levin
  2 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2019-05-07 17:43 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Michal Hocko, Sasha Levin, Alexander Duyck, LKML, stable,
	Mikhail Zaslonko, Gerald Schaefer, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 7, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> Can we do something with qemu?  Is it flexible enough to hotplug memory
> at the right boundaries?

It's not just the actual hotplugged memory, it's things like how the
e820 tables were laid out for the _regular_ non-hotplug stuff too,
iirc to get the cases where something didn't work out.

I'm sure it *could* be emulated, and I'm sure some hotplug (and page
poison errors etc) testing in qemu would be lovely and presumably some
people do it, but all the cases so far have been about odd small
special cases that people didn't think of and didn't hit. I'm not sure
the qemu testing would think of them either..

                Linus


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:36               ` Matthew Wilcox
  2019-05-07 17:43                 ` Linus Torvalds
@ 2019-05-07 17:43                 ` Michal Hocko
  2019-05-07 17:45                 ` Sasha Levin
  2 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2019-05-07 17:43 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Sasha Levin, Linus Torvalds, Alexander Duyck, LKML, stable,
	Mikhail Zaslonko, Gerald Schaefer, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue 07-05-19 10:36:55, Matthew Wilcox wrote:
> On Tue, May 07, 2019 at 07:32:24PM +0200, Michal Hocko wrote:
> > On Tue 07-05-19 13:18:06, Sasha Levin wrote:
> > > Michal, is there a testcase I can plug into kselftests to make sure we
> > > got this right (and don't regress)? We care a lot about memory hotplug
> > > working right.
> > 
> > As said in other email. The memory hotplug tends to work usually. It
> > takes unexpected memory layouts which trigger corner cases. This makes
> > testing really hard.
> 
> Can we do something with qemu?  Is it flexible enough to hotplug memory
> at the right boundaries?

No idea. But I have tried to describe those layouts in the changelog so
if somebody can come up with a way to reproduce them under kvm/qemu I
would really appreciate that.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:36               ` Matthew Wilcox
  2019-05-07 17:43                 ` Linus Torvalds
  2019-05-07 17:43                 ` Michal Hocko
@ 2019-05-07 17:45                 ` Sasha Levin
  2019-05-07 17:54                   ` Michal Hocko
  2 siblings, 1 reply; 21+ messages in thread
From: Sasha Levin @ 2019-05-07 17:45 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Michal Hocko, Linus Torvalds, Alexander Duyck, LKML, stable,
	Mikhail Zaslonko, Gerald Schaefer, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, May 07, 2019 at 10:36:55AM -0700, Matthew Wilcox wrote:
>On Tue, May 07, 2019 at 07:32:24PM +0200, Michal Hocko wrote:
>> On Tue 07-05-19 13:18:06, Sasha Levin wrote:
>> > Michal, is there a testcase I can plug into kselftests to make sure we
>> > got this right (and don't regress)? We care a lot about memory hotplug
>> > working right.
>>
>> As said in other email. The memory hotplug tends to work usually. It
>> takes unexpected memory layouts which trigger corner cases. This makes
>> testing really hard.
>
>Can we do something with qemu?  Is it flexible enough to hotplug memory
>at the right boundaries?

That was my thinking too. qemu should be able to reproduce all these
"unexpected" memory layouts we've had issue with so far and at the very
least make sure we don't regress on those.

We're going to have (quite a) large amount of systems with "weird"
memory layouts that do memory hotplug quite frequently in production, so
this whole "tends to work usually" thing kinda scares me.

--
Thanks,
Sasha


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:43                 ` Linus Torvalds
@ 2019-05-07 17:51                   ` Michal Hocko
  0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2019-05-07 17:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Matthew Wilcox, Sasha Levin, Alexander Duyck, LKML, stable,
	Mikhail Zaslonko, Gerald Schaefer, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue 07-05-19 10:43:31, Linus Torvalds wrote:
> On Tue, May 7, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > Can we do something with qemu?  Is it flexible enough to hotplug memory
> > at the right boundaries?
> 
> It's not just the actual hotplugged memory, it's things like how the
> e820 tables were laid out for the _regular_ non-hotplug stuff too,
> iirc to get the cases where something didn't work out.
> 
> I'm sure it *could* be emulated, and I'm sure some hotplug (and page
> poison errors etc) testing in qemu would be lovely and presumably some
> people do it, but all the cases so far have been about odd small
> special cases that people didn't think of and didn't hit. I'm not sure
> the qemu testing would think of them either..

Yes, this is exactly my point. It would be great to have those odd small
special cases that we have already met available, though, at least for
regression testing.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:45                 ` Sasha Levin
@ 2019-05-07 17:54                   ` Michal Hocko
  0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2019-05-07 17:54 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Matthew Wilcox, Linus Torvalds, Alexander Duyck, LKML, stable,
	Mikhail Zaslonko, Gerald Schaefer, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue 07-05-19 13:45:14, Sasha Levin wrote:
[...]
> We're going to have (quite a) large amount of systems with "weird"
> memory layouts that do memory hotplug quite frequently in production, so
> this whole "tends to work usually" thing kinda scares me.

Memory hotplug is simply not production ready for those cases,
unfortunately. It tends to work just fine with properly section
aligned systems with memory being in the movable zones or for zone
device. Everything beyond that is kind of a long way from getting there...
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section
  2019-05-07 17:18           ` Sasha Levin
  2019-05-07 17:32             ` Michal Hocko
@ 2019-05-08 11:04             ` Gerald Schaefer
  1 sibling, 0 replies; 21+ messages in thread
From: Gerald Schaefer @ 2019-05-08 11:04 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Linus Torvalds, Alexander Duyck, LKML, stable, Mikhail Zaslonko,
	Michal Hocko, Michal Hocko, Mikhail Gavrilov, Dave Hansen,
	Alexander Duyck, Pasha Tatashin, Martin Schwidefsky,
	Heiko Carstens, Andrew Morton, Sasha Levin, linux-mm

On Tue, 7 May 2019 13:18:06 -0400
Sasha Levin <sashal@kernel.org> wrote:

> On Tue, May 07, 2019 at 10:15:19AM -0700, Linus Torvalds wrote:
> >On Tue, May 7, 2019 at 10:02 AM Sasha Levin <sashal@kernel.org> wrote:
> >>
> >> I got it wrong then. I'll fix it up and get efad4e475c31 in instead.
> >
> >Careful. That one had a bug too, and we have 891cb2a72d82 ("mm,
> >memory_hotplug: fix off-by-one in is_pageblock_removable").
> >
> >All of these were *horribly* and subtly buggy, and might be
> >intertwined with other issues. And only trigger on a few specific
> >machines where the memory map layout is just right to trigger some
> >special case or other, and you have just the right config.
> >
> >It might be best to verify with Michal Hocko. Michal?
> 
> Michal, is there a testcase I can plug into kselftests to make sure we
> got this right (and don't regress)? We care a lot about memory hotplug
> working right.

We hit the panics on s390 with special z/VM memory layout, but they both
can be triggered simply by using mem= kernel parameter (and
CONFIG_DEBUG_VM_PGFLAGS=y).

With "mem=3075M" (and w/o the commits efad4e475c31 + 24feb47c5fa5), it
can be triggered by reading from
/sys/devices/system/memory/memory<x>/valid_zones, or from
/sys/devices/system/memory/memory<x>/removable, with <x> being the last
memory block.

This is with 256MB section size and memory block size. On LPAR, with
256MB section size and 1GB memory block size, for some reason the
"removable" issue doesn't trigger, only the "valid_zones" issue.

Using lsmem will also trigger it, as it reads both the valid_zones and
the removable attribute for all memory blocks. So, a test with
not-section-aligned mem= parameter and using lsmem could be an option.

Regards,
Gerald


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2019-05-08 11:04 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20190507053826.31622-1-sashal@kernel.org>
2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 21/95] mm: fix inactive list balancing between NUMA nodes and cgroups Sasha Levin
2019-05-07  5:37 ` [PATCH AUTOSEL 4.14 62/95] mm, memory_hotplug: initialize struct pages for the full memory section Sasha Levin
2019-05-07 16:31   ` Alexander Duyck
2019-05-07 16:50     ` Linus Torvalds
2019-05-07 17:02       ` Sasha Levin
2019-05-07 17:13         ` Gerald Schaefer
2019-05-07 17:15         ` Linus Torvalds
2019-05-07 17:18           ` Sasha Levin
2019-05-07 17:32             ` Michal Hocko
2019-05-07 17:36               ` Matthew Wilcox
2019-05-07 17:43                 ` Linus Torvalds
2019-05-07 17:51                   ` Michal Hocko
2019-05-07 17:43                 ` Michal Hocko
2019-05-07 17:45                 ` Sasha Levin
2019-05-07 17:54                   ` Michal Hocko
2019-05-08 11:04             ` Gerald Schaefer
2019-05-07 17:31           ` Michal Hocko
2019-05-07 16:58     ` Sasha Levin
2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 71/95] Revert "mm, memory_hotplug: initialize struct pages for the full memory section" Sasha Levin
2019-05-07 17:25   ` Alexander Duyck
2019-05-07  5:38 ` [PATCH AUTOSEL 4.14 86/95] mm/memory.c: fix modifying of page protection by insert_pfn() Sasha Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).