linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
@ 2018-03-16 14:38 Daniel Vacek
  2018-03-16 15:13 ` Daniel Vacek
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Daniel Vacek @ 2018-03-16 14:38 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: Ard Biesheuvel, Daniel Vacek, Andrew Morton, Michal Hocko,
	Vlastimil Babka, Mel Gorman, Pavel Tatashin, Paul Burton, stable

This reverts commit b92df1de5d289c0b5d653e72414bf0850b8511e0. The commit
is meant to be a boot init speed up skipping the loop in memmap_init_zone()
for invalid pfns. But given some specific memory mapping on x86_64 (or more
generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID)
the implementation also skips valid pfns which is plain wrong and causes
'kernel BUG at mm/page_alloc.c:1389!'

crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
kernel BUG at mm/page_alloc.c:1389!
invalid opcode: 0000 [#1] SMP
--
RIP: 0010:[<ffffffff8118833e>]  [<ffffffff8118833e>] move_freepages+0x15e/0x160
RSP: 0018:ffff88054d727688  EFLAGS: 00010087
--
Call Trace:
 [<ffffffff811883b3>] move_freepages_block+0x73/0x80
 [<ffffffff81189e63>] __rmqueue+0x263/0x460
 [<ffffffff8118c781>] get_page_from_freelist+0x7e1/0x9e0
 [<ffffffff8118caf6>] __alloc_pages_nodemask+0x176/0x420
--
RIP  [<ffffffff8118833e>] move_freepages+0x15e/0x160
 RSP <ffff88054d727688>

crash> page_init_bug -v | grep RAM
<struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
<struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
<struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
<struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
<struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
<struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)

crash> page_init_bug | head -6
<struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
<struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
<struct page 0xffffea0001ede200> 505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
<struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
<struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
BUG, zones differ!

crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
      PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffea0001e00000  78000000                0        0  0 0
ffffea0001ed7fc0  7b5ff000                0        0  0 0
ffffea0001ed8000  7b600000                0        0  0 0       <<<<
ffffea0001ede1c0  7b787000                0        0  0 0
ffffea0001ede200  7b788000                0        0  1 1fffff00000000

Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible")
Signed-off-by: Daniel Vacek <neelx@redhat.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: stable@vger.kernel.org
---
 include/linux/memblock.h |  1 -
 mm/memblock.c            | 28 ----------------------------
 mm/page_alloc.c          | 11 +----------
 3 files changed, 1 insertion(+), 39 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8be5077efb5f..f92ea7783652 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/mm/memblock.c b/mm/memblock.c
index b6ba6b7adadc..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-						      unsigned long max_pfn)
-{
-	struct memblock_type *type = &memblock.memory;
-	unsigned int right = type->cnt;
-	unsigned int mid, left = 0;
-	phys_addr_t addr = PFN_PHYS(++pfn);
-
-	do {
-		mid = (right + left) / 2;
-
-		if (addr < type->regions[mid].base)
-			right = mid;
-		else if (addr >= (type->regions[mid].base +
-				  type->regions[mid].size))
-			left = mid + 1;
-		else {
-			/* addr is within the region, so pfn is valid */
-			return pfn;
-		}
-	} while (left < right);
-
-	if (right == type->cnt)
-		return -1UL;
-	else
-		return PHYS_PFN(type->regions[right].base);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 635d7dd29d7f..e4566a3f8083 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-			/*
-			 * Skip to the pfn preceding the next valid one (or
-			 * end_pfn), such that we hit a valid pfn (or end_pfn)
-			 * on our next iteration of the loop.
-			 */
-			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+		if (!early_pfn_valid(pfn))
 			continue;
-		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
-- 
2.16.2

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  2018-03-16 14:38 [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible" Daniel Vacek
@ 2018-03-16 15:13 ` Daniel Vacek
  2018-03-16 15:14 ` Michal Hocko
  2018-06-21 19:07 ` Paul Burton
  2 siblings, 0 replies; 5+ messages in thread
From: Daniel Vacek @ 2018-03-16 15:13 UTC (permalink / raw)
  To: open list, linux-mm
  Cc: Ard Biesheuvel, Daniel Vacek, Andrew Morton, Michal Hocko,
	Vlastimil Babka, Mel Gorman, Pavel Tatashin, Paul Burton, stable

Sorry I forgot to Cc: Paul Burton <paul.burton@imgtec.com>

--nX

On Fri, Mar 16, 2018 at 3:38 PM, Daniel Vacek <neelx@redhat.com> wrote:
> This reverts commit b92df1de5d289c0b5d653e72414bf0850b8511e0. The commit
> is meant to be a boot init speed up skipping the loop in memmap_init_zone()
> for invalid pfns. But given some specific memory mapping on x86_64 (or more
> generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID)
> the implementation also skips valid pfns which is plain wrong and causes
> 'kernel BUG at mm/page_alloc.c:1389!'
>
> crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
> kernel BUG at mm/page_alloc.c:1389!
> invalid opcode: 0000 [#1] SMP
> --
> RIP: 0010:[<ffffffff8118833e>]  [<ffffffff8118833e>] move_freepages+0x15e/0x160
> RSP: 0018:ffff88054d727688  EFLAGS: 00010087
> --
> Call Trace:
>  [<ffffffff811883b3>] move_freepages_block+0x73/0x80
>  [<ffffffff81189e63>] __rmqueue+0x263/0x460
>  [<ffffffff8118c781>] get_page_from_freelist+0x7e1/0x9e0
>  [<ffffffff8118caf6>] __alloc_pages_nodemask+0x176/0x420
> --
> RIP  [<ffffffff8118833e>] move_freepages+0x15e/0x160
>  RSP <ffff88054d727688>
>
> crash> page_init_bug -v | grep RAM
> <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
> <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
> <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
> <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
> <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)
>
> crash> page_init_bug | head -6
> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
> <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
> <struct page 0xffffea0001ede200> 505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
> <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
> <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
> BUG, zones differ!
>
> crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
>       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
> ffffea0001e00000  78000000                0        0  0 0
> ffffea0001ed7fc0  7b5ff000                0        0  0 0
> ffffea0001ed8000  7b600000                0        0  0 0       <<<<
> ffffea0001ede1c0  7b787000                0        0  0 0
> ffffea0001ede200  7b788000                0        0  1 1fffff00000000
>
> Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible")
> Signed-off-by: Daniel Vacek <neelx@redhat.com>
> Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
> Cc: Paul Burton <paul.burton@imgtec.com>
> Cc: stable@vger.kernel.org
> ---
>  include/linux/memblock.h |  1 -
>  mm/memblock.c            | 28 ----------------------------
>  mm/page_alloc.c          | 11 +----------
>  3 files changed, 1 insertion(+), 39 deletions(-)
>
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 8be5077efb5f..f92ea7783652 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
>                             unsigned long  *end_pfn);
>  void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
>                           unsigned long *out_end_pfn, int *out_nid);
> -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
>
>  /**
>   * for_each_mem_pfn_range - early memory pfn range iterator
> diff --git a/mm/memblock.c b/mm/memblock.c
> index b6ba6b7adadc..48376bd33274 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
>                 *out_nid = r->nid;
>  }
>
> -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
> -                                                     unsigned long max_pfn)
> -{
> -       struct memblock_type *type = &memblock.memory;
> -       unsigned int right = type->cnt;
> -       unsigned int mid, left = 0;
> -       phys_addr_t addr = PFN_PHYS(++pfn);
> -
> -       do {
> -               mid = (right + left) / 2;
> -
> -               if (addr < type->regions[mid].base)
> -                       right = mid;
> -               else if (addr >= (type->regions[mid].base +
> -                                 type->regions[mid].size))
> -                       left = mid + 1;
> -               else {
> -                       /* addr is within the region, so pfn is valid */
> -                       return pfn;
> -               }
> -       } while (left < right);
> -
> -       if (right == type->cnt)
> -               return -1UL;
> -       else
> -               return PHYS_PFN(type->regions[right].base);
> -}
> -
>  /**
>   * memblock_set_node - set node ID on memblock regions
>   * @base: base of area to set node ID for
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 635d7dd29d7f..e4566a3f8083 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>                 if (context != MEMMAP_EARLY)
>                         goto not_early;
>
> -               if (!early_pfn_valid(pfn)) {
> -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
> -                       /*
> -                        * Skip to the pfn preceding the next valid one (or
> -                        * end_pfn), such that we hit a valid pfn (or end_pfn)
> -                        * on our next iteration of the loop.
> -                        */
> -                       pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
> -#endif
> +               if (!early_pfn_valid(pfn))
>                         continue;
> -               }
>                 if (!early_pfn_in_nid(pfn, nid))
>                         continue;
>                 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
> --
> 2.16.2
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  2018-03-16 14:38 [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible" Daniel Vacek
  2018-03-16 15:13 ` Daniel Vacek
@ 2018-03-16 15:14 ` Michal Hocko
  2018-06-21 19:07 ` Paul Burton
  2 siblings, 0 replies; 5+ messages in thread
From: Michal Hocko @ 2018-03-16 15:14 UTC (permalink / raw)
  To: Daniel Vacek
  Cc: linux-kernel, linux-mm, Ard Biesheuvel, Andrew Morton,
	Vlastimil Babka, Mel Gorman, Pavel Tatashin, Paul Burton, stable

On Fri 16-03-18 15:38:55, Daniel Vacek wrote:
> This reverts commit b92df1de5d289c0b5d653e72414bf0850b8511e0. The commit
> is meant to be a boot init speed up skipping the loop in memmap_init_zone()
> for invalid pfns. But given some specific memory mapping on x86_64 (or more
> generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID)
> the implementation also skips valid pfns which is plain wrong and causes
> 'kernel BUG at mm/page_alloc.c:1389!'
> 
> crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
> kernel BUG at mm/page_alloc.c:1389!
> invalid opcode: 0000 [#1] SMP
> --
> RIP: 0010:[<ffffffff8118833e>]  [<ffffffff8118833e>] move_freepages+0x15e/0x160
> RSP: 0018:ffff88054d727688  EFLAGS: 00010087
> --
> Call Trace:
>  [<ffffffff811883b3>] move_freepages_block+0x73/0x80
>  [<ffffffff81189e63>] __rmqueue+0x263/0x460
>  [<ffffffff8118c781>] get_page_from_freelist+0x7e1/0x9e0
>  [<ffffffff8118caf6>] __alloc_pages_nodemask+0x176/0x420
> --
> RIP  [<ffffffff8118833e>] move_freepages+0x15e/0x160
>  RSP <ffff88054d727688>
> 
> crash> page_init_bug -v | grep RAM
> <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
> <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
> <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
> <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
> <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)
> 
> crash> page_init_bug | head -6
> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
> <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
> <struct page 0xffffea0001ede200> 505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
> <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
> <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
> BUG, zones differ!
> 
> crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
>       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
> ffffea0001e00000  78000000                0        0  0 0
> ffffea0001ed7fc0  7b5ff000                0        0  0 0
> ffffea0001ed8000  7b600000                0        0  0 0       <<<<
> ffffea0001ede1c0  7b787000                0        0  0 0
> ffffea0001ede200  7b788000                0        0  1 1fffff00000000
> 
> Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible")
> Signed-off-by: Daniel Vacek <neelx@redhat.com>
> Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
> Cc: Paul Burton <paul.burton@imgtec.com>
> Cc: stable@vger.kernel.org

Yes, this looks like the most simple way for now before a proper
solution is found
Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  include/linux/memblock.h |  1 -
>  mm/memblock.c            | 28 ----------------------------
>  mm/page_alloc.c          | 11 +----------
>  3 files changed, 1 insertion(+), 39 deletions(-)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 8be5077efb5f..f92ea7783652 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
>  			    unsigned long  *end_pfn);
>  void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
>  			  unsigned long *out_end_pfn, int *out_nid);
> -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
>  
>  /**
>   * for_each_mem_pfn_range - early memory pfn range iterator
> diff --git a/mm/memblock.c b/mm/memblock.c
> index b6ba6b7adadc..48376bd33274 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
>  		*out_nid = r->nid;
>  }
>  
> -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
> -						      unsigned long max_pfn)
> -{
> -	struct memblock_type *type = &memblock.memory;
> -	unsigned int right = type->cnt;
> -	unsigned int mid, left = 0;
> -	phys_addr_t addr = PFN_PHYS(++pfn);
> -
> -	do {
> -		mid = (right + left) / 2;
> -
> -		if (addr < type->regions[mid].base)
> -			right = mid;
> -		else if (addr >= (type->regions[mid].base +
> -				  type->regions[mid].size))
> -			left = mid + 1;
> -		else {
> -			/* addr is within the region, so pfn is valid */
> -			return pfn;
> -		}
> -	} while (left < right);
> -
> -	if (right == type->cnt)
> -		return -1UL;
> -	else
> -		return PHYS_PFN(type->regions[right].base);
> -}
> -
>  /**
>   * memblock_set_node - set node ID on memblock regions
>   * @base: base of area to set node ID for
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 635d7dd29d7f..e4566a3f8083 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  		if (context != MEMMAP_EARLY)
>  			goto not_early;
>  
> -		if (!early_pfn_valid(pfn)) {
> -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
> -			/*
> -			 * Skip to the pfn preceding the next valid one (or
> -			 * end_pfn), such that we hit a valid pfn (or end_pfn)
> -			 * on our next iteration of the loop.
> -			 */
> -			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
> -#endif
> +		if (!early_pfn_valid(pfn))
>  			continue;
> -		}
>  		if (!early_pfn_in_nid(pfn, nid))
>  			continue;
>  		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
> -- 
> 2.16.2
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  2018-03-16 14:38 [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible" Daniel Vacek
  2018-03-16 15:13 ` Daniel Vacek
  2018-03-16 15:14 ` Michal Hocko
@ 2018-06-21 19:07 ` Paul Burton
  2018-06-25 15:00   ` Daniel Vacek
  2 siblings, 1 reply; 5+ messages in thread
From: Paul Burton @ 2018-06-21 19:07 UTC (permalink / raw)
  To: Daniel Vacek
  Cc: linux-kernel, linux-mm, Ard Biesheuvel, Andrew Morton,
	Michal Hocko, Vlastimil Babka, Mel Gorman, Pavel Tatashin,
	stable

Hi Daniel,

Hmm... I only just noticed this because you CC'd an email address that
is no longer functional. I presume you're not using .mailmap, which
would have given you my current email address.

On Fri, Mar 16, 2018 at 03:38:55PM +0100, Daniel Vacek wrote:
> This reverts commit b92df1de5d289c0b5d653e72414bf0850b8511e0. The commit
> is meant to be a boot init speed up skipping the loop in memmap_init_zone()
> for invalid pfns. But given some specific memory mapping on x86_64 (or more
> generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID)

My patch definitely wasn't ARM-specific & I have never tested it on ARM.
It was motivated by a MIPS platform with an extremely sparse memory map.
Could you explain why you think it depends on ARM or
CONFIG_HAVE_ARCH_PFN_VALID?

> the implementation also skips valid pfns which is plain wrong and causes
> 'kernel BUG at mm/page_alloc.c:1389!'

Which VM_BUG_ON is that? I don't see one on line 1389 as of commit
b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where
possible") or any mainline final release since.

> crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
> kernel BUG at mm/page_alloc.c:1389!
> invalid opcode: 0000 [#1] SMP
> --
> RIP: 0010:[<ffffffff8118833e>]  [<ffffffff8118833e>] move_freepages+0x15e/0x160
> RSP: 0018:ffff88054d727688  EFLAGS: 00010087
> --
> Call Trace:
>  [<ffffffff811883b3>] move_freepages_block+0x73/0x80
>  [<ffffffff81189e63>] __rmqueue+0x263/0x460
>  [<ffffffff8118c781>] get_page_from_freelist+0x7e1/0x9e0
>  [<ffffffff8118caf6>] __alloc_pages_nodemask+0x176/0x420
> --
> RIP  [<ffffffff8118833e>] move_freepages+0x15e/0x160
>  RSP <ffff88054d727688>
> 
> crash> page_init_bug -v | grep RAM
> <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
> <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
> <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
> <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
> <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)
> 
> crash> page_init_bug | head -6
> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
> <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
> <struct page 0xffffea0001ede200> 505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
> <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
> <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
> BUG, zones differ!
> 
> crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
>       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
> ffffea0001e00000  78000000                0        0  0 0
> ffffea0001ed7fc0  7b5ff000                0        0  0 0
> ffffea0001ed8000  7b600000                0        0  0 0       <<<<
> ffffea0001ede1c0  7b787000                0        0  0 0
> ffffea0001ede200  7b788000                0        0  1 1fffff00000000

I'm not really sure what I'm looking at here. I presume you're saying
that memmap_init_zone() didn't initialize the struct page for
phys=0x7b788000?

Could you describe the memblock region list, and what ranges
memmap_init_zone() skipped over?

Thanks,
    Paul

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  2018-06-21 19:07 ` Paul Burton
@ 2018-06-25 15:00   ` Daniel Vacek
  0 siblings, 0 replies; 5+ messages in thread
From: Daniel Vacek @ 2018-06-25 15:00 UTC (permalink / raw)
  To: Paul Burton
  Cc: open list, Linux-MM, Ard Biesheuvel, Andrew Morton, Michal Hocko,
	Vlastimil Babka, Mel Gorman, Pavel Tatashin, stable

On Thu, Jun 21, 2018 at 9:07 PM, Paul Burton <paul.burton@mips.com> wrote:
> Hi Daniel,
>
> Hmm... I only just noticed this because you CC'd an email address that
> is no longer functional. I presume you're not using .mailmap, which
> would have given you my current email address.

Hi Paul,

  I do not remember exactly but I guess I used either get_maintainers
script or the email from your commit. I'm sorry for the inconvenience.

> On Fri, Mar 16, 2018 at 03:38:55PM +0100, Daniel Vacek wrote:
>> This reverts commit b92df1de5d289c0b5d653e72414bf0850b8511e0. The commit
>> is meant to be a boot init speed up skipping the loop in memmap_init_zone()
>> for invalid pfns. But given some specific memory mapping on x86_64 (or more
>> generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID)
>
> My patch definitely wasn't ARM-specific & I have never tested it on ARM.
> It was motivated by a MIPS platform with an extremely sparse memory map.
> Could you explain why you think it depends on ARM or
> CONFIG_HAVE_ARCH_PFN_VALID?

  Hopefully explained further below.

>> the implementation also skips valid pfns which is plain wrong and causes
>> 'kernel BUG at mm/page_alloc.c:1389!'
>
> Which VM_BUG_ON is that? I don't see one on line 1389 as of commit
> b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where
> possible") or any mainline final release since.

  The report was from RHEL kernel actually. But it still applied to
upstream tree. It was this one
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/page_alloc.c?id=274a1ff0704bc8fef76dbe2d6fb197ddbc23f380#n1913
later changed with commit 3e04040df6d4 which I believe does not really
change or improve much anything, unfortunately...

>> crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
>> kernel BUG at mm/page_alloc.c:1389!
>> invalid opcode: 0000 [#1] SMP
>> --
>> RIP: 0010:[<ffffffff8118833e>]  [<ffffffff8118833e>] move_freepages+0x15e/0x160
>> RSP: 0018:ffff88054d727688  EFLAGS: 00010087
>> --
>> Call Trace:
>>  [<ffffffff811883b3>] move_freepages_block+0x73/0x80
>>  [<ffffffff81189e63>] __rmqueue+0x263/0x460
>>  [<ffffffff8118c781>] get_page_from_freelist+0x7e1/0x9e0
>>  [<ffffffff8118caf6>] __alloc_pages_nodemask+0x176/0x420
>> --
>> RIP  [<ffffffff8118833e>] move_freepages+0x15e/0x160
>>  RSP <ffff88054d727688>
>>
>> crash> page_init_bug -v | grep RAM
>> <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
>> <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
>> <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
>> <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
>> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
>> <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)
>>
>> crash> page_init_bug | head -6
>> <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
>> <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
>> <struct page 0xffffea0001ede200> 505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
>> <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
>> <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
>> BUG, zones differ!
>>
>> crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
>>       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
>> ffffea0001e00000  78000000                0        0  0 0
>> ffffea0001ed7fc0  7b5ff000                0        0  0 0
>> ffffea0001ed8000  7b600000                0        0  0 0       <<<<
>> ffffea0001ede1c0  7b787000                0        0  0 0
>> ffffea0001ede200  7b788000                0        0  1 1fffff00000000
>
> I'm not really sure what I'm looking at here. I presume you're saying
> that memmap_init_zone() didn't initialize the struct page for
> phys=0x7b788000?

  Quite the opposite. It's the first one which gets correctly
initialized as it is the start of next usable range as returned by
memblock_next_valid_pfn(). Though early_pfn_valid() returns true for
all frames in this section starting with 0x78000 (at least on x86
where it is based on the memsection implementation) so the next valid
pfn should correctly be frame 0x78000 instead of 0x7b788. The crash
was caused by accessing page 0xffffea0001ed8000 (covering phys
0x7b600000) as move_freepages_block() aligns the start_pfn to
pageblock_nr_pages before calling move_freepages().

  The arm implementation of early_pfn_valid() is actually based on
memblock and returns false for frames 0x78000 through 0x7b787 hence I
thought you based the memblock_next_valid_pfn() implementation on this
ARM semantics enabled by CONFIG_HAVE_ARCH_PFN_VALID instead of the
generic early_pfn_valid() version based on memory sections
implementation.

  When I am thinking about it now, instead of reverting it could also
have been #ifdefed on CONFIG_HAVE_ARCH_PFN_VALID. That way ARM could
still use the advantage but not MIPS I believe.

> Could you describe the memblock region list, and what ranges
> memmap_init_zone() skipped over?

  I guess that's already explained above. The memblock regions matched
the usable 'System RAM' ranges as dumped from iomem resources in my
commit message, IIRC. Let me dump the data if I can still find it.

crash> memblock.memory.cnt,memory.regions memblock
  memory.cnt = 0x7,
  memory.regions = 0xffffffff81af1140 <memblock_memory_init_regions>

crash> memblock_region.base,size,flags,nid
memblock_memory_init_regions 7 | sed 's/^  /\t/' | paste - - - - - |
column -ts'     '
base = 0x1000       size = 0x9b000      flags = 0x0  nid = 0x0
base = 0x100000     size = 0x42fc0000   flags = 0x0  nid = 0x0
base = 0x4b0c8000   size = 0xed5000     flags = 0x0  nid = 0x0
base = 0x4bfac000   size = 0x18706000   flags = 0x0  nid = 0x0
base = 0x7b788000   size = 0x78000      flags = 0x0  nid = 0x0
base = 0x100000000  size = 0x380000000  flags = 0x0  nid = 0x0
base = 0x480000000  size = 0x200000000  flags = 0x0  nid = 0x1

  Yeah, so it matches with the node break added.

> Thanks,
>     Paul

Thank you for looking into it! If you have any further questions, just
drop me an email. And have a nice day.

--nX

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-06-25 15:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-16 14:38 [PATCH] Revert "mm: page_alloc: skip over regions of invalid pfns where possible" Daniel Vacek
2018-03-16 15:13 ` Daniel Vacek
2018-03-16 15:14 ` Michal Hocko
2018-06-21 19:07 ` Paul Burton
2018-06-25 15:00   ` Daniel Vacek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).